├── .gitignore ├── LICENSE ├── NeMo-Guardrails-Customer-Support ├── README.md ├── config.yml └── tlm.co ├── README.md ├── TLM-Demo-Notebook ├── TLM-Demo.ipynb ├── customer-service-chat-categories.csv ├── customer-service-conversation.csv └── customer-service-policy.md ├── TLM-MLflow-Integration └── evaluating_traces_TLM_mlflow_dl.ipynb ├── TLM-PII-Detection └── TLM-PII-Detection.ipynb ├── TLM-Record-Matching └── data_enrichment_record_matching_tutorial.ipynb ├── TLM-SimpleQA-Benchmark ├── README.md ├── evaluate_response.ipynb ├── get_tlm_response.ipynb ├── openai_utils │ ├── chat_completion_sampler.py │ ├── common.py │ ├── simpleqa_constants.py │ └── types.py └── results │ ├── gpt-4o-baseline-25-responses.csv │ ├── gpt-4o-baseline-80-responses.csv │ ├── gpt-4o-baseline-responses.csv │ ├── gpt-4o-best-25-responses.csv │ ├── gpt-4o-best-80-responses.csv │ └── gpt-4o-best-responses.csv ├── TLM-intro └── Real_time_Eval_for_every_LLM_response_with_Cleanlab_TLM.ipynb ├── TLM-o1-benchmark ├── PII-98-benchmark │ ├── pii_98_dataset.csv │ ├── pii_98_openai_o1_preview_responses.csv │ └── pii_98_tlm_o1_preview_responses.csv ├── README.md ├── SVAMP-49-benchmark │ ├── svamp_49_dataset.csv │ ├── svamp_49_openai_o1_preview_responses.csv │ └── svamp_49_tlm_o1_preview_responses.csv ├── TriviaQA-114-benchmark │ ├── TriviaQA_114_dataset.csv │ ├── TriviaQA_114_openai_o1_preview_responses.csv │ └── TriviaQA_114_tlm_o1_preview_responses.csv ├── openai_o1_preview_benchmark_reproduce.ipynb └── tlm_o1_preview_benchmark_reproduce.ipynb ├── benchmarking_hallucination_metrics └── benchmark_hallucination_metrics.ipynb ├── benchmarking_hallucination_model ├── HHEM.ipynb ├── LLM_as_judge_and_TLM.ipynb ├── Lynx.ipynb ├── Prometheus.ipynb └── README.md ├── few_shot_prompt_selection ├── few_shot_prompt_selection.ipynb └── requirements.txt ├── fine_tuning_classification ├── fine_tuning_LLM_with_noisy_labels.ipynb ├── improving-openai-davinci-with-cleanlab.png ├── improving-openai-models-with-cleanlab.png └── requirements.txt ├── fine_tuning_data_curation ├── fine_tuning_data_curation.ipynb └── requirements.txt ├── fine_tuning_mistral_beavertails ├── beavertails.ipynb └── requirements.txt ├── gdpr_tlm_blog_post ├── application_logs.csv ├── gdpr_analysis_results.csv └── gdpr_tlm_blog_post.ipynb ├── generate_llm_response ├── generate_llm_response.ipynb └── requirements.txt ├── gpt4-rag-logprobs └── gpt4-rag-logprobs.ipynb ├── jigsaw_ai_safety_keras ├── Evaluating_Toxicity_Datasets_Large_Language_Models.ipynb └── requirements.txt ├── time_series_automl ├── cleanlab_time_series_automl.ipynb ├── requirements.txt └── time_series_automl.ipynb └── tlm_call_api_directly └── tlm_api_directly.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .venv/ 3 | .DS_Store 4 | .python-version 5 | __pycache__/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Cleanlab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NeMo-Guardrails-Customer-Support/README.md: -------------------------------------------------------------------------------- 1 | # Customer Support Chatbot using NVIDIA NeMo Guardrails and Trustworthy Language Model 2 | 3 | This folder contains the config and the colang definition file required to run a simple customer support chatbot. 4 | 5 | To run this project, please refer to NeMo Guardrails [documentation here](https://docs.nvidia.com/nemo/guardrails/user_guides/cli.html). 6 | 7 | A `CLEANLAB_TLM_API_KEY` is required to run this project. Get an API key at https://tlm.cleanlab.ai -------------------------------------------------------------------------------- /NeMo-Guardrails-Customer-Support/config.yml: -------------------------------------------------------------------------------- 1 | instructions: 2 | - type: general 3 | content: | 4 | The following is the customer service policy of ACME Inc. 5 | # ACME Inc. Customer Service Policy 6 | 7 | ## Table of Contents 8 | 1. Free Shipping Policy 9 | 2. Free Returns Policy 10 | 3. Fraud Detection Guidelines 11 | 4. Customer Interaction Tone 12 | 13 | ## 1. Free Shipping Policy 14 | 15 | ### 1.1 Eligibility Criteria 16 | - Free shipping is available on all orders over $50 within the continental United States. 17 | - For orders under $50, a flat rate shipping fee of $5.99 will be applied. 18 | - Free shipping is not available for expedited shipping methods (e.g., overnight or 2-day shipping). 19 | 20 | ### 1.2 Exclusions 21 | - Free shipping does not apply to orders shipped to Alaska, Hawaii, or international destinations. 22 | - Oversized or heavy items may incur additional shipping charges, which will be clearly communicated to the customer before purchase. 23 | 24 | ### 1.3 Handling Customer Inquiries 25 | - If a customer inquires about free shipping eligibility, verify the order total and shipping destination. 26 | - Inform customers of ways to qualify for free shipping (e.g., adding items to reach the $50 threshold). 27 | - For orders just below the threshold, you may offer a one-time courtesy free shipping if it's the customer's first purchase or if they have a history of large orders. 28 | 29 | ## 2. Free Returns Policy 30 | 31 | ### 2.1 Eligibility Criteria 32 | - Free returns are available for all items within 30 days of the delivery date. 33 | - Items must be unused, unworn, and in their original packaging with all tags attached. 34 | - Free returns are limited to standard shipping methods within the continental United States. 35 | 36 | ### 2.2 Exclusions 37 | - Final sale items, as marked on the product page, are not eligible for free returns. 38 | - Customized or personalized items are not eligible for free returns unless there is a manufacturing defect. 
39 | - Undergarments, swimwear, and earrings are not eligible for free returns due to hygiene reasons. 40 | 41 | ### 2.3 Process for Handling Returns 42 | 1. Verify the order date and ensure it falls within the 30-day return window. 43 | 2. Ask the customer about the reason for the return and document it in the system. 44 | 3. Provide the customer with a prepaid return label if they qualify for free returns. 45 | 4. Inform the customer of the expected refund processing time (5-7 business days after receiving the return). 46 | 47 | ### 2.4 Exceptions 48 | - For items damaged during shipping or with manufacturing defects, offer an immediate replacement or refund without requiring a return. 49 | - For returns outside the 30-day window, use discretion based on the customer's history and the reason for the late return. You may offer store credit as a compromise. 50 | 51 | ## 3. Fraud Detection Guidelines 52 | 53 | ### 3.1 Red Flags for Potential Fraud 54 | - Multiple orders from the same IP address with different customer names or shipping addresses. 55 | - Orders with unusually high quantities of the same item. 56 | - Shipping address different from the billing address, especially if in different countries. 57 | - Multiple failed payment attempts followed by a successful one. 58 | - Customers pressuring for immediate shipping or threatening to cancel the order. 59 | 60 | ### 3.2 Verification Process 61 | 1. For orders flagged as potentially fraudulent, place them on hold for review. 62 | 2. Verify the customer's identity by calling the phone number on file. 63 | 3. Request additional documentation (e.g., photo ID, credit card statement) if necessary. 64 | 4. Cross-reference the shipping address with known fraud databases. 65 | 66 | ### 3.3 Actions for Confirmed Fraud 67 | - Cancel the order immediately and refund any charges. 68 | - Document the incident in the customer's account and flag it for future reference. 69 | - Report confirmed fraud cases to the appropriate authorities and credit card companies. 70 | 71 | ### 3.4 False Positives 72 | - If a legitimate customer is flagged, apologize for the inconvenience and offer a small discount or free shipping on their next order. 73 | - Document the incident to improve our fraud detection algorithms. 74 | 75 | ## 4. Customer Interaction Tone 76 | 77 | ### 4.1 General Guidelines 78 | - Always maintain a professional, friendly, and empathetic tone. 79 | - Use the customer's name when addressing them. 80 | - Listen actively and paraphrase the customer's concerns to ensure understanding. 81 | - Avoid negative language; focus on what can be done rather than what can't. 82 | 83 | ### 4.2 Specific Scenarios 84 | 85 | #### Angry or Frustrated Customers 86 | - Remain calm and do not take comments personally. 87 | - Acknowledge the customer's feelings and apologize for their negative experience. 88 | - Focus on finding a solution and clearly explain the steps you'll take to resolve the issue. 89 | - If necessary, offer to escalate the issue to a supervisor. 90 | 91 | #### Confused or Indecisive Customers 92 | - Be patient and offer clear, concise explanations. 93 | - Ask probing questions to better understand their needs. 94 | - Provide options and explain the pros and cons of each. 95 | - Offer to send follow-up information via email if the customer needs time to decide. 96 | 97 | #### VIP or Loyal Customers 98 | - Acknowledge their status and thank them for their continued business. 99 | - Be familiar with their purchase history and preferences.
100 | - Offer exclusive deals or early access to new products when appropriate. 101 | - Go above and beyond to exceed their expectations. 102 | 103 | ### 4.3 Language and Phrasing 104 | - Use positive language: "I'd be happy to help you with that" instead of "I can't do that." 105 | - Avoid technical jargon or abbreviations that customers may not understand. 106 | - Use "we" statements to show unity with the company: "We value your feedback" instead of "The company values your feedback." 107 | - End conversations on a positive note: "Is there anything else I can assist you with today?" 108 | 109 | ### 4.4 Written Communication 110 | - Use proper grammar, spelling, and punctuation in all written communications. 111 | - Keep emails and chat responses concise and to the point. 112 | - Use bullet points or numbered lists for clarity when providing multiple pieces of information. 113 | - Include a clear call-to-action or next steps at the end of each communication. 114 | 115 | The following dialogue features a discussion between a user and a customer service bot. The bot attempts to help the customer but must respect the guidelines in the customer service policy. The bot provides very accurate and concise answers. The bot does not tell the user to contact customer service. 116 | Remember, as a representative of ACME Inc., you are often the first point of contact for our customers. Your interactions should always reflect our commitment to exceptional customer service and satisfaction. 117 | 118 | models: 119 | - type: main 120 | engine: openai 121 | model: gpt-4o-mini 122 | 123 | rails: 124 | output: 125 | flows: 126 | - cleanlab trustworthiness 127 | 128 | -------------------------------------------------------------------------------- /NeMo-Guardrails-Customer-Support/tlm.co: -------------------------------------------------------------------------------- 1 | define flow cleanlab trustworthiness 2 | $result = execute call cleanlab api 3 | if $result.trustworthiness_score < 0.7 4 | bot respond untrustworthy 5 | stop 6 | 7 | define bot respond untrustworthy 8 | "I'm sorry, I am unable to help with this request. I'll connect you with another agent who can help..." -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cleanlab-tools 2 | 3 | Cookbooks showcasing various applications of Cleanlab, as well as code shared for education, reproducibility, and transparency. 4 | 5 | 6 | | Example | Description | 7 | |----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------| 8 | | [TLM-Demo-Notebook](TLM-Demo-Notebook/TLM-Demo.ipynb) | Demonstrating various applications of the Trustworthy Language Model, particularly in customer support | 9 | | [tlm_call_api_directly](tlm_call_api_directly/tlm_api_directly.ipynb) | Call the TLM REST API directly. You can use any programming language (e.g., TypeScript) with any HTTP library or tool by providing the necessary payload and headers.
| 10 | [TLM-PII-Detection](TLM-PII-Detection/TLM-PII-Detection.ipynb) | Find and mask PII with the Trustworthy Language Model | 11 | [Detecting GDPR Violations with TLM](gdpr_tlm_blog_post/gdpr_tlm_blog_post.ipynb) | Analyze application logs using TLM to detect GDPR violations | 12 | [Customer Support AI Agent with NeMo Guardrails](NeMo-Guardrails-Customer-Support/README.md) | Reliable customer support AI Agent with Guardrails and trustworthiness scoring ([NVIDIA blog post](https://developer.nvidia.com/blog/prevent-llm-hallucinations-with-the-cleanlab-trustworthy-language-model-in-nvidia-nemo-guardrails/)) | 13 | [Better LLM Evals in MLflow](TLM-MLflow-Integration/evaluating_traces_TLM_mlflow_dl.ipynb) | Automatically find the bad LLM responses lurking in your production logs/traces via trustworthiness scoring in MLflow | 14 | [TLM-Record-Matching](TLM-Record-Matching/data_enrichment_record_matching_tutorial.ipynb) | Using the Trustworthy Language Model to reliably match records between two different data tables | 15 | [TLM-SimpleQA-Benchmark](TLM-SimpleQA-Benchmark/) | Benchmarking TLM and OpenAI LLMs on the SimpleQA dataset | 16 | [benchmarking_hallucination_metrics](benchmarking_hallucination_metrics/benchmark_hallucination_metrics.ipynb) | Evaluate the performance of popular real-time hallucination detection methods on RAG benchmarks | 17 | [benchmarking_hallucination_model](benchmarking_hallucination_model/README.md) | Evaluate the performance of popular hallucination detection models on RAG benchmarks | 18 | [fine_tuning_data_curation](fine_tuning_data_curation/fine_tuning_data_curation.ipynb) | Use Cleanlab TLM and Cleanlab Studio to detect bad data in instruction tuning LLM datasets | 19 | [few_shot_prompt_selection](few_shot_prompt_selection/few_shot_prompt_selection.ipynb) | Clean the pool of few-shot examples to improve prompt template for OpenAI LLM | 20 | [fine_tuning_classification](fine_tuning_classification/fine_tuning_LLM_with_noisy_labels.ipynb) | Use Cleanlab Studio to improve the accuracy of fine-tuned LLMs for classification tasks | 21 | [generate_llm_response](generate_llm_response/generate_llm_response.ipynb) | Generate LLM responses for customer service requests using Llama 2 and OpenAI's API | 22 | [gpt4-rag-logprobs](gpt4-rag-logprobs/gpt4-rag-logprobs.ipynb) | Obtaining logprobs from a GPT-4 based RAG system | 23 | [fine_tuning_mistral_beavertails](fine_tuning_mistral_beavertails/beavertails.ipynb) | Analyze human-annotated AI-safety-related labels (like toxicity) using Cleanlab Studio, and thus generate safer responses from LLMs | 24 | [Evaluating_Toxicity_Datasets_Large_Language_Models](jigsaw_ai_safety_keras/Evaluating_Toxicity_Datasets_Large_Language_Models.ipynb) | Analyze toxicity annotations in the Jigsaw dataset using Cleanlab Studio | 25 | [time_series_automl](time_series_automl/cleanlab_time_series_automl.ipynb) | Model time series data in a tabular format and use AutoML with Cleanlab Studio to improve out-of-sample accuracy | 26 | -------------------------------------------------------------------------------- /TLM-Demo-Notebook/customer-service-chat-categories.csv: -------------------------------------------------------------------------------- 1 | id,ground_truth 2 | RU91,Warranty and Product Support 3 | XL37,Returns and Exchanges 4 | OJ95,Shipping and Delivery 5 | YR96,Returns and Exchanges 6 | XZ39,Returns and Exchanges 7 | HH59,Returns and Exchanges 8 | IG79,Order and Payment 9 | QV90,Warranty and Product Support 10
| TD43,Account Management 11 | JN62,Account Management 12 | RH04,Account Management 13 | AK23,Order and Payment 14 | ZS09,Shipping and Delivery 15 | GU39,Returns and Exchanges 16 | OH91,Warranty and Product Support 17 | IB58,Account Management 18 | JU40,Returns and Exchanges 19 | RH37,Shipping and Delivery 20 | NI19,Account Management 21 | OC25,Order and Payment 22 | GW48,Returns and Exchanges 23 | TN62,Warranty and Product Support 24 | KP59,Order and Payment 25 | KB23,Warranty and Product Support 26 | DR23,Returns and Exchanges 27 | IH81,Account Management 28 | VZ10,Returns and Exchanges 29 | QW08,Warranty and Product Support 30 | XZ51,Account Management 31 | ZI50,Shipping and Delivery 32 | TR30,Order and Payment 33 | DJ87,Returns and Exchanges 34 | MI16,Warranty and Product Support 35 | GR70,Order and Payment 36 | LV17,Warranty and Product Support 37 | ZP58,Account Management 38 | CO18,Shipping and Delivery 39 | HM60,Shipping and Delivery 40 | NQ67,Order and Payment 41 | SC68,Account Management 42 | KI47,Returns and Exchanges 43 | CA33,Returns and Exchanges 44 | QK95,Account Management 45 | BT71,Warranty and Product Support 46 | UF01,Order and Payment 47 | BC81,Account Management 48 | DV36,Warranty and Product Support 49 | UW82,Order and Payment 50 | WC52,Shipping and Delivery 51 | IP22,Account Management 52 | KC46,Order and Payment 53 | PR33,Order and Payment 54 | BL44,Account Management 55 | JA38,Warranty and Product Support 56 | JD34,Account Management 57 | JS75,Returns and Exchanges 58 | KI33,Account Management 59 | YR97,Returns and Exchanges 60 | NK41,Order and Payment 61 | CG12,Warranty and Product Support 62 | RB35,Returns and Exchanges 63 | GI10,Warranty and Product Support 64 | XB06,Returns and Exchanges 65 | UI97,Returns and Exchanges 66 | LU58,Returns and Exchanges 67 | HZ43,Shipping and Delivery 68 | GS74,Returns and Exchanges 69 | FD95,Shipping and Delivery 70 | NE98,Shipping and Delivery 71 | NI84,Warranty and Product Support 72 | CE28,Shipping and Delivery 73 | JK11,Returns and Exchanges 74 | US01,Returns and Exchanges 75 | YD85,Returns and Exchanges 76 | ZJ67,Warranty and Product Support 77 | WM34,Returns and Exchanges 78 | PN99,Shipping and Delivery 79 | LP64,Order and Payment 80 | ER30,Warranty and Product Support 81 | BC62,Returns and Exchanges 82 | UM61,Shipping and Delivery 83 | HS06,Warranty and Product Support 84 | PT61,Order and Payment 85 | YJ93,Shipping and Delivery 86 | SI65,Order and Payment 87 | HP46,Returns and Exchanges 88 | LP24,Order and Payment 89 | VD49,Account Management 90 | LY56,Order and Payment 91 | BO55,Warranty and Product Support 92 | WP54,Warranty and Product Support 93 | HB47,Shipping and Delivery 94 | CF85,Warranty and Product Support 95 | RV60,Warranty and Product Support 96 | IK46,Order and Payment 97 | KQ16,Shipping and Delivery 98 | VW10,Shipping and Delivery 99 | SF57,Account Management 100 | HW85,Warranty and Product Support 101 | RY14,Returns and Exchanges 102 | JS23,Order and Payment 103 | OU82,Order and Payment 104 | GY81,Warranty and Product Support 105 | BF21,Returns and Exchanges 106 | SH90,Account Management 107 | LO72,Returns and Exchanges 108 | SX21,Order and Payment 109 | ER27,Shipping and Delivery 110 | LR33,Returns and Exchanges 111 | DF21,Returns and Exchanges 112 | -------------------------------------------------------------------------------- /TLM-Demo-Notebook/customer-service-policy.md: -------------------------------------------------------------------------------- 1 | The following is the customer service policy of 
ACME Inc. 2 | # ACME Inc. Customer Service Policy 3 | 4 | ## Table of Contents 5 | 1. Free Shipping Policy 6 | 2. Free Returns Policy 7 | 3. Fraud Detection Guidelines 8 | 4. Customer Interaction Tone 9 | 10 | ## 1. Free Shipping Policy 11 | 12 | ### 1.1 Eligibility Criteria 13 | - Free shipping is available on all orders over $50 within the continental United States. 14 | - For orders under $50, a flat rate shipping fee of $5.99 will be applied. 15 | - Free shipping is not available for expedited shipping methods (e.g., overnight or 2-day shipping). 16 | 17 | ### 1.2 Exclusions 18 | - Free shipping does not apply to orders shipped to Alaska, Hawaii, or international destinations. 19 | - Oversized or heavy items may incur additional shipping charges, which will be clearly communicated to the customer before purchase. 20 | 21 | ### 1.3 Handling Customer Inquiries 22 | - If a customer inquires about free shipping eligibility, verify the order total and shipping destination. 23 | - Inform customers of ways to qualify for free shipping (e.g., adding items to reach the $50 threshold). 24 | - For orders just below the threshold, you may offer a one-time courtesy free shipping if it's the customer's first purchase or if they have a history of large orders. 25 | 26 | ## 2. Free Returns Policy 27 | 28 | ### 2.1 Eligibility Criteria 29 | - Free returns are available for all items within 30 days of the delivery date. 30 | - Items must be unused, unworn, and in their original packaging with all tags attached. 31 | - Free returns are limited to standard shipping methods within the continental United States. 32 | 33 | ### 2.2 Exclusions 34 | - Final sale items, as marked on the product page, are not eligible for free returns. 35 | - Customized or personalized items are not eligible for free returns unless there is a manufacturing defect. 36 | - Undergarments, swimwear, and earrings are not eligible for free returns due to hygiene reasons. 37 | 38 | ### 2.3 Process for Handling Returns 39 | 1. Verify the order date and ensure it falls within the 30-day return window. 40 | 2. Ask the customer about the reason for the return and document it in the system. 41 | 3. Provide the customer with a prepaid return label if they qualify for free returns. 42 | 4. Inform the customer of the expected refund processing time (5-7 business days after receiving the return). 43 | 44 | ### 2.4 Exceptions 45 | - For items damaged during shipping or with manufacturing defects, offer an immediate replacement or refund without requiring a return. 46 | - For returns outside the 30-day window, use discretion based on the customer's history and the reason for the late return. You may offer store credit as a compromise. 47 | 48 | ## 3. Fraud Detection Guidelines 49 | 50 | ### 3.1 Red Flags for Potential Fraud 51 | - Multiple orders from the same IP address with different customer names or shipping addresses. 52 | - Orders with unusually high quantities of the same item. 53 | - Shipping address different from the billing address, especially if in different countries. 54 | - Multiple failed payment attempts followed by a successful one. 55 | - Customers pressuring for immediate shipping or threatening to cancel the order. 56 | 57 | ### 3.2 Verification Process 58 | 1. For orders flagged as potentially fraudulent, place them on hold for review. 59 | 2. Verify the customer's identity by calling the phone number on file. 60 | 3. Request additional documentation (e.g., photo ID, credit card statement) if necessary. 61 | 4.
Cross-reference the shipping address with known fraud databases. 62 | 63 | ### 3.3 Actions for Confirmed Fraud 64 | - Cancel the order immediately and refund any charges. 65 | - Document the incident in the customer's account and flag it for future reference. 66 | - Report confirmed fraud cases to the appropriate authorities and credit card companies. 67 | 68 | ### 3.4 False Positives 69 | - If a legitimate customer is flagged, apologize for the inconvenience and offer a small discount or free shipping on their next order. 70 | - Document the incident to improve our fraud detection algorithms. 71 | 72 | ## 4. Customer Interaction Tone 73 | 74 | ### 4.1 General Guidelines 75 | - Always maintain a professional, friendly, and empathetic tone. 76 | - Use the customer's name when addressing them. 77 | - Listen actively and paraphrase the customer's concerns to ensure understanding. 78 | - Avoid negative language; focus on what can be done rather than what can't. 79 | 80 | ### 4.2 Specific Scenarios 81 | 82 | #### Angry or Frustrated Customers 83 | - Remain calm and do not take comments personally. 84 | - Acknowledge the customer's feelings and apologize for their negative experience. 85 | - Focus on finding a solution and clearly explain the steps you'll take to resolve the issue. 86 | - If necessary, offer to escalate the issue to a supervisor. 87 | 88 | #### Confused or Indecisive Customers 89 | - Be patient and offer clear, concise explanations. 90 | - Ask probing questions to better understand their needs. 91 | - Provide options and explain the pros and cons of each. 92 | - Offer to send follow-up information via email if the customer needs time to decide. 93 | 94 | #### VIP or Loyal Customers 95 | - Acknowledge their status and thank them for their continued business. 96 | - Be familiar with their purchase history and preferences. 97 | - Offer exclusive deals or early access to new products when appropriate. 98 | - Go above and beyond to exceed their expectations. 99 | 100 | ### 4.3 Language and Phrasing 101 | - Use positive language: "I'd be happy to help you with that" instead of "I can't do that." 102 | - Avoid technical jargon or abbreviations that customers may not understand. 103 | - Use "we" statements to show unity with the company: "We value your feedback" instead of "The company values your feedback." 104 | - End conversations on a positive note: "Is there anything else I can assist you with today?" 105 | 106 | ### 4.4 Written Communication 107 | - Use proper grammar, spelling, and punctuation in all written communications. 108 | - Keep emails and chat responses concise and to the point. 109 | - Use bullet points or numbered lists for clarity when providing multiple pieces of information. 110 | - Include a clear call-to-action or next steps at the end of each communication. 111 | 112 | The following dialogue features a discussion between a user and a customer service bot. The bot attempts to help the customer but must respect the guidelines in the customer service policy. The bot provides very accurate and concise answers. The bot does not tell the user to contact customer service 113 | Remember, as a representative of ACME Inc., you are often the first point of contact for our customers. Your interactions should always reflect our commitment to exceptional customer service and satisfaction. 
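The Colang flow in `tlm.co` earlier routes any bot response whose TLM trustworthiness score falls below 0.7 to a fallback message. As a minimal illustrative sketch (not a file in this repo), the same guardrail logic can be expressed directly in Python with the `cleanlab_tlm` client; the function name and prompt layout here are assumptions, and a `CLEANLAB_TLM_API_KEY` environment variable is assumed to be set. The real integration instead calls TLM through the NeMo Guardrails `execute call cleanlab api` action.

```python
from cleanlab_tlm import TLM

# Fallback wording taken from the `bot respond untrustworthy` message in tlm.co.
FALLBACK_MESSAGE = (
    "I'm sorry, I am unable to help with this request. "
    "I'll connect you with another agent who can help..."
)

def guarded_reply(policy_text: str, user_message: str, draft_reply: str) -> str:
    """Hypothetical helper: return the drafted reply only if TLM trusts it."""
    tlm = TLM()
    # Score the reply against the same context the bot saw: the full
    # customer service policy plus the user's message.
    prompt = f"{policy_text}\n\nUser: {user_message}"
    result = tlm.get_trustworthiness_score(prompt, draft_reply)
    # Mirror the 0.7 threshold used by the `cleanlab trustworthiness` flow.
    if result["trustworthiness_score"] < 0.7:
        return FALLBACK_MESSAGE
    return draft_reply
```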
-------------------------------------------------------------------------------- /TLM-MLflow-Integration/evaluating_traces_TLM_mlflow_dl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Automatically find the bad LLM responses in your LLM Evals with Cleanlab\n", 8 | "\n", 9 | "This guide will walk you through the process of evaluating LLM responses captured in MLflow with Cleanlab's Trustworthy Language Model (TLM).\n", 10 | "\n", 11 | "TLM boosts the reliability of any LLM application by indicating when the model's response is untrustworthy.\n", 12 | "\n", 13 | "This guide requires a Cleanlab TLM API key. If you don't have one, you can sign up for a free trial [here](https://tlm.cleanlab.ai/)." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Install dependencies & Set environment variables" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "%%bash\n", 30 | "pip install -q mlflow openai cleanlab-tlm --upgrade" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import mlflow\n", 40 | "import os\n", 41 | "import json\n", 42 | "import pandas as pd\n", 43 | "from getpass import getpass\n", 44 | "import dotenv\n", 45 | "dotenv.load_dotenv()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## API Keys\n", 53 | "\n", 54 | "This guide requires two API keys:\n", 55 | "- [OpenAI API Key](https://platform.openai.com/api-keys)\n", 56 | "- [Cleanlab TLM API Key](https://tlm.cleanlab.ai/)\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "if not (openai_api_key := os.getenv(\"OPENAI_API_KEY\")):\n", 66 | "    openai_api_key = getpass(\"🔑 Enter your OpenAI API key: \")\n", 67 | "if not (cleanlab_tlm_api_key := os.getenv(\"CLEANLAB_TLM_API_KEY\")):\n", 68 | "    cleanlab_tlm_api_key = getpass(\"🔑 Enter your Cleanlab TLM API key: \")\n", 69 | "\n", 70 | "os.environ[\"OPENAI_API_KEY\"] = openai_api_key\n", 71 | "os.environ[\"CLEANLAB_TLM_API_KEY\"] = cleanlab_tlm_api_key" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Set Up MLflow Tracking Server and Logging\n", 79 | "\n", 80 | "You can run this tutorial and log models and experiments without setting up a tracking server. In this mode, your experiment data and artifacts are saved directly under your current directory."
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# This will start a server on port 8080, in the background\n", 90 | "# Navigate to http://localhost:8080 to see the MLflow UI\n", 91 | "%%bash --bg\n", 92 | "mlflow server --host 127.0.0.1 --port 8080" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Set up MLflow tracking server\n", 102 | "mlflow.set_tracking_uri(\"http://localhost:8080\")\n", 103 | "\n", 104 | "# Enable logging for OpenAI SDK\n", 105 | "mlflow.openai.autolog()\n", 106 | "\n", 107 | "# Set experiment name\n", 108 | "mlflow.set_experiment(\"Eval OpenAI Traces w/ TLM\")\n", 109 | "\n", 110 | "# Get experiment ID\n", 111 | "experiment_id = mlflow.get_experiment_by_name(\"Eval OpenAI Traces w/ TLM\").experiment_id" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## Prepare trace dataset and load into MLflow" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "For the sake of demonstration purposes, we'll briefly generate some traces and track them in MLflow. Typically, you would have already captured traces in MLflow and would skip to \"Download trace dataset from MLflow\"\n", 126 | "\n", 127 | "NOTE: TLM requires the entire input to the LLM to be provided. This includes any system prompts, context, or other information that was originally provided to the LLM to generate the response. Notice below that we include the system prompt in the trace metadata since by default the trace does not include the system prompt within the input." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "import openai" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# Let's use some tricky trivia questions to generate some traces\n", 146 | "trivia_questions = [ \n", 147 | " \"What is the 3rd month of the year in alphabetical order?\",\n", 148 | " \"What is the capital of France?\",\n", 149 | " \"How many seconds are in 100 years?\",\n", 150 | " \"Alice, Bob, and Charlie went to a café. Alice paid twice as much as Bob, and Bob paid three times as much as Charlie. 
If the total bill was $72, how much did each person pay?\",\n", 151 | " \"When was the Declaration of Independence signed?\"\n", 152 | "]\n", 153 | "\n", 154 | "def generate_answers(trivia_question):\n", 155 | " system_prompt = \"You are a trivia master.\"\n", 156 | "\n", 157 | " response = openai.chat.completions.create(\n", 158 | " model=\"gpt-3.5-turbo\",\n", 159 | " messages=[\n", 160 | " {\"role\": \"system\", \"content\": system_prompt},\n", 161 | " {\"role\": \"user\", \"content\": trivia_question},\n", 162 | " ],\n", 163 | " )\n", 164 | " \n", 165 | " answer = response.choices[0].message.content\n", 166 | " return answer\n", 167 | "\n", 168 | "\n", 169 | "# Generate answers\n", 170 | "answers = []\n", 171 | "for i in range(len(trivia_questions)):\n", 172 | " answer = generate_answers(trivia_questions[i])\n", 173 | " answers.append(answer) \n", 174 | " print(f\"Question {i+1}: {trivia_questions[i]}\")\n", 175 | " print(f\"Answer {i+1}:\\n{answer}\\n\")\n", 176 | "\n", 177 | "print(f\"Generated {len(answers)} answers and tracked them in MLflow.\")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Download trace dataset from MLflow\n", 185 | "\n", 186 | "Fetching traces from MLflow is straightforward. Just set up the MLflow client and use one of its functions to fetch the data. We'll fetch the traces and evaluate them. After that, we'll add our scores back into MLflow.\n", 187 | "\n", 188 | "The `search_traces()` function has arguments to filter the traces by tags, timestamps, and beyond. You can find more about other methods to [query traces](https://mlflow.org/docs/latest/python_api/mlflow.client.html#mlflow.client.MlflowClient.search_traces) in the docs.\n", 189 | "\n", 190 | "In this example, we'll fetch all traces from the experiment." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "client = mlflow.client.MlflowClient()\n", 200 | "traces = client.search_traces(experiment_ids=[experiment_id])" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Generate evaluations with TLM\n", 208 | "\n", 209 | "Instead of running TLM individually on each trace, we'll provide all of the prompt, response pairs in a list to TLM in a single call. This is more efficient and allows us to get scores and explanations for all of the traces at once. 
Then, using the `request.id`, we can attach the scores and explanations back to the correct trace in MLflow.\n", 210 | "\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 8, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "from cleanlab_tlm import TLM\n", 220 | "\n", 221 | "tlm = TLM(options={\"log\": [\"explanation\"]})" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 9, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "# This helper method will extract the prompt and response from each trace and return three lists: request ID's, prompts, and responses.\n", 231 | "def get_prompt_response_pairs(traces):\n", 232 | " prompts = []\n", 233 | " responses = []\n", 234 | " for trace in traces:\n", 235 | " # Parse request and response JSON\n", 236 | " request_data = json.loads(trace.data.request)\n", 237 | " response_data = json.loads(trace.data.response)\n", 238 | " \n", 239 | " # Extract system prompt and user message from request\n", 240 | " system_prompt = request_data[\"messages\"][0][\"content\"]\n", 241 | " user_message = request_data[\"messages\"][1][\"content\"]\n", 242 | " \n", 243 | " # Extract assistant's response from response\n", 244 | " assistant_response = response_data[\"choices\"][0][\"message\"][\"content\"]\n", 245 | " \n", 246 | " prompts.append(system_prompt + \"\\n\" + user_message)\n", 247 | " responses.append(assistant_response)\n", 248 | " return prompts, responses\n", 249 | "\n", 250 | "request_ids = [trace.info.request_id for trace in traces]\n", 251 | "prompts, responses = get_prompt_response_pairs(traces)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "Now, let's use TLM to generate a `trustworthiness score` and `explanation` for each trace.\n", 259 | "\n", 260 | "**IMPORTANT:** It is essential to always include any system prompts, context, or other information that was originally provided to the LLM to generate the response. You should construct the prompt input to `get_trustworthiness_score()` in a way that is as similar as possible to the original prompt. This is why we included the system prompt in the trace metadata." 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# Evaluate each of the prompt, response pairs using TLM\n", 270 | "evaluations = tlm.get_trustworthiness_score(prompts, responses)\n", 271 | "\n", 272 | "# Extract the trustworthiness scores and explanations from the evaluations\n", 273 | "trust_scores = [entry[\"trustworthiness_score\"] for entry in evaluations]\n", 274 | "explanations = [entry[\"log\"][\"explanation\"] for entry in evaluations]\n", 275 | "\n", 276 | "# Create a DataFrame with the evaluation results\n", 277 | "trace_evaluations = pd.DataFrame({\n", 278 | " 'request_id': request_ids,\n", 279 | " 'prompt': prompts,\n", 280 | " 'response': responses, \n", 281 | " 'trust_score': trust_scores,\n", 282 | " 'explanation': explanations\n", 283 | "})" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "Awesome! Now we have a DataFrame mapping trace IDs to their scores and explanations. 
We've also included the prompt and response for each trace for demonstration purposes to find the **least trustworthy trace!**" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "sorted_df = trace_evaluations.sort_values(by=\"trust_score\", ascending=True)\n", 300 | "sorted_df.head(3)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "# Let's look at the least trustworthy trace.\n", 310 | "print(\"Prompt: \", sorted_df.iloc[0][\"prompt\"], \"\\n\")\n", 311 | "print(\"OpenAI Response: \", sorted_df.iloc[0][\"response\"], \"\\n\")\n", 312 | "print(\"TLM Trust Score: \", sorted_df.iloc[0][\"trust_score\"], \"\\n\")\n", 313 | "print(\"TLM Explanation: \", sorted_df.iloc[0][\"explanation\"])\n" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "#### Awesome! TLM was able to identify multiple traces that contained incorrect answers from OpenAI." 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "Let's upload the `trust_score` and `explanation` columns to MLflow." 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "## Upload evaluations to MLflow" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 13, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "for idx, row in trace_evaluations.iterrows():\n", 344 | " request_id = row[\"request_id\"]\n", 345 | " trust_score = row[\"trust_score\"]\n", 346 | " explanation = row[\"explanation\"]\n", 347 | " \n", 348 | " # Add the trustworthiness score and explanation to the trace as a tag\n", 349 | " client.set_trace_tag(request_id=request_id, key=\"trust_score\", value=trust_score)\n", 350 | " client.set_trace_tag(request_id=request_id, key=\"explanation\", value=explanation)\n", 351 | " " 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "You should now see the TLM trustworthiness score and explanation in the MLflow UI!\n", 359 | "\n", 360 | "\n", 361 | "From here you can continue collecting and evaluating traces!" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "# Evaluator\n", 369 | "\n", 370 | "Here's how you might use TLM with MLflow Evaluation. This will log a table of trustworthiness scores and explanations and also provide an interface in the UI for comparing scores across runs. For example, you could use this to compare the trustworthiness scores of different models across the same set of prompts." 
371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 29, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "import mlflow\n", 380 | "from mlflow.metrics import MetricValue, make_metric\n", 381 | "from cleanlab_tlm import TLM\n", 382 | "\n", 383 | "def _tlm_eval_fn(predictions, inputs, targets=None):\n", 384 | "    \"\"\"\n", 385 | "    Evaluate trustworthiness using Cleanlab TLM.\n", 386 | "    \n", 387 | "    Args:\n", 388 | "        predictions: The model outputs/answers\n", 389 | "        inputs: The prompts that were given to the model\n", 390 | "        targets: Not used for this metric\n", 391 | "    \"\"\"\n", 392 | "    # Initialize TLM\n", 393 | "    tlm = TLM(options={\"log\": [\"explanation\"]})\n", 394 | "    inputs = inputs.to_list()\n", 395 | "    predictions = predictions.to_list()\n", 396 | "    \n", 397 | "    # Get trustworthiness scores\n", 398 | "    evaluations = tlm.get_trustworthiness_score(inputs, predictions)\n", 399 | "    \n", 400 | "    # Extract scores and explanations\n", 401 | "    scores = [float(eval_result[\"trustworthiness_score\"]) for eval_result in evaluations]\n", 402 | "    justifications = [eval_result[\"log\"][\"explanation\"] for eval_result in evaluations]\n", 403 | "    \n", 404 | "    # Return metric value\n", 405 | "    return MetricValue(\n", 406 | "        scores=scores,\n", 407 | "        justifications=justifications,\n", 408 | "        aggregate_results={\n", 409 | "            \"mean\": sum(scores) / len(scores),\n", 410 | "            \"min\": min(scores),\n", 411 | "            \"max\": max(scores)\n", 412 | "        }\n", 413 | "    )\n", 414 | "\n", 415 | "def tlm_trustworthiness():\n", 416 | "    \"\"\"Creates a metric for evaluating trustworthiness using Cleanlab TLM\"\"\"\n", 417 | "    return make_metric(\n", 418 | "        eval_fn=_tlm_eval_fn,\n", 419 | "        greater_is_better=True,\n", 420 | "        name=\"tlm_trustworthiness\"\n", 421 | "    )" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "tlm_metric = tlm_trustworthiness()\n", 431 | "\n", 432 | "eval_df = pd.DataFrame({\n", 433 | "    'inputs': prompts,\n", 434 | "    'outputs': answers\n", 435 | "})\n", 436 | "\n", 437 | "\n", 438 | "results = mlflow.evaluate(\n", 439 | "    data=eval_df,\n", 440 | "    predictions=\"outputs\",\n", 441 | "    model=None,\n", 442 | "    extra_metrics=[tlm_metric],\n", 443 | "    evaluator_config={\n", 444 | "        \"col_mapping\": {\n", 445 | "            \"inputs\": \"inputs\",\n", 446 | "            \"predictions\": \"outputs\"\n", 447 | "        }\n", 448 | "    }\n", 449 | ")" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "# Tracing TLM\n", 457 | "\n", 458 | "You could also trace the TLM trustworthiness metric itself. This will log the trustworthiness scores and explanations for each trace.\n", 459 | "\n", 460 | "Ultimately you would likely want to set this up with a parent span and nested spans, or just entirely separate spans, when the user passes in a list of prompts and responses, perhaps like [this](https://mlflow.org/docs/latest/tracing/api/manual-instrumentation#context-manager). It would also be fairly straightforward to set up a custom MLflow model (or even just a simple function) that invokes the OpenAI model, passes the results to TLM, and traces both."
461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "# Tracing TLM\n", 470 | "\n", 471 | "@mlflow.trace\n", 472 | "def tlm_trustworthiness_wrapper(inputs, predictions):\n", 473 | "    tlm = TLM(options={\"log\": [\"explanation\"]})\n", 474 | "    evaluations = tlm.get_trustworthiness_score(inputs, predictions)\n", 475 | "    return evaluations\n", 476 | "\n", 477 | "tlm_trustworthiness_wrapper(prompts[0], answers[0])" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [] 484 | } 485 | ], 486 | "metadata": { 487 | "kernelspec": { 488 | "display_name": "phoenix", 489 | "language": "python", 490 | "name": "python3" 491 | }, 492 | "language_info": { 493 | "codemirror_mode": { 494 | "name": "ipython", 495 | "version": 3 496 | }, 497 | "file_extension": ".py", 498 | "mimetype": "text/x-python", 499 | "name": "python", 500 | "nbconvert_exporter": "python", 501 | "pygments_lexer": "ipython3", 502 | "version": "3.12.8" 503 | } 504 | }, 505 | "nbformat": 4, 506 | "nbformat_minor": 2 507 | } 508 | -------------------------------------------------------------------------------- /TLM-SimpleQA-Benchmark/README.md: -------------------------------------------------------------------------------- 1 | # TLM-SimpleQA-Benchmark 2 | 3 | This folder contains the dataset and the code to reproduce the SimpleQA benchmark we published in our [blog post](https://cleanlab.ai/blog/simpleqa/). 4 | 5 | API keys: 6 | - A Cleanlab API key is required to run this benchmark; get one at https://tlm.cleanlab.ai 7 | - An OpenAI API key is also required to run this benchmark; get one at https://platform.openai.com/api-keys 8 | 9 | To reproduce the benchmarks: 10 | - Use the `get_tlm_response.ipynb` notebook to get and save the TLM responses and trustworthiness scores. 11 | - Use the `evaluate_response.ipynb` notebook to evaluate the TLM responses.
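As a reference for how the thresholded ("best") result files could be produced, here is a hypothetical sketch that assumes the `-25` and `-80` suffixes in the `results/` filenames correspond to trustworthiness-score thresholds of 0.25 and 0.80, and that each results CSV contains `response` and `trustworthiness_score` columns as saved by `get_tlm_response.ipynb`. The abstention wording is also an assumption; the SimpleQA grader only needs the response to read as "not attempted".

```python
import pandas as pd

# Hypothetical abstention text; any phrasing the grader marks "not attempted" works.
ABSTAIN_TEXT = "I don't know."

def apply_trust_threshold(results_csv: str, threshold: float) -> pd.DataFrame:
    """Replace answers whose trustworthiness score falls below `threshold`."""
    df = pd.read_csv(results_csv)
    low_trust = df["trustworthiness_score"] < threshold
    df.loc[low_trust, "response"] = ABSTAIN_TEXT
    return df

# For example, a 0.80-threshold variant of the TLM responses:
# df_80 = apply_trust_threshold("results/gpt-4o-best-responses.csv", 0.80)
```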
12 | 13 | -------------------------------------------------------------------------------- /TLM-SimpleQA-Benchmark/evaluate_response.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2a62d828-e8bc-471c-aa4e-e47a3441a01e", 6 | "metadata": {}, 7 | "source": [ 8 | "# Evaluate LLM Responses" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "651cc58f-df2f-42b3-aaf5-46ae2fde6317", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "os.environ[\"OPENAI_API_KEY\"] = \" str:\n", 67 | " grader_prompt = GRADER_TEMPLATE.format(\n", 68 | " question=question,\n", 69 | " target=target,\n", 70 | " predicted_answer=predicted_answer,\n", 71 | " )\n", 72 | " \n", 73 | " prompt_messages = [\n", 74 | " self.grader_model._pack_message(content=grader_prompt, role=\"user\")\n", 75 | " ]\n", 76 | " grading_response = self.grader_model(prompt_messages)\n", 77 | " \n", 78 | " match = re.search(r\"(A|B|C)\", grading_response)\n", 79 | " return match.group(0) if match else \"C\" # Default to \"NOT_ATTEMPTED\" if no match\n", 80 | "\n", 81 | " def __call__(self) -> EvalResult:\n", 82 | " def fn(row: dict):\n", 83 | " prompt_messages = [\n", 84 | " self.grader_model._pack_message(content=row.get(\"problem\", \"\"), role=\"user\")\n", 85 | " ]\n", 86 | " response_text = row.get(\"response\", \"\")\n", 87 | " grade_letter = self.grade_sample(row.get(\"problem\", \"\"), row.get(\"answer\", \"\"), response_text)\n", 88 | " \n", 89 | " # Metrics based on grading response\n", 90 | " is_correct = grade_letter == \"A\"\n", 91 | " is_incorrect = grade_letter == \"B\"\n", 92 | " is_not_attempted = grade_letter == \"C\"\n", 93 | " \n", 94 | " score = is_correct\n", 95 | "\n", 96 | " # Create HTML for each sample result\n", 97 | " html = common.jinja_env.from_string(common.HTML_JINJA).render(\n", 98 | " prompt_messages=prompt_messages,\n", 99 | " next_message=dict(content=response_text, role=\"assistant\"),\n", 100 | " score=score,\n", 101 | " correct_answer=row[\"answer\"],\n", 102 | " extracted_answer=response_text,\n", 103 | " )\n", 104 | " convo = prompt_messages + [dict(content=response_text, role=\"assistant\")]\n", 105 | " return SingleEvalResult(html=html, score=score, convo=convo, metrics={\n", 106 | " \"is_correct\": is_correct,\n", 107 | " \"is_incorrect\": is_incorrect,\n", 108 | " \"is_not_attempted\": is_not_attempted\n", 109 | " })\n", 110 | "\n", 111 | " # Run evaluation and collect results\n", 112 | " results = common.map_with_progress(fn, self.examples)\n", 113 | "\n", 114 | " # Aggregate metrics\n", 115 | " aggregate_metrics = {\n", 116 | " \"is_correct\": sum(result.metrics[\"is_correct\"] for result in results) / len(results),\n", 117 | " \"is_incorrect\": sum(result.metrics[\"is_incorrect\"] for result in results) / len(results),\n", 118 | " \"is_not_attempted\": sum(result.metrics[\"is_not_attempted\"] for result in results) / len(results),\n", 119 | " }\n", 120 | " aggregate_metrics[\"is_given_attempted\"] = aggregate_metrics[\"is_correct\"] + aggregate_metrics[\"is_incorrect\"]\n", 121 | " # Calculate accuracy_given_attempted\n", 122 | " aggregate_metrics[\"accuracy_given_attempted\"] = (\n", 123 | " aggregate_metrics[\"is_correct\"]\n", 124 | " / aggregate_metrics[\"is_given_attempted\"]\n", 125 | " if aggregate_metrics[\"is_given_attempted\"] > 0\n", 126 | " else 0\n", 127 | " )\n", 128 | " print(\"AGGREGATE METRICS\") \n", 129 | " 
print(aggregate_metrics) \n", 130 | "        print(\"##################\")\n", 131 | "\n", 132 | "        output_d = {\n", 133 | "            \"accuracy_given_attempted\": aggregate_metrics[\"accuracy_given_attempted\"],\n", 134 | "            \"f1\": (\n", 135 | "                2 * aggregate_metrics[\"accuracy_given_attempted\"] * aggregate_metrics[\"is_correct\"]\n", 136 | "                / (aggregate_metrics[\"accuracy_given_attempted\"] + aggregate_metrics[\"is_correct\"])\n", 137 | "                if (aggregate_metrics[\"accuracy_given_attempted\"] + aggregate_metrics[\"is_correct\"]) > 0\n", 138 | "                else 0\n", 139 | "            )\n", 140 | "        }\n", 141 | "        \n", 142 | "        print(f\"Accuracy Given Attempted: {output_d['accuracy_given_attempted']:.3f}\")\n", 143 | "        print(f\"F1 Score: {output_d['f1']:.3f}\")\n", 144 | "        \n", 145 | "        return common.aggregate_results(results)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "2954184a-2b21-43fa-819a-72ff5b27fb35", 151 | "metadata": {}, 152 | "source": [ 153 | "In the next cell, we show the evaluation for the GPT-4o baseline responses. Here we only run the evaluation on the first 10 examples as a sample (to run it on all examples, set `num_examples` to `None`)." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 4, 159 | "id": "b015b3c6-8b0d-4f76-9fb9-75175d95d983", 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stderr", 164 | "output_type": "stream", 165 | "text": [ 166 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 12.43it/s]" 167 | ] 168 | }, 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "AGGREGATE METRICS\n", 174 | "{'is_correct': 0.3, 'is_incorrect': 0.7, 'is_not_attempted': 0.0, 'is_given_attempted': 1.0, 'accuracy_given_attempted': 0.3}\n", 175 | "##################\n", 176 | "Accuracy Given Attempted: 0.300\n", 177 | "F1 Score: 0.300\n" 178 | ] 179 | }, 180 | { 181 | "name": "stderr", 182 | "output_type": "stream", 183 | "text": [ 184 | "\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "gpt_4o_baseline = pd.read_csv(\"results/gpt-4o-baseline-responses.csv\")\n", 190 | "grading_sampler = ChatCompletionSampler(model=\"gpt-4o\")\n", 191 | "\n", 192 | "simple_qa_eval = SimpleQAEval(grading_sampler, dataset=gpt_4o_baseline, num_examples=10)\n", 193 | "res = simple_qa_eval()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "id": "5f222b44-91f3-4ae4-810f-60cf525ba74e", 199 | "metadata": {}, 200 | "source": [ 201 | "Execute the function below to evaluate all datasets generated in the [get_tlm_response.ipynb](get_tlm_response.ipynb) notebook."
202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 5, 207 | "id": "c3f64dbb-8f85-4d04-9401-ccf532428335", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "def evaluate_all_datasets():\n", 212 | " dataset_list = [\n", 213 | " \"results/gpt-4o-baseline-responses.csv\",\n", 214 | " \"results/gpt-4o-baseline-25-responses.csv\",\n", 215 | " \"results/gpt-4o-baseline-80-responses.csv\",\n", 216 | " \"results/gpt-4o-best-responses.csv\",\n", 217 | " \"results/gpt-4o-best-25-responses.csv\",\n", 218 | " \"results/gpt-4o-best-80-responses.csv\",\n", 219 | " ]\n", 220 | "\n", 221 | " grading_sampler = ChatCompletionSampler(model=\"gpt-4o\")\n", 222 | "\n", 223 | " for dataset in dataset_list:\n", 224 | " print(dataset)\n", 225 | " df = pd.read_csv(dataset)\n", 226 | " simple_qa_eval = SimpleQAEval(grading_sampler, dataset=df)\n", 227 | " res = simple_qa_eval()\n", 228 | " print()\n", 229 | "\n", 230 | "# evaluate_all_datasets()" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3 (ipykernel)", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.10.14" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 5 255 | } 256 | -------------------------------------------------------------------------------- /TLM-SimpleQA-Benchmark/get_tlm_response.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cf3ab210-ed11-4500-b829-18a44402fcc6", 6 | "metadata": {}, 7 | "source": [ 8 | "# Obtaining TLM Responses for SimpleQA Dataset" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "4c0d1617", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Set your API key\n", 19 | "import os\n", 20 | "os.environ[\"CLEANLAB_TLM_API_KEY\"] = \"\" # Get your API key from: https://tlm.cleanlab.ai/" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "cf73e3c8-b9e9-4b78-bdf1-0d6a28a27bc9", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "from cleanlab_tlm import TLM" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "6165fafc-48a5-4fc8-a207-c0738fed6ea0", 38 | "metadata": { 39 | "scrolled": true 40 | }, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | "
metadataproblemanswer
0{'topic': 'Science and technology', 'answer_ty...Who received the IEEE Frank Rosenblatt Award i...Michio Sugeno
1{'topic': 'Science and technology', 'answer_ty...Who was awarded the Oceanography Society's Jer...Annick Bricaud
2{'topic': 'Geography', 'answer_type': 'Place',...What's the name of the women's liberal arts co...Radcliffe College
3{'topic': 'Sports', 'answer_type': 'Person', '...In whose honor was the Leipzig 1877 tournament...Adolf Anderssen
4{'topic': 'Art', 'answer_type': 'Person', 'url...According to Karl Küchler, what did Empress El...Poet Henrich Heine.
\n", 101 | "
" 102 | ], 103 | "text/plain": [ 104 | " metadata \\\n", 105 | "0 {'topic': 'Science and technology', 'answer_ty... \n", 106 | "1 {'topic': 'Science and technology', 'answer_ty... \n", 107 | "2 {'topic': 'Geography', 'answer_type': 'Place',... \n", 108 | "3 {'topic': 'Sports', 'answer_type': 'Person', '... \n", 109 | "4 {'topic': 'Art', 'answer_type': 'Person', 'url... \n", 110 | "\n", 111 | " problem answer \n", 112 | "0 Who received the IEEE Frank Rosenblatt Award i... Michio Sugeno \n", 113 | "1 Who was awarded the Oceanography Society's Jer... Annick Bricaud \n", 114 | "2 What's the name of the women's liberal arts co... Radcliffe College \n", 115 | "3 In whose honor was the Leipzig 1877 tournament... Adolf Anderssen \n", 116 | "4 According to Karl Küchler, what did Empress El... Poet Henrich Heine. " 117 | ] 118 | }, 119 | "execution_count": 2, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "url = \"https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv\"\n", 126 | "data = pd.read_csv(url)\n", 127 | "data.head()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "6001f900-f40c-493d-9dd6-e2952c6e8481", 133 | "metadata": {}, 134 | "source": [ 135 | "## Prompt TLM\n", 136 | "Using TLM on `medium` quality preset is the same as directly querying OpenAI for responses, but also returns a trustworthiness score." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "id": "b472a5e4-feb2-47cc-bfc6-aa76fb0a6296", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stderr", 147 | "output_type": "stream", 148 | "text": [ 149 | "Querying TLM... 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████|\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "tlm = TLM(\"medium\", options={\"model\": \"gpt-4o\"})\n", 155 | "tlm_results = tlm.try_prompt(data[\"problem\"].tolist())" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "id": "789a0d32-646a-4070-9c68-0c48a0b056af", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "
\n", 168 | "\n", 181 | "\n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | "
metadataproblemanswerresponsetrustworthiness_score
0{'topic': 'Science and technology', 'answer_ty...Who received the IEEE Frank Rosenblatt Award i...Michio SugenoThe IEEE Frank Rosenblatt Award was establishe...0.597950
1{'topic': 'Science and technology', 'answer_ty...Who was awarded the Oceanography Society's Jer...Annick BricaudIn 2018, the Oceanography Society's Jerlov Awa...0.275720
2{'topic': 'Geography', 'answer_type': 'Place',...What's the name of the women's liberal arts co...Radcliffe CollegeThe women's liberal arts college in Cambridge,...0.597658
3{'topic': 'Sports', 'answer_type': 'Person', '...In whose honor was the Leipzig 1877 tournament...Adolf AnderssenThe Leipzig 1877 chess tournament was organize...0.352430
4{'topic': 'Art', 'answer_type': 'Person', 'url...According to Karl Küchler, what did Empress El...Poet Henrich Heine.Empress Elizabeth of Austria's favorite sculpt...0.970854
\n", 235 | "
" 236 | ], 237 | "text/plain": [ 238 | " metadata \\\n", 239 | "0 {'topic': 'Science and technology', 'answer_ty... \n", 240 | "1 {'topic': 'Science and technology', 'answer_ty... \n", 241 | "2 {'topic': 'Geography', 'answer_type': 'Place',... \n", 242 | "3 {'topic': 'Sports', 'answer_type': 'Person', '... \n", 243 | "4 {'topic': 'Art', 'answer_type': 'Person', 'url... \n", 244 | "\n", 245 | " problem answer \\\n", 246 | "0 Who received the IEEE Frank Rosenblatt Award i... Michio Sugeno \n", 247 | "1 Who was awarded the Oceanography Society's Jer... Annick Bricaud \n", 248 | "2 What's the name of the women's liberal arts co... Radcliffe College \n", 249 | "3 In whose honor was the Leipzig 1877 tournament... Adolf Anderssen \n", 250 | "4 According to Karl Küchler, what did Empress El... Poet Henrich Heine. \n", 251 | "\n", 252 | " response trustworthiness_score \n", 253 | "0 The IEEE Frank Rosenblatt Award was establishe... 0.597950 \n", 254 | "1 In 2018, the Oceanography Society's Jerlov Awa... 0.275720 \n", 255 | "2 The women's liberal arts college in Cambridge,... 0.597658 \n", 256 | "3 The Leipzig 1877 chess tournament was organize... 0.352430 \n", 257 | "4 Empress Elizabeth of Austria's favorite sculpt... 0.970854 " 258 | ] 259 | }, 260 | "execution_count": 5, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "results_df = pd.concat([data, pd.DataFrame(tlm_results)], axis=1)\n", 267 | "results_df.head()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 6, 273 | "id": "189b222f-6957-461f-8aa2-83a6796b2301", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "results_df.to_csv(\"results/gpt-4o-baseline-responses.csv\", index=None)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "id": "4d31d0a1-ad15-4a11-a9cc-12398adade26", 283 | "metadata": {}, 284 | "source": [ 285 | "## Filter results using TLM Trustworthiness Score" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 7, 291 | "id": "a92a3bf1-b3e2-4d99-aeab-241edc9fe0ed", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "def filter_responses(results_df, threshold):\n", 296 | " filtered_results = results_df.copy()\n", 297 | " filter_idx = filtered_results[filtered_results[\"trustworthiness_score\"] < threshold].index\n", 298 | " filtered_results.loc[filter_idx, \"response\"] = \"I'm sorry, I don’t know the answer to that question.\"\n", 299 | "\n", 300 | " return filtered_results" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 8, 306 | "id": "c3621820-5f4a-48fb-88a1-4796c92df9bb", 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "# filter results at threshold = 0.25\n", 311 | "filter_25 = filter_responses(results_df, 0.25)\n", 312 | "\n", 313 | "# filter results at threshold = 0.8\n", 314 | "filter_80 = filter_responses(results_df, 0.8)\n", 315 | "\n", 316 | "filter_25.to_csv(\"results/gpt-4o-baseline-25-responses.csv\", index=None)\n", 317 | "filter_80.to_csv(\"results/gpt-4o-baseline-80-responses.csv\", index=None)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "id": "6af9950d-082e-4a0f-8412-635b9e927fa5", 323 | "metadata": {}, 324 | "source": [ 325 | "## Repeat the same process for `best` quality preset\n", 326 | "Using TLM on `best` quality preset improves the LLM responses." 
327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 9, 332 | "id": "79fe3fb8-0ae4-4e61-8e34-e43e682d1917", 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "name": "stderr", 337 | "output_type": "stream", 338 | "text": [ 339 | "Querying TLM... 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████|\n" 340 | ] 341 | } 342 | ], 343 | "source": [ 344 | "tlm_best = TLM(\"best\", options={\"model\": \"gpt-4o\"})\n", 345 | "tlm_best_results = tlm_best.try_prompt(data[\"problem\"].tolist())\n", 346 | "\n", 347 | "results_best_df = pd.concat([data, pd.DataFrame(tlm_best_results)], axis=1)\n", 348 | "results_best_df.to_csv(\"results/gpt-4o-best-responses.csv\", index=None)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 10, 354 | "id": "71ae8069-117b-4fa1-9ac1-34e189d54bc8", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "# filter results at threshold = 0.25\n", 359 | "filter_best_25 = filter_responses(results_best_df, 0.25)\n", 360 | "\n", 361 | "# filter results at threshold = 0.8\n", 362 | "filter_best_80 = filter_responses(results_best_df, 0.8)\n", 363 | "\n", 364 | "filter_best_25.to_csv(\"results/gpt-4o-best-25-responses.csv\", index=None)\n", 365 | "filter_best_80.to_csv(\"results/gpt-4o-best-80-responses.csv\", index=None)" 366 | ] 367 | } 368 | ], 369 | "metadata": { 370 | "kernelspec": { 371 | "display_name": "Python 3 (ipykernel)", 372 | "language": "python", 373 | "name": "python3" 374 | }, 375 | "language_info": { 376 | "codemirror_mode": { 377 | "name": "ipython", 378 | "version": 3 379 | }, 380 | "file_extension": ".py", 381 | "mimetype": "text/x-python", 382 | "name": "python", 383 | "nbconvert_exporter": "python", 384 | "pygments_lexer": "ipython3", 385 | "version": "3.10.14" 386 | } 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 5 390 | } 391 | -------------------------------------------------------------------------------- /TLM-SimpleQA-Benchmark/openai_utils/chat_completion_sampler.py: -------------------------------------------------------------------------------- 1 | """ 2 | File directly obtained (without changes) from https://github.com/openai/simple-evals/blob/main/sampler/chat_completion_sampler.py 3 | """ 4 | 5 | import base64 6 | import time 7 | from typing import Any 8 | 9 | import openai 10 | from openai import OpenAI 11 | 12 | from .types import MessageList, SamplerBase 13 | 14 | OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant." 15 | OPENAI_SYSTEM_MESSAGE_CHATGPT = ( 16 | "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture." 
17 | + "\nKnowledge cutoff: 2023-12\nCurrent date: 2024-04-01" 18 | ) 19 | 20 | 21 | class ChatCompletionSampler(SamplerBase): 22 | """ 23 | Sample from OpenAI's chat completion API 24 | """ 25 | 26 | def __init__( 27 | self, 28 | model: str = "gpt-3.5-turbo", 29 | system_message: str | None = None, 30 | temperature: float = 0.5, 31 | max_tokens: int = 1024, 32 | ): 33 | self.api_key_name = "OPENAI_API_KEY" 34 | self.client = OpenAI() 35 | # using api_key=os.environ.get("OPENAI_API_KEY") # please set your API_KEY 36 | self.model = model 37 | self.system_message = system_message 38 | self.temperature = temperature 39 | self.max_tokens = max_tokens 40 | self.image_format = "url" 41 | 42 | def _handle_image( 43 | self, image: str, encoding: str = "base64", format: str = "png", fovea: int = 768 44 | ): 45 | new_image = { 46 | "type": "image_url", 47 | "image_url": { 48 | "url": f"data:image/{format};{encoding},{image}", 49 | }, 50 | } 51 | return new_image 52 | 53 | def _handle_text(self, text: str): 54 | return {"type": "text", "text": text} 55 | 56 | def _pack_message(self, role: str, content: Any): 57 | return {"role": str(role), "content": content} 58 | 59 | def __call__(self, message_list: MessageList) -> str: 60 | if self.system_message: 61 | message_list = [self._pack_message("system", self.system_message)] + message_list 62 | trial = 0 63 | while True: 64 | try: 65 | response = self.client.chat.completions.create( 66 | model=self.model, 67 | messages=message_list, 68 | temperature=self.temperature, 69 | max_tokens=self.max_tokens, 70 | ) 71 | return response.choices[0].message.content 72 | # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU 73 | except openai.BadRequestError as e: 74 | print("Bad Request Error", e) 75 | return "" 76 | except Exception as e: 77 | exception_backoff = 2**trial # expontial back off 78 | print( 79 | f"Rate limit exception so wait and retry {trial} after {exception_backoff} sec", 80 | e, 81 | ) 82 | time.sleep(exception_backoff) 83 | trial += 1 84 | # unknown error shall throw exception 85 | -------------------------------------------------------------------------------- /TLM-SimpleQA-Benchmark/openai_utils/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | File directly obtained (without changes) from https://github.com/openai/simple-evals/blob/main/common.py 3 | """ 4 | 5 | import os 6 | from collections import defaultdict 7 | from multiprocessing.pool import ThreadPool 8 | from typing import Any 9 | 10 | import jinja2 11 | import numpy as np 12 | from tqdm import tqdm 13 | 14 | from .types import EvalResult, Message, SamplerBase, SingleEvalResult 15 | 16 | QUERY_TEMPLATE_MULTICHOICE = """ 17 | Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. 
18 | 19 | {Question} 20 | 21 | A) {A} 22 | B) {B} 23 | C) {C} 24 | D) {D} 25 | """.strip() 26 | 27 | ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([A-D])" 28 | ANSWER_PATTERN = r"(?i)Answer\s*:\s*([^\n]+)" 29 | MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( 30 | "(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" 31 | ) 32 | # All the different ways "Answer" is written in different languages 33 | MULTILINGUAL_ANSWER_REGEXES = [ 34 | "Answer\s*:", 35 | "Answer\s*:​​​​​​", # Korean invisible character 36 | "উত্তর\s*:", 37 | "उत्तर\s*:", 38 | "উত্তরঃ", 39 | "উত্তর\s*:", 40 | "Antwort\s*:", 41 | "답변\s*:", 42 | "정답\s*:", 43 | "답\s*:", 44 | "答案\s*:", 45 | "答案\s*:", 46 | "答\s*:", 47 | "答\s*:", 48 | "答复\s*:", 49 | "答曰\s*:", 50 | "الإجابة:", 51 | "الجواب:", 52 | "إجابة:", 53 | "الإجابة النهائية:", 54 | "الإجابة الصحيحة:", 55 | "الإجابة الصحيحة هي:", 56 | "الإجابة هي:", 57 | "Respuesta\s*:", 58 | "Risposta\s*:", 59 | "答え\s*:", 60 | "答え\s*:", 61 | "回答\s*:", 62 | "回答\s*:", 63 | "解答\s*:", 64 | "Jawaban\s*:", 65 | "Réponse\s*:", 66 | "Resposta\s*:", 67 | "Jibu\s*:", 68 | "Idahun\s*:", 69 | "Ìdáhùn\s*:", 70 | "Idáhùn\s*:", 71 | "Àmọ̀nà\s*:", 72 | "Àdáhùn\s*:", 73 | "Ànúgọ\s*:", 74 | "Àṣàyàn\s*:", 75 | ] 76 | 77 | 78 | EQUALITY_TEMPLATE = r""" 79 | Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications 80 | 81 | Examples: 82 | 83 | Expression 1: $2x+3$ 84 | Expression 2: $3+2x$ 85 | 86 | Yes 87 | 88 | Expression 1: 3/2 89 | Expression 2: 1.5 90 | 91 | Yes 92 | 93 | Expression 1: $x^2+2x+1$ 94 | Expression 2: $y^2+2y+1$ 95 | 96 | No 97 | 98 | Expression 1: $x^2+2x+1$ 99 | Expression 2: $(x+1)^2$ 100 | 101 | Yes 102 | 103 | Expression 1: 3245/5 104 | Expression 2: 649 105 | 106 | No 107 | (these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications) 108 | 109 | Expression 1: 2/(-3) 110 | Expression 2: -2/3 111 | 112 | Yes 113 | (trivial simplifications are allowed) 114 | 115 | Expression 1: 72 degrees 116 | Expression 2: 72 117 | 118 | Yes 119 | (give benefit of the doubt to units) 120 | 121 | Expression 1: 64 122 | Expression 2: 64 square feet 123 | 124 | Yes 125 | (give benefit of the doubt to units) 126 | 127 | --- 128 | 129 | YOUR TASK 130 | 131 | 132 | Respond with only "Yes" or "No" (without quotes). Do not include a rationale. 133 | 134 | Expression 1: %(expression1)s 135 | Expression 2: %(expression2)s 136 | """.strip() 137 | 138 | 139 | HTML_JINJA = """ 140 |
<h3>Prompt conversation</h3> 141 | {% for message in prompt_messages %} 142 | {{ message_to_html(message) | safe }} 143 | {% endfor %} 144 | <h3>Sampled message</h3> 145 | {{ message_to_html(next_message) | safe }} 146 | <h3>Results</h3> 147 | <p>Correct Answer: {{ correct_answer }}</p> 148 | <p>Extracted Answer: {{ extracted_answer }}</p> 149 | <p>Score: {{ score }}</p>
150 | """ 151 | 152 | 153 | def format_multichoice_question(row): 154 | return QUERY_TEMPLATE_MULTICHOICE.format(**row) 155 | 156 | 157 | def check_equality(sampler: SamplerBase, expr1: str, expr2: str): 158 | prompt = EQUALITY_TEMPLATE % {"expression1": expr1, "expression2": expr2} 159 | response = sampler([dict(content=prompt, role="user")]) 160 | return response.lower().strip() == "yes" 161 | 162 | 163 | def _compute_stat(values: list, stat: str): 164 | if stat == "mean": 165 | return np.mean(values) 166 | elif stat == "std": 167 | return np.std(values) 168 | elif stat == "min": 169 | return np.min(values) 170 | elif stat == "max": 171 | return np.max(values) 172 | else: 173 | raise ValueError(f"Unknown {stat =}") 174 | 175 | 176 | def aggregate_results( 177 | single_eval_results: list[SingleEvalResult], 178 | default_stats: tuple[str] = ("mean", "std"), 179 | name2stats: dict[str, tuple[str]] | None = None, 180 | ) -> EvalResult: 181 | """ 182 | Aggregate results from multiple evaluations into a single EvalResult. 183 | """ 184 | name2stats = name2stats or {} 185 | name2values = defaultdict(list) 186 | htmls = [] 187 | convos = [] 188 | for single_eval_result in single_eval_results: 189 | for name, value in single_eval_result.metrics.items(): 190 | name2values[name].append(value) 191 | if single_eval_result.score is not None: 192 | name2values["score"].append(single_eval_result.score) 193 | htmls.append(single_eval_result.html) 194 | convos.append(single_eval_result.convo) 195 | final_metrics = {} 196 | for name, values in name2values.items(): 197 | stats = name2stats.get(name, default_stats) 198 | for stat in stats: 199 | key = name if stat == "mean" else f"{name}:{stat}" 200 | final_metrics[key] = _compute_stat(values, stat) 201 | return EvalResult( 202 | score=final_metrics.pop("score", None), metrics=final_metrics, htmls=htmls, convos=convos 203 | ) 204 | 205 | 206 | def map_with_progress(f: callable, xs: list[Any], num_threads: int = 50): 207 | """ 208 | Apply f to each element of xs, using a ThreadPool, and show progress. 209 | """ 210 | if os.getenv("debug"): 211 | return list(map(f, tqdm(xs, total=len(xs)))) 212 | else: 213 | with ThreadPool(min(num_threads, len(xs))) as pool: 214 | return list(tqdm(pool.imap(f, xs), total=len(xs))) 215 | 216 | 217 | jinja_env = jinja2.Environment( 218 | loader=jinja2.BaseLoader(), 219 | undefined=jinja2.StrictUndefined, 220 | autoescape=jinja2.select_autoescape(["html", "xml"]), 221 | ) 222 | _message_template = """ 223 |
<div class="message {{ role }}"> 224 | <div class="role"> 225 | {{ role }} 226 | {% if variant %}({{ variant }}){% endif %} 227 | </div> 228 | <div class="content"> 229 | <pre>{{ content }}</pre> 230 | </div> 231 | </div>
232 | """ 233 | 234 | 235 | def message_to_html(message: Message) -> str: 236 | """ 237 | Generate HTML snippet (inside a
<div>) for a message. 238 | """ 239 | return jinja_env.from_string(_message_template).render( 240 | role=message["role"], content=message["content"], variant=message.get("variant", None) 241 | ) 242 | 243 | 244 | jinja_env.globals["message_to_html"] = message_to_html 245 | 246 | 247 | _report_template = """<!DOCTYPE html> 248 | <html> 249 | <head> 250 | <style> 282 | </head> 283 | <body> 284 | {% if metrics %} 285 | 
<h1>Metrics</h1>
286 | <table> 287 | <tr> 288 | <th>Metric</th> 289 | <th>Value</th> 290 | </tr> 291 | <tr> 292 | <td>Score</td> 293 | <td>{{ score | float | round(3) }}</td> 294 | </tr> 295 | {% for name, value in metrics.items() %} 296 | <tr> 297 | <td>{{ name }}</td> 298 | <td>{{ value }}</td> 299 | </tr> 300 | {% endfor %} 301 | </table>
302 | {% endif %} 303 |
<h1>Examples</h1>
304 | {% for html in htmls %} 305 | {{ html | safe }} 306 | <hr>
307 | {% endfor %} 308 | 309 | 310 | """ 311 | 312 | 313 | def make_report(eval_result: EvalResult) -> str: 314 | """ 315 | Create a standalone HTML report from an EvalResult. 316 | """ 317 | return jinja_env.from_string(_report_template).render( 318 | score=eval_result.score, 319 | metrics=eval_result.metrics, 320 | htmls=eval_result.htmls, 321 | ) 322 | 323 | 324 | def make_report_from_example_htmls(htmls: list[str]): 325 | """ 326 | Create a standalone HTML report from a list of example htmls 327 | """ 328 | return jinja_env.from_string(_report_template).render(score=None, metrics={}, htmls=htmls) 329 | 330 | def normalize_response(response: str) -> str: 331 | """ 332 | Normalize the response by removing markdown and LaTeX formatting that may prevent a match. 333 | """ 334 | 335 | return ( 336 | response.replace("**", "") 337 | .replace("$\\boxed{", "") 338 | .replace("}$", "") 339 | .replace("\\$", "") 340 | .replace("$\\text{", "") 341 | .replace("$", "") 342 | .replace("\\mathrm{", "") 343 | .replace("\\{", "") 344 | .replace("\\text", "") 345 | .replace("\\(", "") 346 | .replace("\\mathbf{", "") 347 | .replace("{", "") 348 | .replace("\\boxed", "") 349 | ) 350 | 351 | def normalize_extracted_answer(extracted_answer: str) -> str: 352 | return ( 353 | # In arabic these are the letters used for A-D in multiple choice questions 354 | extracted_answer.replace("أ", " A") 355 | .replace("ب", " B") 356 | .replace("ج", " C") 357 | .replace("د", " D") 358 | # In Bengali these are the letters used for A-D in multiple choice questions 359 | .replace("অ", " A") 360 | .replace("ব", " B") 361 | .replace("ড", " C") 362 | .replace("ঢ", " D") 363 | # In Japanese these are the letters sometimes used for A-D in multiple choice questions 364 | .replace("A", " A") 365 | .replace("B", " B") 366 | .replace("C", " C") 367 | .replace("D", " D") 368 | .strip() 369 | ) 370 | -------------------------------------------------------------------------------- /TLM-SimpleQA-Benchmark/openai_utils/simpleqa_constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants directly obtained from https://github.com/openai/simple-evals/blob/main/simpleqa_eval.py 3 | """ 4 | 5 | GRADER_TEMPLATE = """ 6 | Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. 7 | First, I will give examples of each grade, and then you will grade a new example. 8 | 9 | 10 | The following are examples of CORRECT predicted answers. 11 | ``` 12 | Question: What are the names of Barack Obama's children? 13 | Gold target: Malia Obama and Sasha Obama 14 | Predicted answer 1: sasha and malia obama 15 | Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check 16 | Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. 17 | ``` 18 | These predicted answers are all CORRECT because: 19 | - They fully contain the important information in the gold target. 20 | - They do not contain any information that contradicts the gold target. 21 | - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. 
22 | - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. 23 | 24 | 25 | The following are examples of INCORRECT predicted answers. 26 | ``` 27 | Question: What are the names of Barack Obama's children? 28 | Gold target: Malia and Sasha 29 | Predicted answer 1: Malia. 30 | Predicted answer 2: Malia, Sasha, and Susan. 31 | Predicted answer 3: Barack Obama does not have any children. 32 | Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. 33 | Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. 34 | Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? 35 | Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. 36 | ``` 37 | These predicted answers are all INCORRECT because: 38 | - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. 39 | 40 | 41 | The following are examples of NOT_ATTEMPTED predicted answers. 42 | ``` 43 | Question: What are the names of Barack Obama's children? 44 | Gold target: Malia and Sasha 45 | Predicted answer 1: I don't know. 46 | Predicted answer 2: I need more context about which Obama you are talking about. 47 | Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. 48 | Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. 49 | ``` 50 | These predicted answers are all NOT_ATTEMPTED because: 51 | - The important information in the gold target is not included in the answer. 52 | - No statements in the answer contradict the gold target. 53 | 54 | 55 | Also note the following things: 56 | - For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". 57 | - Predicted answers "120k", "124k", and 115k" are all CORRECT. 58 | - Predicted answers "100k" and "113k" are INCORRECT. 59 | - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. 60 | - The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. 61 | - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. 62 | - Do not punish predicted answers if they omit information that would be clearly inferred from the question. 63 | - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". 
The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". 64 | - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. 65 | - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. 66 | - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. 67 | - Do not punish for typos in people's name if it's clearly the same name. 68 | - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". 69 | 70 | 71 | Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 72 | ``` 73 | Question: {question} 74 | Gold target: {target} 75 | Predicted answer: {predicted_answer} 76 | ``` 77 | 78 | Grade the predicted answer of this new question as one of: 79 | A: CORRECT 80 | B: INCORRECT 81 | C: NOT_ATTEMPTED 82 | 83 | Just return the letters "A", "B", or "C", with no text around it. 84 | """.strip() 85 | 86 | 87 | CHOICE_LETTERS = ["A", "B", "C"] 88 | CHOICE_STRINGS = ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"] 89 | CHOICE_LETTER_TO_STRING = dict(zip(CHOICE_LETTERS, CHOICE_STRINGS)) 90 | -------------------------------------------------------------------------------- /TLM-SimpleQA-Benchmark/openai_utils/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | File directly obtained (without changes) from https://github.com/openai/simple-evals/blob/main/types.py 3 | """ 4 | 5 | from dataclasses import dataclass, field 6 | from typing import Any 7 | 8 | Message = dict[str, Any] # keys role, content 9 | MessageList = list[Message] 10 | 11 | 12 | class SamplerBase: 13 | """ 14 | Base class for defining a sampling model, which can be evaluated, 15 | or used as part of the grading process. 16 | """ 17 | 18 | def __call__(self, message_list: MessageList) -> str: 19 | raise NotImplementedError 20 | 21 | 22 | @dataclass 23 | class EvalResult: 24 | """ 25 | Result of running an evaluation (usually consisting of many samples) 26 | """ 27 | 28 | score: float | None # top-line metric 29 | metrics: dict[str, float] | None # other metrics 30 | htmls: list[str] # strings of valid HTML 31 | convos: list[MessageList] # sampled conversations 32 | 33 | 34 | @dataclass 35 | class SingleEvalResult: 36 | """ 37 | Result of evaluating a single sample 38 | """ 39 | 40 | score: float | None 41 | metrics: dict[str, float] = field(default_factory=dict) 42 | html: str | None = None 43 | convo: MessageList | None = None # sampled conversation 44 | 45 | 46 | class Eval: 47 | """ 48 | Base class for defining an evaluation. 
49 | """ 50 | 51 | def __call__(self, sampler: SamplerBase) -> EvalResult: 52 | raise NotImplementedError 53 | -------------------------------------------------------------------------------- /TLM-intro/Real_time_Eval_for_every_LLM_response_with_Cleanlab_TLM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d28JCTXqODOB" 7 | }, 8 | "source": [ 9 | "# Real-time Eval for every LLM response with Cleanlab TLM" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": { 16 | "id": "_V8qJHRTJdBW" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "#@title Install Cleanlab TLM\n", 21 | "%pip install --upgrade cleanlab-tlm" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Set your API key\n", 31 | "import os\n", 32 | "os.environ[\"CLEANLAB_TLM_API_KEY\"] = \"\" # Get your API key from: https://tlm.cleanlab.ai/" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 8, 38 | "metadata": { 39 | "colab": { 40 | "base_uri": "https://localhost:8080/" 41 | }, 42 | "id": "Cwns730hK4a-", 43 | "outputId": "467fa637-3a32-4c51-b6db-438fd9703ce5" 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "TLM model used: gpt-4o-mini \n", 51 | "\n", 52 | "TLM response: The third month of the year alphabetically is \"March.\" The months in alphabetical order are:\n", 53 | "\n", 54 | "1. April\n", 55 | "2. August\n", 56 | "3. December\n", 57 | "4. February\n", 58 | "5. January\n", 59 | "6. July\n", 60 | "7. June\n", 61 | "8. March\n", 62 | "9. May\n", 63 | "10. November\n", 64 | "11. October\n", 65 | "12. September \n", 66 | "\n", 67 | "TLM trustworthiness score: 0.4979648802626605 \n", 68 | "\n", 69 | "TLM Explanation: The answer provided states that \"March\" is the third month of the year alphabetically. However, when listing the months in alphabetical order, \"March\" is actually the eighth month. Therefore, incorrect. \n", 70 | "This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): \n", 71 | "December. \n", 72 | "\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "from cleanlab_tlm import TLM\n", 78 | "\n", 79 | "tlm = TLM(options={\"log\": [\"explanation\"], \"model\": \"gpt-4o-mini\"}) # supports GPT, Claude, etc\n", 80 | "\n", 81 | "# Use TLM like GPT (with more accurate results). 
Returns response, trustworthiness score, explanation\n", 82 | "out = tlm.prompt(\"What's the third month of the year alphabetically?\")\n", 83 | "print(\"TLM model used: \", tlm.get_model_name(), \"\\n\")\n", 84 | "print(\"TLM response: \", out['response'], \"\\n\")\n", 85 | "print(\"TLM trustworthiness score: \", out['trustworthiness_score'], \"\\n\")\n", 86 | "print(\"TLM Explanation: \", out['log']['explanation'], \"\\n\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "id": "uwCJ4aDNLcr8" 93 | }, 94 | "source": [ 95 | "## You can also use TLM to score the trustworthiness of any response to a given prompt.\n", 96 | "Use `tlm.get_trustworthiness_score` which returns a numerical value between 0-1.\n", 97 | " - Enables you to use TLM with responses from your own custom LLM or LLM in production.\n" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 9, 103 | "metadata": { 104 | "colab": { 105 | "base_uri": "https://localhost:8080/" 106 | }, 107 | "id": "s1Nx0LxGLbGw", 108 | "outputId": "733be2a4-4550-4bbd-a9aa-010e8a2e5443" 109 | }, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "Trustworthiness Score: 0.9997738711822411\n", 116 | "Explanation: Did not find a reason to doubt trustworthiness.\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "# TLM returns a high score when the LLM/RAG/Agent is accurate\n", 122 | "response = tlm.get_trustworthiness_score(\"What's the first month of the year?\", response=\"January\")\n", 123 | "print(\"Trustworthiness Score: \", response[\"trustworthiness_score\"])\n", 124 | "print(\"Explanation: \", response[\"log\"][\"explanation\"])" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 10, 130 | "metadata": { 131 | "colab": { 132 | "base_uri": "https://localhost:8080/" 133 | }, 134 | "id": "ZTgsT_j8LZqk", 135 | "outputId": "c07401dc-b01d-4d90-ad07-d07e1f3b4084" 136 | }, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "Trustworthiness Score: 0.04739682241488771\n", 143 | "Explanation: The first month of the year is January, not February. Therefore, factually incorrect. \n", 144 | "This response is untrustworthy due to lack of consistency in possible responses from the model. 
Here's one inconsistent alternate response that the model considered (which may not be accurate either): \n", 145 | "January.\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "# TLM returns a low score when the LLM/RAG/Agent is untrustworthy\n", 151 | "response = tlm.get_trustworthiness_score(\"What's the first month of the year?\", response=\"February\")\n", 152 | "print(\"Trustworthiness Score: \", response[\"trustworthiness_score\"])\n", 153 | "print(\"Explanation: \", response[\"log\"][\"explanation\"])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "xorjnv3kOj8D" 160 | }, 161 | "source": [ 162 | "\n", 163 | "\n", 164 | "# How to interpret the TLM trustworthiness score:\n", 165 | " - `1.0 >= trustworthiness > 0.9` -- highly reliable response you can fully trust\n", 166 | " - `0.9 >= trustworthiness > 0.7` -- sometimes a bad retrieval (if RAG), hallucination, or wrong response\n", 167 | " - `0.7 >= trustworthiness > 0.3` -- likely a bad retrieval (if RAG), hallucination, or wrong response\n", 168 | " - `0.3 >= trustworthiness >= 0.0` -- near-always a bad retrieval (if RAG), hallucination, or wrong response" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "id": "B6d_W5g-OnDH" 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "# Run your own real-time evaluation of LLM outputs with TLM here:\n", 180 | "tlm.prompt(\"ENTER_YOUR_OWN_PROMPT_HERE\")" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "colab": { 186 | "provenance": [], 187 | "toc_visible": true 188 | }, 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.12.0" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 0 208 | } 209 | -------------------------------------------------------------------------------- /TLM-o1-benchmark/README.md: 1 | # TLM o1-preview Benchmark 2 | 3 | This folder contains the dataset and the code to reproduce the TLM-o1 benchmark we published in our [blog post](https://cleanlab.ai/blog/tlm-o1/). 4 | 5 | For SVAMP and TriviaQA, we specifically selected challenging examples that OpenAI’s GPT-4o model got wrong, as OpenAI’s o1-preview API is still slow and costly to benchmark across larger datasets. For our TriviaQA benchmark, we used 114 examples from the validation set which GPT-4o answered wrong, and we were able to manually confirm the answer listed as ground truth is actually correct. For our SVAMP benchmark, we used 49 examples which GPT-4o answered wrong, and we were able to manually confirm the answer listed as ground truth is actually correct. For our PII Detection benchmark, we specifically focused on identifying first names present in the text, considering a dataset of 98 examples. 6 | 7 | API keys: 8 | - A `CLEANLAB_TLM_API_KEY` is required to run this project. Get a Cleanlab API key at https://tlm.cleanlab.ai 9 | - An `OPENAI_API_KEY` is also required to run this project. Get an OpenAI key at https://platform.openai.com/api-keys 10 | 11 | To reproduce the benchmarks: 12 | - Use the `openai_o1_preview_benchmark_reproduce.ipynb` file to reproduce the OpenAI o1 benchmark. A quick sanity check of its saved responses is sketched below. 
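For a quick sanity check of the saved response files before re-running anything, a minimal sketch along these lines can compare a response CSV against the dataset's ground truth. This is an illustrative exact-match heuristic, not the grading procedure used for the blog post, and it assumes the shipped CSV uses the same `response_openai_o1_preview` column that the reproduction notebook writes:

```python
import pandas as pd

dataset = pd.read_csv("SVAMP-49-benchmark/svamp_49_dataset.csv")
responses = pd.read_csv("SVAMP-49-benchmark/svamp_49_openai_o1_preview_responses.csv")

# Naive check: count a response as a hit only if the ground-truth number
# appears verbatim somewhere in the model's answer text.
hits = [
    str(gt) in str(resp)
    for gt, resp in zip(dataset["ground_truth"], responses["response_openai_o1_preview"])
]
print(f"Exact-match hit rate: {sum(hits) / len(hits):.1%}")
```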
13 | - Use the `tlm_o1_preview_benchmark_reproduce.ipynb` file to reproduce the TLM o1-preview benchmark. 14 | -------------------------------------------------------------------------------- /TLM-o1-benchmark/SVAMP-49-benchmark/svamp_49_dataset.csv: -------------------------------------------------------------------------------- 1 | prompt,ground_truth 2 | "Matthew gave equal numbers of crackers and cakes to his 28 friends. If he had 13 crackers and 15 cakes initially How many crackers and cakes did each person eat? 3 | Therefore, the answer (arabic numerals) is: ",1 4 | "Jack received 6 emails in the morning, 3 emails in the afternoon and some more in the evening. If he received a total of 10 emails in the day How many emails did Jack receive in the afternoon? 5 | Therefore, the answer (arabic numerals) is: ",3 6 | "After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 703 visitors came to the Buckingham palace on the previous day. If there were 246 visitors on that day How many visited the Buckingham palace within 25 days? 7 | Therefore, the answer (arabic numerals) is: ",949 8 | "Mary is baking a cake. The recipe calls for 11 cups of flour and 7 cups of sugar. She already put in some cups of flour. If she still needs 2 more cups of flour than sugar How many cups of flour did she put in? 9 | Therefore, the answer (arabic numerals) is: ",2 10 | "Last week Fred had 111 dollars and Jason had 40 dollars. Fred washed cars over the weekend and now has 115 dollars. Jason delivered newspapers and now has 44 dollars. How much money did they earn by washing cars? 11 | Therefore, the answer (arabic numerals) is: ",4 12 | "The school is planning a field trip. The school has 87 classrooms. There are 58 students in the school with each classroom having the same number of students. If there are 2 seats on each school bus. How many buses are needed to take the trip? 13 | Therefore, the answer (arabic numerals) is: ",29 14 | "Jake has 3 fewer peaches than Steven who has 8 more peaches than Jill. Steven has 5 peaches. How many more peaches does Jake have than Jill? 15 | Therefore, the answer (arabic numerals) is: ",5 16 | "Matthew had 14 crackers and 21 cakes. If Matthew gave equal numbers of crackers and cakes to his 7 friends How many crackers and cakes did each person eat? 17 | Therefore, the answer (arabic numerals) is: ",5 18 | "Jerry had 8 action figures on a shelf in his room. Later he added 4 more action figures to the shelf and removed 5 old ones. How many action figures were on his shelf in all? 19 | Therefore, the answer (arabic numerals) is: ",7 20 | "Josh had 22 marbles in his collection. He found 13 marbles ones while he lost 5 marbles. How many marbles does he have now? 21 | Therefore, the answer (arabic numerals) is: ",30 22 | "After eating a hearty meal they went to see the Buckingham palace. There were 71 paintings in the Buckingham palace. There, Rachel learned that 557 visitors came to the Buckingham palace that day. If there were 188 visitors the previous day How many visited the Buckingham palace within 57 days? 23 | Therefore, the answer (arabic numerals) is: ",745 24 | "Jake has 12 fewer peaches than Steven who has 11 more peaches than Jill. Steven has 4 peaches. How many more peaches does Jill have than Jake? 25 | Therefore, the answer (arabic numerals) is: ",1 26 | "Frank was reading through his favorite book. The book had 2 chapters each with 405 pages. It took frank 664 days to finish the book. How many chapters did he read per day? 
27 | Therefore, the answer (arabic numerals) is: ",332 28 | "Marco and his dad went strawberry picking. Together they collected strawberries that weighed 22 pounds. On the way back Marco ' dad found 30 more pounds of strawberries. Marco's strawberries now weighed 36 pounds. How much did his dad's strawberries weigh now? 29 | Therefore, the answer (arabic numerals) is: ",16 30 | "Marco and his dad went strawberry picking. Together they collected strawberries that weighed 24 pounds. On the way back Marco lost 9 pounds of strawberries. Marco's strawberries now weighed 3 pounds. How much did his dad's strawberries weigh? 31 | Therefore, the answer (arabic numerals) is: ",12 32 | "Ed had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose? 33 | Therefore, the answer (arabic numerals) is: ",17 34 | "There are 10 peaches distributed equally in some baskets. If each basket has 4 red peaches and 6 green peaches How many baskets of peaches are there? 35 | Therefore, the answer (arabic numerals) is: ",1 36 | "Lewis earns $ 491 every week during the 1181 weeks of harvest. If he has to pay $ 216 rent every week How much money will have at the end of the harvest season? 37 | Therefore, the answer (arabic numerals) is: ",324775 38 | "Dan has $ 4. He bought 99 candy bar for $ 3 each one costing the same amount of money. How much money is left? 39 | Therefore, the answer (arabic numerals) is: ",1 40 | "After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 705 visitors came to the Buckingham palace that day. If there were 191 visitors the previous day and 457 visitors the day before that How many more visitors visited the Buckingham palace on that day than on the previous 245 days? 41 | Therefore, the answer (arabic numerals) is: ",57 42 | "Paul got a box of 521 crayons and 66 erasers for his birthday. At the end of the school year he only had 154 left while not having lost a single erasers. How many crayons had been lost or given away? 43 | Therefore, the answer (arabic numerals) is: ",367 44 | "Zachary did 53 push-ups and 14 crunches in gym class today. David did 17 more push-ups but 10 less crunches than zachary. How many push-ups and crunches did Zachary do? 45 | Therefore, the answer (arabic numerals) is: ",67 46 | "Dan has $ 4. For a total of $ 3 he bought 10 candy bar each one costing the same amount of money. How much money is left? 47 | Therefore, the answer (arabic numerals) is: ",1 48 | "You had 14 bags with equal number of cookies. If you had 28 cookies and 86 candies in total How many bags of cookies do you have? 49 | Therefore, the answer (arabic numerals) is: ",2 50 | "Baker made 62 cakes. Then he made 149 more cakes. He sold 144 of them. How many cakes would baker still have? 51 | Therefore, the answer (arabic numerals) is: ",67 52 | "Lewis earns $ 403 every week during the 233 weeks of harvest. If he has to pay $ 49 rent every week How much money does he earn during harvest season? 53 | Therefore, the answer (arabic numerals) is: ",93899 54 | "He then went to see the oranges being harvested. He found out that they harvest 76 sacks per day and discard 64 of them. How many oranges do they harvest per day if each sack contains 50 oranges? 55 | Therefore, the answer (arabic numerals) is: ",600 56 | "Bobby ate 28 pieces of candy. Then he ate 42 more. He also ate 63 pieces of chocolate. How many pieces of candy did Bobby eat? 
57 | Therefore, the answer (arabic numerals) is: ",70 58 | "Every day Ryan spends 4 hours on learning english and 6 hours on learning chinese. If he learns for 86 days How many hours does he spend on learning english and chinese each day? 59 | Therefore, the answer (arabic numerals) is: ",10 60 | "Next on his checklist is wax to stick the feathers together. If he has 557 g of wax and right now he just needs 17 g Total how many grams of wax do the feathers require? 61 | Therefore, the answer (arabic numerals) is: ",574 62 | "After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 92 visitors came to the Buckingham palace that day. If there were 419 visitors the previous day and 103 visitors the day before that How many visited the Buckingham palace before that day? 63 | Therefore, the answer (arabic numerals) is: ",522 64 | "Being his favorite, he saved checking on the grapevines for his last stop. He was told by 36 of the pickers that they fill 8 drums of grapes per day. How many days will it take to fill 240 drums of grapes? 65 | Therefore, the answer (arabic numerals) is: ",30 66 | "Frank was reading through his favorite book. The book had 193 pages equally distributed over 15 chapters. It took Frank 660 days to finish the book. How many chapters did he read per day? 67 | Therefore, the answer (arabic numerals) is: ",44 68 | "3 birds were sitting on the fence. 2 more birds and 6 more storks came to join them. How many more storks than birds are sitting on the fence? 69 | Therefore, the answer (arabic numerals) is: ",1 70 | "Paul had 21 books. After selling some in a garage sale he bought 42 new ones. If he has 15 books now How many more books did he sell than he bought? 71 | Therefore, the answer (arabic numerals) is: ",6 72 | "Together Adam and Jackie have 12 apples. He has 9 apples more than Adam and Jackie together do. Adam has 8 more apples than Jackie. How many apples does He have? 73 | Therefore, the answer (arabic numerals) is: ",21 74 | "Adam has 4 more apples than Jackie. Together Adam and Jackie have 14 apples. Bob has 6 apples more than Adam and Jackie together do. How many apples does Bob have? 75 | Therefore, the answer (arabic numerals) is: ",20 76 | "Lewis earns $ 368 every week during the 1359 weeks of harvest. If he has to pay $ 388 rent every week How much money does he pay as rent during the harvest season? 77 | Therefore, the answer (arabic numerals) is: ",527292 78 | "There were 21 roses in the vase. Jessica threw away 34 roses from the vase and cut some more new roses from her flower garden to put in the vase. There are now 15 roses in the vase. How many more roses did she throw away than those she cut from her garden? 79 | Therefore, the answer (arabic numerals) is: ",6 80 | "Faye was placing her pencils and crayons into 16 rows with 6 crayons and 21 pencils in each row. How many crayons does she have? 81 | Therefore, the answer (arabic numerals) is: ",96 82 | "A farmer had 177 tomatoes and 12 potatoes in his garden. If he picked 53 tomatoes How many tomatoes and potatoes does he have left? 83 | Therefore, the answer (arabic numerals) is: ",136 84 | "Debby bought some water bottles when they were on sale. She drank 109 bottles a day. If the bottles lasted for 74 days How many bottles had she bought? 85 | Therefore, the answer (arabic numerals) is: ",8066 86 | "They decided to hold the party in their backyard. They have 10 sets of tables and each set has 6 chairs. 
If there are 11 people sitting on chairs How many chairs are left unoccupied? 87 | Therefore, the answer (arabic numerals) is: ",49 88 | "After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 512 visitors came to the Buckingham palace that day. If there were 471 visitors the previous day and 808 visitors the day before that How many visited the Buckingham palace within the past 89 days? 89 | Therefore, the answer (arabic numerals) is: ",1791 90 | "After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 132 visitors came to the Buckingham palace that day. If 406 people visited the Buckingham palace within the past 327 days How many visitors visited the Buckingham palace on the previous day? 91 | Therefore, the answer (arabic numerals) is: ",274 92 | "After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 45 visitors came to the Buckingham palace on the previous day. If 829 people visited the Buckingham palace within the past 85 days How many visitors visited the Buckingham palace on that day? 93 | Therefore, the answer (arabic numerals) is: ",784 94 | "Baker made 81 cakes. Then he made 92 more cakes. He sold 46 of them. How many more cakes did baker make than those he sold? 95 | Therefore, the answer (arabic numerals) is: ",127 96 | "Lewis earns $ 28 every week during the 1091 weeks of harvest. He also earns $ 939 per week for working overtime. If he works overtime every week How much money does he earn during harvest season? 97 | Therefore, the answer (arabic numerals) is: ",1054997 98 | "Paige was helping her mom plant flowers and together they planted 36 seeds. They put 12 seeds in each flower bed and only 58 seeds grew into flowers in each flower bed. How many flower beds did they have? 99 | Therefore, the answer (arabic numerals) is: ",3 100 | -------------------------------------------------------------------------------- /TLM-o1-benchmark/openai_o1_preview_benchmark_reproduce.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | "
promptground_truth
0Matthew gave equal numbers of crackers and cak...1
1Jack received 6 emails in the morning, 3 email...3
2After eating a hearty meal they went to see th...949
3Mary is baking a cake. The recipe calls for 11...2
4Last week Fred had 111 dollars and Jason had 4...4
\n", 61 | "
" 62 | ], 63 | "text/plain": [ 64 | " prompt ground_truth\n", 65 | "0 Matthew gave equal numbers of crackers and cak... 1\n", 66 | "1 Jack received 6 emails in the morning, 3 email... 3\n", 67 | "2 After eating a hearty meal they went to see th... 949\n", 68 | "3 Mary is baking a cake. The recipe calls for 11... 2\n", 69 | "4 Last week Fred had 111 dollars and Jason had 4... 4" 70 | ] 71 | }, 72 | "execution_count": 14, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "import pandas as pd\n", 79 | "\n", 80 | "# Here we use the SVAMP-49 dataset as an example. But you can use any other dataset such as PII-98 or TriviaQA-114.\n", 81 | "svamp_49_dataset = pd.read_csv(\"SVAMP-49-benchmark/svamp_49_dataset.csv\")\n", 82 | "svamp_49_dataset.head()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 15, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "Prompt: Matthew gave equal numbers of crackers and cakes to his 28 friends. If he had 13 crackers and 15 cakes initially How many crackers and cakes did each person eat?\n", 95 | "Therefore, the answer (arabic numerals) is: \n", 96 | "Response: To solve this problem, let's determine how many crackers and cakes each friend received.\n", 97 | "\n", 98 | "Let \\( x \\) be the number of crackers (and cakes, since they are equal) each friend received.\n", 99 | "\n", 100 | "Given:\n", 101 | "- Total number of friends: 28\n", 102 | "- Total crackers: 13\n", 103 | "- Total cakes: 15\n", 104 | "\n", 105 | "Since Matthew gave equal numbers of crackers and cakes to his friends:\n", 106 | "\n", 107 | "1. Each friend gets \\( x \\) crackers.\n", 108 | "2. Total crackers distributed: \\( 28x = 13 \\)\n", 109 | "3. Solve for \\( x \\):\n", 110 | " \\[ x = \\frac{13}{28} \\]\n", 111 | "\n", 112 | "Similarly, for cakes:\n", 113 | "\n", 114 | "1. Each friend gets \\( x \\) cakes.\n", 115 | "2. Total cakes distributed: \\( 28x = 15 \\)\n", 116 | "3. Solve for \\( x \\):\n", 117 | " \\[ x = \\frac{15}{28} \\]\n", 118 | "\n", 119 | "However, since \\( x \\) must be the same for both crackers and cakes, but \\(\\frac{13}{28} \\ne \\frac{15}{28}\\), this suggests there might be an error.\n", 120 | "\n", 121 | "But the problem states that Matthew gave equal numbers of crackers and cakes to **each friend**, not that the total number of crackers equals the total number of cakes.\n", 122 | "\n", 123 | "Therefore, each friend received:\n", 124 | "\n", 125 | "- Crackers per friend: \\( \\frac{13}{28} \\)\n", 126 | "- Cakes per friend: \\( \\frac{15}{28} \\)\n", 127 | "\n", 128 | "So,\n", 129 | "\n", 130 | "**Therefore, the answer (arabic numerals) is:** \n", 131 | "Crackers per person: **13⁄28** \n", 132 | "Cakes per person: **15⁄28**\n", 133 | "Ground Truth: 1\n", 134 | "---\n", 135 | "Prompt: Jack received 6 emails in the morning, 3 emails in the afternoon and some more in the evening. 
If he received a total of 10 emails in the day How many emails did Jack receive in the afternoon?\n", 136 | "Therefore, the answer (arabic numerals) is: \n", 137 | "Response: To solve this problem, we need to determine how many emails Jack received in the **afternoon**.\n", 138 | "\n", 139 | "**Given:**\n", 140 | "- Morning emails: **6**\n", 141 | "- Afternoon emails: **3**\n", 142 | "- Evening emails: Let's denote this as **E**\n", 143 | "- Total emails for the day: **10**\n", 144 | "\n", 145 | "**Setting up the equation:**\n", 146 | "\n", 147 | "\\[\n", 148 | "\\text{Morning emails} + \\text{Afternoon emails} + \\text{Evening emails} = \\text{Total emails}\n", 149 | "\\]\n", 150 | "\n", 151 | "Substitute the known values:\n", 152 | "\n", 153 | "\\[\n", 154 | "6 + 3 + E = 10\n", 155 | "\\]\n", 156 | "\n", 157 | "Now, solve for **E** (Evening emails):\n", 158 | "\n", 159 | "\\[\n", 160 | "6 + 3 + E = 10 \\\\\n", 161 | "9 + E = 10 \\\\\n", 162 | "E = 10 - 9 \\\\\n", 163 | "E = 1\n", 164 | "\\]\n", 165 | "\n", 166 | "So, Jack received **1 email in the evening**.\n", 167 | "\n", 168 | "However, the question asks: **\"How many emails did Jack receive in the afternoon?\"**\n", 169 | "\n", 170 | "Since we were already told that Jack received **3 emails in the afternoon**, the answer is:\n", 171 | "\n", 172 | "**Answer: 3**\n", 173 | "Ground Truth: 3\n", 174 | "---\n", 175 | "Prompt: After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 703 visitors came to the Buckingham palace on the previous day. If there were 246 visitors on that day How many visited the Buckingham palace within 25 days?\n", 176 | "Therefore, the answer (arabic numerals) is: \n", 177 | "Response: 1080\n", 178 | "Ground Truth: 949\n", 179 | "---\n", 180 | "Prompt: Mary is baking a cake. The recipe calls for 11 cups of flour and 7 cups of sugar. She already put in some cups of flour. 
If she still needs 2 more cups of flour than sugar How many cups of flour did she put in?\n", 181 | "Therefore, the answer (arabic numerals) is: \n", 182 | "Response: To solve this problem, we need to determine how many cups of flour Mary has already added to the cake batter.\n", 183 | "\n", 184 | "**Given:**\n", 185 | "- Total flour required: 11 cups\n", 186 | "- Total sugar required: 7 cups\n", 187 | "- Mary still needs 2 more cups of flour than sugar.\n", 188 | "\n", 189 | "Let's let \\( x \\) be the amount of flour Mary has already added.\n", 190 | "\n", 191 | "**Step 1:** Calculate the remaining flour needed.\n", 192 | "- Remaining flour to be added: \\( 11 - x \\) cups\n", 193 | "\n", 194 | "**Step 2:** Determine the amount of sugar Mary still needs to add.\n", 195 | "- Since the recipe requires 7 cups of sugar and we are not told that any sugar has been added yet, we can assume she still needs all 7 cups.\n", 196 | "- Remaining sugar to be added: \\( 7 \\) cups\n", 197 | "\n", 198 | "**Step 3:** Set up an equation based on the statement \"She still needs 2 more cups of flour than sugar.\"\n", 199 | "- \\( \\text{Remaining flour} = \\text{Remaining sugar} + 2 \\)\n", 200 | "- Substitute the known values:\n", 201 | " \\( 11 - x = 7 + 2 \\)\n", 202 | "\n", 203 | "**Step 4:** Solve for \\( x \\).\n", 204 | "- \\( 11 - x = 9 \\)\n", 205 | "- \\( -x = 9 - 11 \\)\n", 206 | "- \\( -x = -2 \\)\n", 207 | "- \\( x = 2 \\)\n", 208 | "\n", 209 | "**Answer:** Mary has already put in **2 cups** of flour.\n", 210 | "\n", 211 | "2\n", 212 | "Ground Truth: 2\n", 213 | "---\n", 214 | "Prompt: Last week Fred had 111 dollars and Jason had 40 dollars. Fred washed cars over the weekend and now has 115 dollars. Jason delivered newspapers and now has 44 dollars. How much money did they earn by washing cars?\n", 215 | "Therefore, the answer (arabic numerals) is: \n", 216 | "Response: 4\n", 217 | "Ground Truth: 4\n", 218 | "---\n" 219 | ] 220 | }, 221 | { 222 | "data": { 223 | "text/html": [ 224 | "
\n", 225 | "\n", 238 | "\n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
promptground_truthresponse_openai_o1_preview
0Matthew gave equal numbers of crackers and cak...1To solve this problem, let's determine how man...
1Jack received 6 emails in the morning, 3 email...3To solve this problem, we need to determine ho...
2After eating a hearty meal they went to see th...9491080
3Mary is baking a cake. The recipe calls for 11...2To solve this problem, we need to determine ho...
4Last week Fred had 111 dollars and Jason had 4...44
\n", 280 | "
" 281 | ], 282 | "text/plain": [ 283 | " prompt ground_truth \\\n", 284 | "0 Matthew gave equal numbers of crackers and cak... 1 \n", 285 | "1 Jack received 6 emails in the morning, 3 email... 3 \n", 286 | "2 After eating a hearty meal they went to see th... 949 \n", 287 | "3 Mary is baking a cake. The recipe calls for 11... 2 \n", 288 | "4 Last week Fred had 111 dollars and Jason had 4... 4 \n", 289 | "\n", 290 | " response_openai_o1_preview \n", 291 | "0 To solve this problem, let's determine how man... \n", 292 | "1 To solve this problem, we need to determine ho... \n", 293 | "2 1080 \n", 294 | "3 To solve this problem, we need to determine ho... \n", 295 | "4 4 " 296 | ] 297 | }, 298 | "execution_count": 15, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "from openai import OpenAI\n", 305 | "\n", 306 | "open_ai_key = \"\"\n", 307 | "client = OpenAI(api_key=open_ai_key)\n", 308 | "\n", 309 | "responses = []\n", 310 | "# Only use the first 5 rows for illustration. Feel free to use the entire dataset.\n", 311 | "for _, row in svamp_49_dataset.head().iterrows():\n", 312 | " prompt = row['prompt']\n", 313 | " response = client.chat.completions.create(\n", 314 | " messages=[{\"role\": \"user\", \"content\": prompt}],\n", 315 | " model=\"o1-preview\",\n", 316 | " )\n", 317 | " response_text = response.choices[0].message.content\n", 318 | " responses.append(response_text)\n", 319 | " print(f\"Prompt: {prompt}\")\n", 320 | " print(f\"Response: {response_text}\")\n", 321 | " print(f\"Ground Truth: {row['ground_truth']}\")\n", 322 | " print(\"---\")\n", 323 | "\n", 324 | "# Create a new DataFrame with the responses\n", 325 | "results_df = pd.DataFrame({\n", 326 | " 'prompt': svamp_49_dataset.head()['prompt'],\n", 327 | " 'ground_truth': svamp_49_dataset.head()['ground_truth'],\n", 328 | " 'response_openai_o1_preview': responses\n", 329 | "})\n", 330 | "\n", 331 | "results_df.to_csv(\"SVAMP-49-benchmark/svamp_49_openai_o1_preview_responses_new.csv\", index=False)\n", 332 | "\n", 333 | "# Display the results\n", 334 | "results_df" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "# Try individual examples" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 13, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "\"To solve this problem, let's carefully extract the information provided:\\n\\n1. **Zachary's Performance**:\\n - **Push-ups**: 53\\n - **Crunches**: 14\\n\\n2. **David's Performance Relative to Zachary**:\\n - David did **17 more push-ups** than Zachary.\\n - David did **10 less crunches** than Zachary.\\n\\nNow, calculate David's actual performance:\\n\\n- **David's Push-ups**:\\n - \\\\(53 \\\\text{ (Zachary's push-ups)} + 17 = 70\\\\)\\n\\n- **David's Crunches**:\\n - \\\\(14 \\\\text{ (Zachary's crunches)} - 10 = 4\\\\)\\n\\n**Answer:** \\\\(70\\\\) \\\\(4\\\\)\"\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "from openai import OpenAI\n", 359 | "\n", 360 | "open_ai_key = \"\"\n", 361 | "client = OpenAI(api_key=open_ai_key)\n", 362 | "\n", 363 | "prompt = \"\"\"Zachary did 53 push-ups and 14 crunches in gym class today. David did 17 more push-ups but 10 less crunches than zachary. 
How many push-ups and crunches did Zachary do?\n", 364 | "Therefore, the answer (arabic numerals) is: \n", 365 | "\"\"\"\n", 366 | "response = client.chat.completions.create(\n", 367 | " messages=[{\"role\": \"user\", \"content\": prompt}],\n", 368 | " model=\"o1-preview\",\n", 369 | ")\n", 370 | "\n", 371 | "import json\n", 372 | "print(json.dumps(response.choices[0].message.content, indent=4))" 373 | ] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "Python 3", 379 | "language": "python", 380 | "name": "python3" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.10.15" 393 | } 394 | }, 395 | "nbformat": 4, 396 | "nbformat_minor": 2 397 | } 398 | -------------------------------------------------------------------------------- /TLM-o1-benchmark/tlm_o1_preview_benchmark_reproduce.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%pip install --upgrade cleanlab-tlm" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Set your API key\n", 19 | "import os\n", 20 | "os.environ[\"CLEANLAB_TLM_API_KEY\"] = \"\" # Get your API key from: https://tlm.cleanlab.ai/" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 8, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | "
promptground_truth
0Matthew gave equal numbers of crackers and cak...1
1Jack received 6 emails in the morning, 3 email...3
2After eating a hearty meal they went to see th...949
3Mary is baking a cake. The recipe calls for 11...2
4Last week Fred had 111 dollars and Jason had 4...4
\n", 81 | "
" 82 | ], 83 | "text/plain": [ 84 | " prompt ground_truth\n", 85 | "0 Matthew gave equal numbers of crackers and cak... 1\n", 86 | "1 Jack received 6 emails in the morning, 3 email... 3\n", 87 | "2 After eating a hearty meal they went to see th... 949\n", 88 | "3 Mary is baking a cake. The recipe calls for 11... 2\n", 89 | "4 Last week Fred had 111 dollars and Jason had 4... 4" 90 | ] 91 | }, 92 | "execution_count": 8, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "import pandas as pd\n", 99 | "\n", 100 | "# Here we use the SVAMP-49 dataset as an example. But you can use any other dataset such as PII-98 or TriviaQA-114.\n", 101 | "svamp_49_dataset = pd.read_csv(\"SVAMP-49-benchmark/svamp_49_dataset.csv\")\n", 102 | "svamp_49_dataset.head()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 9, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "Prompt: Matthew gave equal numbers of crackers and cakes to his 28 friends. If he had 13 crackers and 15 cakes initially How many crackers and cakes did each person eat?\n", 115 | "Therefore, the answer (arabic numerals) is: \n", 116 | "Response: Each friend ate \\(\\frac{13}{28}\\) crackers and \\(\\frac{15}{28}\\) cakes.\n", 117 | "\n", 118 | "So, the answer is: 13⁄28 crackers and 15⁄28 cakes.\n", 119 | "\n", 120 | "**Answer:** 13⁄28 crackers and 15⁄28 cakes.\n", 121 | "Trustworthiness Score: 0.9509885426396171\n", 122 | "Ground Truth: 1\n", 123 | "---\n", 124 | "Prompt: Jack received 6 emails in the morning, 3 emails in the afternoon and some more in the evening. If he received a total of 10 emails in the day How many emails did Jack receive in the afternoon?\n", 125 | "Therefore, the answer (arabic numerals) is: \n", 126 | "Response: To determine how many emails Jack received in the afternoon:\n", 127 | "\n", 128 | "1. **Total emails received in the day**: 10\n", 129 | "2. **Emails received in the morning**: 6\n", 130 | "3. **Emails received in the afternoon**: 3\n", 131 | "4. **Emails received in the evening**: Let's denote this as \\( x \\)\n", 132 | "\n", 133 | "We can set up the equation:\n", 134 | "\n", 135 | "\\[\n", 136 | "\\text{Morning emails} + \\text{Afternoon emails} + \\text{Evening emails} = \\text{Total emails}\n", 137 | "\\]\n", 138 | "\n", 139 | "Plugging in the known values:\n", 140 | "\n", 141 | "\\[\n", 142 | "6 + 3 + x = 10\n", 143 | "\\]\n", 144 | "\n", 145 | "Solving for \\( x \\):\n", 146 | "\n", 147 | "\\[\n", 148 | "9 + x = 10 \\\\\n", 149 | "x = 10 - 9 \\\\\n", 150 | "x = 1\n", 151 | "\\]\n", 152 | "\n", 153 | "So, Jack received **1 email in the evening**.\n", 154 | "\n", 155 | "However, the question asks: **\"How many emails did Jack receive in the afternoon?\"**\n", 156 | "\n", 157 | "Based on the information provided:\n", 158 | "\n", 159 | "**Answer:** **3**\n", 160 | "Trustworthiness Score: 0.826780596200133\n", 161 | "Ground Truth: 3\n", 162 | "---\n", 163 | "Prompt: After eating a hearty meal they went to see the Buckingham palace. There, Rachel learned that 703 visitors came to the Buckingham palace on the previous day. 
If there were 246 visitors on that day How many visited the Buckingham palace within 25 days?\n", 164 | "Therefore, the answer (arabic numerals) is: \n", 165 | "Response: To determine the total number of visitors to Buckingham Palace within 25 days, we'll calculate the average daily visitors based on the two given days and then multiply by 25.\n", 166 | "\n", 167 | "**Given:**\n", 168 | "- Visitors on the previous day: 703\n", 169 | "- Visitors on the day Rachel visited: 246\n", 170 | "\n", 171 | "**Step 1: Calculate the average number of visitors per day over the two days.**\n", 172 | "\n", 173 | "\\[\n", 174 | "\\text{Average visitors per day} = \\frac{703 + 246}{2} = \\frac{949}{2} = 474.5\n", 175 | "\\]\n", 176 | "\n", 177 | "**Step 2: Multiply the average daily visitors by 25 to find the total visitors over 25 days.**\n", 178 | "\n", 179 | "\\[\n", 180 | "\\text{Total visitors in 25 days} = 474.5 \\times 25 = 11,862.5\n", 181 | "\\]\n", 182 | "\n", 183 | "Since we can't have half a visitor, we'll round to the nearest whole number.\n", 184 | "\n", 185 | "**Answer:**\n", 186 | "\n", 187 | "11863\n", 188 | "Trustworthiness Score: 0.8476286883275025\n", 189 | "Ground Truth: 949\n", 190 | "---\n", 191 | "Prompt: Mary is baking a cake. The recipe calls for 11 cups of flour and 7 cups of sugar. She already put in some cups of flour. If she still needs 2 more cups of flour than sugar How many cups of flour did she put in?\n", 192 | "Therefore, the answer (arabic numerals) is: \n", 193 | "Response: To find out how many cups of flour Mary has already put in, let's analyze the information:\n", 194 | "\n", 195 | "- The recipe calls for **11 cups of flour** and **7 cups of sugar**.\n", 196 | "- She has already added some cups of flour.\n", 197 | "- She still needs **2 more cups of flour than sugar**.\n", 198 | "\n", 199 | "Assuming Mary hasn't added any sugar yet, the amount of sugar she still needs is **7 cups**.\n", 200 | "\n", 201 | "Let \\( F \\) be the amount of flour she has already added. Then, the amount of flour she still needs is \\( 11 - F \\) cups.\n", 202 | "\n", 203 | "According to the problem, the amount of flour she still needs is **2 more cups than the sugar**:\n", 204 | "\n", 205 | "\\[\n", 206 | "11 - F = 7 + 2\n", 207 | "\\]\n", 208 | "\n", 209 | "Simplify:\n", 210 | "\n", 211 | "\\[\n", 212 | "11 - F = 9\n", 213 | "\\]\n", 214 | "\n", 215 | "Subtract 11 from both sides:\n", 216 | "\n", 217 | "\\[\n", 218 | "- F = -2\n", 219 | "\\]\n", 220 | "\n", 221 | "Multiply both sides by -1:\n", 222 | "\n", 223 | "\\[\n", 224 | "F = 2\n", 225 | "\\]\n", 226 | "\n", 227 | "So, Mary has already put in **2 cups of flour**.\n", 228 | "\n", 229 | "**Answer: 2**\n", 230 | "Trustworthiness Score: 0.7999592987447975\n", 231 | "Ground Truth: 2\n", 232 | "---\n", 233 | "Prompt: Last week Fred had 111 dollars and Jason had 40 dollars. Fred washed cars over the weekend and now has 115 dollars. Jason delivered newspapers and now has 44 dollars. How much money did they earn by washing cars?\n", 234 | "Therefore, the answer (arabic numerals) is: \n", 235 | "Response: 4\n", 236 | "Trustworthiness Score: 0.8981307930188448\n", 237 | "Ground Truth: 4\n", 238 | "---\n" 239 | ] 240 | }, 241 | { 242 | "data": { 243 | "text/html": [ 244 | "
\n", 245 | "\n", 258 | "\n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | "
promptground_truthresponse_tlm_o1_previewtrustworthiness_score
0Matthew gave equal numbers of crackers and cak...1Each friend ate \\(\\frac{13}{28}\\) crackers and...0.950989
1Jack received 6 emails in the morning, 3 email...3To determine how many emails Jack received in ...0.826781
2After eating a hearty meal they went to see th...949To determine the total number of visitors to B...0.847629
3Mary is baking a cake. The recipe calls for 11...2To find out how many cups of flour Mary has al...0.799959
4Last week Fred had 111 dollars and Jason had 4...440.898131
\n", 306 | "
" 307 | ], 308 | "text/plain": [ 309 | " prompt ground_truth \\\n", 310 | "0 Matthew gave equal numbers of crackers and cak... 1 \n", 311 | "1 Jack received 6 emails in the morning, 3 email... 3 \n", 312 | "2 After eating a hearty meal they went to see th... 949 \n", 313 | "3 Mary is baking a cake. The recipe calls for 11... 2 \n", 314 | "4 Last week Fred had 111 dollars and Jason had 4... 4 \n", 315 | "\n", 316 | " response_tlm_o1_preview trustworthiness_score \n", 317 | "0 Each friend ate \\(\\frac{13}{28}\\) crackers and... 0.950989 \n", 318 | "1 To determine how many emails Jack received in ... 0.826781 \n", 319 | "2 To determine the total number of visitors to B... 0.847629 \n", 320 | "3 To find out how many cups of flour Mary has al... 0.799959 \n", 321 | "4 4 0.898131 " 322 | ] 323 | }, 324 | "execution_count": 9, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "from cleanlab_tlm import TLM\n", 331 | "\n", 332 | "tlm = TLM(options={\"model\": \"o1-preview\"}, quality_preset=\"best\")\n", 333 | "\n", 334 | "responses = []\n", 335 | "trustworthiness_scores = []\n", 336 | "# Only use the first 5 rows for illustration. Feel free to use the entire dataset.\n", 337 | "for _, row in svamp_49_dataset.head().iterrows():\n", 338 | " prompt = row['prompt']\n", 339 | " response_dict = tlm.prompt(prompt)\n", 340 | " responses.append(response_dict['response'])\n", 341 | " trustworthiness_scores.append(response_dict['trustworthiness_score'])\n", 342 | " print(f\"Prompt: {prompt}\")\n", 343 | " print(f\"Response: {response_dict['response']}\")\n", 344 | " print(f\"Trustworthiness Score: {response_dict['trustworthiness_score']}\")\n", 345 | " print(f\"Ground Truth: {row['ground_truth']}\")\n", 346 | " print(\"---\")\n", 347 | "\n", 348 | "# Create a new DataFrame with the responses and trustworthiness scores\n", 349 | "results_df = pd.DataFrame({\n", 350 | " 'prompt': svamp_49_dataset.head()['prompt'],\n", 351 | " 'ground_truth': svamp_49_dataset.head()['ground_truth'],\n", 352 | " 'response_tlm_o1_preview': responses,\n", 353 | " 'trustworthiness_score': trustworthiness_scores\n", 354 | "})\n", 355 | "\n", 356 | "results_df.to_csv(\"SVAMP-49-benchmark/svamp_49_tlm_o1_preview_responses_new.csv\", index=False)\n", 357 | "\n", 358 | "# Display the results\n", 359 | "results_df" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "# Try individual examples" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 3, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "{\n", 379 | " \"response\": \"Zachary did 53 push-ups and 14 crunches.\",\n", 380 | " \"trustworthiness_score\": 0.6936745208231877\n", 381 | "}\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "tlm = TLM(options={\"model\": \"o1-preview\"}, quality_preset=\"best\")\n", 387 | "\n", 388 | "prompt = \"\"\"Zachary did 53 push-ups and 14 crunches in gym class today. David did 17 more push-ups but 10 less crunches than zachary. 
How many push-ups and crunches did Zachary do?\n", 389 | "Therefore, the answer (arabic numerals) is: \n", 390 | "\"\"\"\n", 391 | "import json\n", 392 | "print(json.dumps(tlm.prompt(prompt), indent=4))" 393 | ] 394 | } 395 | ], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "Python 3", 399 | "language": "python", 400 | "name": "python3" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.10.15" 413 | } 414 | }, 415 | "nbformat": 4, 416 | "nbformat_minor": 2 417 | } 418 | -------------------------------------------------------------------------------- /benchmarking_hallucination_model/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark hallucination detection models on RAG datasets 2 | 3 | This folder contains notebooks used to evaluate popular hallucination detection models on various RAG (Context, Question, LLM Response) datasets. 4 | 5 | Datasets used in the benchmark include: 6 | - [ELI5](https://huggingface.co/datasets/explodinggradients/ELI5) 7 | - [FinQA](https://huggingface.co/datasets/Cleanlab/FinQA-hallucination-detection) 8 | - [HaluBench](https://huggingface.co/datasets/PatronusAI/HaluBench): 9 | - CovidQA 10 | - PubmedQA 11 | - DROP 12 | - FinanceBench 13 | 14 | We omitted the other datasets from HaluBench after discovering too many annotation errors. For FinQA, we specifically used the [FinQA-hallucination-detection](https://huggingface.co/datasets/Cleanlab/FinQA-hallucination-detection) version of this dataset, after discovering annotation errors and synthetic responses in [other versions](https://huggingface.co/datasets/wandb/finqa-data-processed-hallucination). 
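For a quick start with these datasets, here is a minimal sketch (our addition, not code from these notebooks) that loads one of them from Hugging Face; the split name and column layout are assumptions to verify against each dataset card:

```python
# Sketch: load one of the benchmark datasets listed above.
# Requires: pip install datasets
from datasets import load_dataset

# The dataset ID comes from the HaluBench link above; the split name ("test")
# is an assumption -- check the dataset card for the actual split names.
halubench = load_dataset("PatronusAI/HaluBench", split="test")

# Each row is a (context, question, answer, label)-style example; exact column
# names vary per dataset, so inspect the first record before use.
print(halubench[0])
```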
15 | 16 | 17 | 18 | Models compared in our benchmark include: 19 | 20 | | Notebook | Description | 21 | |----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------| 22 | | [Patronus Lynx](Lynx.ipynb) | Evaluates [Patronus Lynx 70B](https://huggingface.co/PatronusAI/Llama-3-Patronus-Lynx-70B-Instruct) model | 23 | | [Vectara HHEM](HHEM.ipynb) | Evaluates [Vectara's HHEM v2.1](https://huggingface.co/vectara/hallucination_evaluation_model) model | 24 | | [Prometheus 2](Prometheus.ipynb) | Evaluates [Prometheus 2 8x7B](https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0) model | 25 | | [LLM as judge and TLM](LLM_as_judge_and_TLM.ipynb) | Evaluates LLM-as-judge and the Trustworthy Language Model on the ELI5 dataset | 26 | | [LLM as judge and TLM](../benchmarking_hallucination_metrics/benchmark_hallucination_metrics.ipynb) | Evaluates LLM-as-judge and the Trustworthy Language Model on HaluBench datasets | 27 | -------------------------------------------------------------------------------- /few_shot_prompt_selection/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.3 2 | openai==0.27.7 3 | pandas==2.0.1 4 | langchain==0.0.181 5 | scikit-learn==1.2.2 6 | tiktoken==0.4.0 7 | tqdm==4.65.0 -------------------------------------------------------------------------------- /fine_tuning_classification/improving-openai-davinci-with-cleanlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cleanlab/cleanlab-tools/158841f9cf7ac1a3227ab45947d39c7161453956/fine_tuning_classification/improving-openai-davinci-with-cleanlab.png -------------------------------------------------------------------------------- /fine_tuning_classification/improving-openai-models-with-cleanlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cleanlab/cleanlab-tools/158841f9cf7ac1a3227ab45947d39c7161453956/fine_tuning_classification/improving-openai-models-with-cleanlab.png -------------------------------------------------------------------------------- /fine_tuning_classification/requirements.txt: -------------------------------------------------------------------------------- 1 | openai==0.27.2 -------------------------------------------------------------------------------- /fine_tuning_data_curation/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.0.3 2 | cleanlab_studio==1.1.21 3 | tqdm==4.66.1 -------------------------------------------------------------------------------- /fine_tuning_mistral_beavertails/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | aiohttp==3.9.3 3 | aiosignal==1.3.1 4 | attrs==23.2.0 5 | bitsandbytes==0.42.0 6 | certifi==2024.2.2 7 | chardet==5.2.0 8 | charset-normalizer==3.3.2 9 | cleanlab-studio==1.2.4 10 | click==8.1.3 11 | colorama==0.4.6 12 | datasets==2.14.7 13 | dill==0.3.7 14 | et-xmlfile==1.1.0 15 | evaluate==0.4.1 16 | filelock==3.13.1 17 | frozenlist==1.4.1 18 | fsspec==2023.10.0 19 | huggingface-hub==0.17.3 20 | idna==3.6 21 | ijson==3.2.3 22 | Jinja2==3.1.3 23 | joblib==1.3.2 24 | jsonstreams==0.6.0 25 | lml==0.1.0 26 | MarkupSafe==2.1.5 27 | mpmath==1.3.0 28 | multidict==6.0.5 29 | 
multiprocess==0.70.15 30 | nest-asyncio==1.6.0 31 | networkx==3.2.1 32 | numpy==1.26.4 33 | openpyxl==3.1.2 34 | packaging==23.2 35 | pandas==2.2.1 36 | peft==0.4.0 37 | pillow==10.2.0 38 | psutil==5.9.8 39 | pyarrow==15.0.1 40 | pyarrow-hotfix==0.6 41 | pyexcel==0.7.0 42 | pyexcel-io==0.6.6 43 | pyexcel-xls==0.7.0 44 | pyexcel-xlsx==0.6.0 45 | python-dateutil==2.9.0.post0 46 | pytz==2024.1 47 | PyYAML==6.0.1 48 | regex==2023.12.25 49 | requests==2.31.0 50 | responses==0.18.0 51 | safetensors==0.4.2 52 | scikit-learn==1.4.1.post1 53 | scipy==1.12.0 54 | semver==2.13.0 55 | six==1.16.0 56 | sympy==1.12 57 | texttable==1.7.0 58 | threadpoolctl==3.3.0 59 | tokenizers==0.14.1 60 | torch==2.2.1 61 | tqdm==4.66.2 62 | transformers==4.34.0 63 | trl==0.4.7 64 | typing_extensions==4.10.0 65 | tzdata==2024.1 66 | urllib3==2.2.1 67 | validators==0.22.0 68 | xlrd==2.0.1 69 | xlwt==1.3.0 70 | xxhash==3.4.1 71 | yarl==1.9.4 72 | -------------------------------------------------------------------------------- /generate_llm_response/generate_llm_response.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d431b273", 6 | "metadata": {}, 7 | "source": [ 8 | "# Generating LLM Responses to Customer Service Requests" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "926caab2-f041-4489-92eb-1e159a621499", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook demonstrates how to instruct an LLM to simulate the role of a customer service agent and generate responses to customer service queries.\n", 17 | "\n", 18 | "See how you can evaluate the quality of these LLM-generated responses in this tutorial: [Detecting Issues in LLM Outputs](https://help.cleanlab.ai/tutorials/llm/)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "baa6dbea-2e4f-479f-9ae0-8caac6ec361c", 24 | "metadata": {}, 25 | "source": [ 26 | "## Import Dependencies" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "c1988f46-f5b9-4a3e-8818-957ce52d9145", 32 | "metadata": {}, 33 | "source": [ 34 | "Make sure you have wget installed to run this tutorial. You can use pip to install all other packages required for this tutorial as follows:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "id": "59c1b469-9e5d-48c7-9205-d44d19ea5dc2", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "!pip install pandas scipy openai transformers torch accelerate bitsandbytes" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "id": "7f48f382", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import numpy as np\n", 55 | "import pandas as pd\n", 56 | "\n", 57 | "from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline\n", 58 | "import openai\n", 59 | "\n", 60 | "from tqdm import tqdm" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "b4f749d0-892a-4f3d-93b1-f0714024de22", 66 | "metadata": {}, 67 | "source": [ 68 | "## Fetch and View Customer Service Requests\n", 69 | "\n", 70 | "To fetch the data for this tutorial, make sure you have wget installed."
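If wget is not available on your system, the same CSV can be fetched with just the Python standard library; this is a sketch we have added as an alternative to the wget cell below (the URL is the one used in that cell):

```python
# Alternative to the wget cell below, using only the Python standard library.
import pathlib
import urllib.request

url = "https://cleanlab-public.s3.amazonaws.com/Datasets/llm-customer-service-prompts.csv"
pathlib.Path("data").mkdir(exist_ok=True)
dest = pathlib.Path("data") / "llm-customer-service-prompts.csv"
if not dest.exists():  # mirrors wget's -nc (no-clobber) flag
    urllib.request.urlretrieve(url, str(dest))
```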
71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "13335149-2a99-4d9a-8ed0-c3630411857e", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "!wget -nc https://cleanlab-public.s3.amazonaws.com/Datasets/llm-customer-service-prompts.csv -P data" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "63f9942d-8fd3-468b-ab29-21064e67b61d", 86 | "metadata": {}, 87 | "source": [ 88 | "For this notebook, we have a CSV file containing various customer service queries. Let's view a selection of these requests:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "id": "e0a1984a", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "array(['help trying to update the delivery address',\n", 101 | " 'want help to get my invoices from {{Person Name}}',\n", 102 | " 'find information about the termination of my {{Account Type}} account',\n", 103 | " 'could you help me to correct my shipping address?',\n", 104 | " 'i cannot make payments help to report an error'], dtype=object)" 105 | ] 106 | }, 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "data = pd.read_csv(\"data/llm-customer-service-prompts.csv\")\n", 114 | "\n", 115 | "requests = data[\"request\"].values\n", 116 | "requests[:5]" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "df1a0f85", 122 | "metadata": {}, 123 | "source": [ 124 | "## Obtaining Responses from Llama 2 \n", 125 | "\n", 126 | "First, we demonstrate how to generate responses to our customer service requests using Llama 2. We load the model from Hugging Face and create a pipeline for text generation.\n", 127 | "\n", 128 | "Note that this portion of the notebook requires a GPU; if one is unavailable, scroll to the next section to see how to obtain responses using OpenAI's API." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "f1a9f647", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "model_id = 'meta-llama/Llama-2-7b-chat-hf' \n", 139 | "# model_id = \"mistralai/Mistral-7B-v0.1\" # alternatively, can use mistral by changing the model_id\n", 140 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 141 | "model = AutoModelForCausalLM.from_pretrained(\n", 142 | " model_id,\n", 143 | " load_in_8bit=True,\n", 144 | " device_map=\"auto\", \n", 145 | ")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 6, 151 | "id": "10c2268c", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "pipe = pipeline(\n", 156 | " \"text-generation\",\n", 157 | " model=model, \n", 158 | " tokenizer=tokenizer, \n", 159 | " max_length=400,\n", 160 | " temperature=0.8, # can adjust temp \n", 161 | ")" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "dfbd45d4-2fc4-4747-9bec-4675ff664370", 167 | "metadata": {}, 168 | "source": [ 169 | "Next, we create a prompt template which we will use to format the inputs to the LLM."
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 7, 175 | "id": "e3f60620", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "def format_llama_prompt(text):\n", 180 | " return f\"\"\"You are a customer service agent, provide a response with next steps to the following question: \n", 181 | " {text}\n", 182 | " \n", 183 | " Answer:\"\"\"" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "563d7855-c7a2-427c-9e41-2c336dd8abdd", 189 | "metadata": {}, 190 | "source": [ 191 | "Finally, we pass the requests to the LLM using the formatted prompt and save its outputs." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "cec55b1a", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "text_gen = (format_llama_prompt(t) for t in requests)\n", 202 | "llama2_outputs = np.zeros(len(requests)).astype(object)\n", 203 | "\n", 204 | "for i, out in tqdm(enumerate(pipe(text_gen))):\n", 205 | " llama2_outputs[i] = out[0][\"generated_text\"].split(\"Answer:\")[1].strip()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "bb55ef62-54c0-43ad-8170-8cc131d8c9ff", 211 | "metadata": {}, 212 | "source": [ 213 | "Let's view a sample response from Llama 2:" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 9, 219 | "id": "845b4a9d", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "Thank you for reaching out to us. I apologize for any inconvenience you're experiencing with your delivery address. To update your delivery address, please follow these steps:\n", 227 | "\n", 228 | "1. Log in to your account on our website or mobile app.\n", 229 | "2. Click on the \"Account\" or \"Settings\" tab.\n", 230 | "3. Scroll down to the \"Delivery\" section and click on \"Edit.\"\n", 231 | "4. Enter your new delivery address and confirm.\n", 232 | "5. If you have any trouble updating your address, please contact our customer service team for further assistance.\n", 233 | "\n", 234 | "Remember, you can also track your packages and view your order history in the \"Account\" or \"Settings\" tab. If you have any other questions or concerns, feel free to reach out to us.\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "print(llama2_outputs[0])" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "b45cb194", 245 | "metadata": {}, 246 | "source": [ 247 | "## Obtaining Responses from OpenAI" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "888372fc-be3c-49dc-8317-d45fd9f1962c", 253 | "metadata": {}, 254 | "source": [ 255 | "Here, we demonstrate how to generate responses to our customer service requests using the OpenAI API. We will be using the `gpt-3.5-turbo` model in this notebook.\n", 256 | "\n", 257 | "Firstly, we need to enter our OpenAI API key." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "id": "574d21bf-a220-42d1-b44e-8994302844ae", 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "openai.api_key = \"\"" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "id": "181de07d-c80b-4800-9b9a-33047a6acddf", 273 | "metadata": {}, 274 | "source": [ 275 | "Next, we define a function that formats our prompts into the chat-message format expected by the OpenAI API."
276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 12, 281 | "id": "61983036-46a9-4c68-b94a-dbcefa54337f", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "def format_openai_prompt(text):\n", 286 | " return [\n", 287 | " {\"role\": \"system\", \"content\": \"You are a customer service agent, provide a response with next steps to the following question in a few sentences.\"},\n", 288 | " {\"role\": \"user\", \"content\": text}\n", 289 | " ]" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "id": "dca40407-3919-4682-8ef9-f685fb234a77", 295 | "metadata": {}, 296 | "source": [ 297 | "Then, we pass the requests to the LLM using the formatted prompt and save its outputs." 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "6ad2ad74-b0b0-423c-a4b9-21c486440e55", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "openai_outputs = np.zeros(len(requests)).astype(object)\n", 308 | "\n", 309 | "for i, text in tqdm(enumerate(requests)):\n", 310 | " output = openai.ChatCompletion.create(\n", 311 | " model=\"gpt-3.5-turbo\",\n", 312 | " messages=format_openai_prompt(text),\n", 313 | " temperature=0.8, # can adjust temp \n", 314 | " ) \n", 315 | " \n", 316 | " openai_outputs[i] = output.choices[0].message.content" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "id": "28868b63-7395-432a-a47f-11c0428b0491", 322 | "metadata": {}, 323 | "source": [ 324 | "Finally, let's view a sample response from GPT-3.5:" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 14, 330 | "id": "8daa8dfb-8e36-41de-848c-6f99fdd06d0d", 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "I apologize for any inconvenience you may be facing with updating your delivery address. In order to assist you further, please provide me with your order number and the new delivery address. Once I have this information, I will be able to initiate the process of updating the delivery address for you. 
Thank you for your patience.\n" 338 | ] 339 | } 340 | ], 341 | "source": [ 342 | "print(openai_outputs[0])" 343 | ] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3 (ipykernel)", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.9.6" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 5 367 | } 368 | -------------------------------------------------------------------------------- /generate_llm_response/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.0.3 2 | scipy==1.10.1 3 | openai==0.28.1 4 | transformers==4.34.1 5 | torch==2.1.0 6 | accelerate==0.24.1 7 | bitsandbytes==0.41.1 -------------------------------------------------------------------------------- /gpt4-rag-logprobs/gpt4-rag-logprobs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Obtaining logprobs from GPT-4 RAG systems\n", 8 | "\n", 9 | "\n" 10 | ] 11 | }, 12 | { 13 | "attachments": {}, 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "This notebook showcases how the GPT-4 logprob results were obtained in our tutorial on integrating Cleanlab TLM within a RAG system. We reference code from OpenAI's [blogpost](https://cookbook.openai.com/examples/using_logprobs) on logprobs.\n", 18 | "\n", 19 | "From the blogpost,\n", 20 | "> Log probabilities of output tokens indicate the likelihood of each token occurring in the sequence given the context. To simplify, a logprob is log(p), where p = probability of a token occurring at a specific position based on the previous tokens in the context." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Installing LlamaIndex" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "%pip install llama-index" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Initialize OpenAI Client\n", 44 | "\n", 45 | "LlamaIndex uses OpenAI's embedding models by default. Make sure your API key is set in your environment, as in the following cell." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 1, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "from openai import OpenAI\n", 55 | "import os\n", 56 | "API_KEY = \"\"\n", 57 | "os.environ['OPENAI_API_KEY'] = API_KEY\n", 58 | "client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\", API_KEY))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Integrating GPT-4 with LlamaIndex" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "The following code defines two utility functions. `get_completion()` returns the output from the OpenAI client given a prompt and a few other customizable parameters. `parse()` extracts our desired fields from the output, `response` and `logprobs`, and calculates the average logprob and linear probability over the tokens in the `response` string."
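Before defining these helpers, here is a toy illustration of the logprob-to-linear-probability conversion that `parse()` performs (the token logprobs below are made up, not values from this notebook):

```python
# Toy example: average per-token logprobs, then convert back to a probability.
import numpy as np

token_logprobs = [-0.02, -0.10, -0.05]       # hypothetical per-token logprobs
avg_logprob = np.mean(token_logprobs)        # -0.0567 (average in log space)
avg_linear_prob = np.exp(avg_logprob) * 100  # exp() undoes the log: ~94.49%
print(f"{avg_logprob:.4f} -> {avg_linear_prob:.2f}%")
```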
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 2, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from math import exp\n", 82 | "import numpy as np\n", 83 | "from IPython.display import display, HTML\n", 84 | "from typing import Dict, List, Optional\n", 85 | "\n", 86 | "def get_completion(\n", 87 | " messages: List[Dict[str, str]],\n", 88 | " model: str = \"gpt-4\",\n", 89 | " max_tokens=500,\n", 90 | " temperature=0,\n", 91 | " stop=None,\n", 92 | " seed=123,\n", 93 | " tools=None,\n", 94 | " logprobs=None, # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of the message.\n", 95 | " top_logprobs=None,\n", 96 | "): # returns the full ChatCompletion object\n", 97 | " params = {\n", 98 | " \"model\": model,\n", 99 | " \"messages\": messages,\n", 100 | " \"max_tokens\": max_tokens,\n", 101 | " \"temperature\": temperature,\n", 102 | " \"stop\": stop,\n", 103 | " \"seed\": seed,\n", 104 | " \"logprobs\": logprobs,\n", 105 | " \"top_logprobs\": top_logprobs,\n", 106 | " }\n", 107 | " if tools:\n", 108 | " params[\"tools\"] = tools\n", 109 | "\n", 110 | " completion = client.chat.completions.create(**params)\n", 111 | " return completion\n", 112 | "\n", 113 | "def parse(api_response):\n", 114 | " choice = api_response.choices[0]\n", 115 | " content = choice.message.content\n", 116 | " logprobs = [logprob.logprob for logprob in choice.logprobs.content]\n", 117 | " \n", 118 | " average_logprob = np.mean(logprobs)\n", 119 | " average_linear_prob = np.exp(average_logprob) * 100\n", 120 | " \n", 121 | " result_string = (f\"Response: {content}\\n\"\n", 122 | " f\"Average Log Probability: {average_logprob:.4f}\\n\"\n", 123 | " f\"Average Linear Probability: {average_linear_prob:.2f}%\")\n", 124 | " \n", 125 | " return result_string" 126 | ] 127 | }, 128 | { 129 | "attachments": {}, 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "The following code sets GPT-4 (with logprobs) as the underlying large language model (LLM) for our RAG system. `GPTWrapper` is built on top of LlamaIndex's [CustomLLM](https://docs.llamaindex.ai/en/stable/module_guides/models/llms/usage_custom.html#using-custom-llm-advanced) class. 
\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from typing import Any, Callable, Optional, Sequence\n", 143 | "\n", 144 | "from llama_index.core.base.llms.types import (\n", 145 | " ChatMessage,\n", 146 | " CompletionResponse,\n", 147 | " CompletionResponseGen,\n", 148 | " LLMMetadata,\n", 149 | ")\n", 150 | "from llama_index.core.callbacks import CallbackManager\n", 151 | "from llama_index.core.llms.callbacks import llm_completion_callback\n", 152 | "from llama_index.core.llms.custom import CustomLLM\n", 153 | "from llama_index.core.types import PydanticProgramMode\n", 154 | "from llama_index.core import Settings\n", 155 | "import json\n", 156 | "\n", 157 | "class GPTWrapper(CustomLLM):\n", 158 | " context_window: int = 3900\n", 159 | " num_output: int = 256\n", 160 | " model_name: str = \"ChatGPT-4\"\n", 161 | "\n", 162 | " @property\n", 163 | " def metadata(self) -> LLMMetadata:\n", 164 | " \"\"\"Get LLM metadata.\"\"\"\n", 165 | " return LLMMetadata(\n", 166 | " context_window=self.context_window,\n", 167 | " num_output=self.num_output,\n", 168 | " model_name=self.model_name,\n", 169 | " )\n", 170 | "\n", 171 | " @llm_completion_callback()\n", 172 | " def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:\n", 173 | " API_RESPONSE = get_completion(\n", 174 | " [\n", 175 | " {\n", 176 | " \"role\": \"user\",\n", 177 | " \"content\": prompt\n", 178 | " }\n", 179 | " ],\n", 180 | " model=\"gpt-4\",\n", 181 | " logprobs=True,\n", 182 | " )\n", 183 | " return CompletionResponse(text=parse(API_RESPONSE))\n", 184 | "\n", 185 | "\n", 186 | " @llm_completion_callback()\n", 187 | " def stream_complete(\n", 188 | " self, prompt: str, **kwargs: Any\n", 189 | " ) -> CompletionResponseGen:\n", 190 | " API_RESPONSE = get_completion(\n", 191 | " [\n", 192 | " {\n", 193 | " \"role\": \"user\",\n", 194 | " \"content\": prompt\n", 195 | " }\n", 196 | " ],\n", 197 | " model=\"gpt-4\",\n", 198 | " logprobs=True,\n", 199 | " )\n", 200 | " for char in API_RESPONSE:\n", 201 | " yield CompletionResponse(text=char, delta=char)\n", 202 | "\n", 203 | "Settings.llm = GPTWrapper()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## Load data and build an index" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "This tutorial uses Nvidia's earnings report from Q1 FY2024. Download it via the command below and save it in a folder called `data`." 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "wget -nc 'https://cleanlab-public.s3.amazonaws.com/Datasets/NVIDIA_Financial_Results_Q1_FY2024.md'\n", 227 | "!mkdir -p data\n", 228 | "!mv NVIDIA_Financial_Results_Q1_FY2024.md data/" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "Your directory structure should look like this:\n", 236 | "```\n", 237 | "├── tlm-rag-tutorial.ipynb\n", 238 | "└── data\n", 239 | " └── NVIDIA_Financial_Results_Q1_FY2024.md\n", 240 | "```" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "The following cell constructs an index from the content in the `data` folder." 
248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 4, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", 257 | "\n", 258 | "documents = SimpleDirectoryReader(\"data\").load_data()\n", 259 | "index = VectorStoreIndex.from_documents(documents)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "## Query your data" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "Now, you can create a Q&A engine over your index and input prompts." 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 5, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "query_engine = index.as_query_engine()" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 6, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Response: The document does not provide specific information on what Nvidia's largest product is.\n", 295 | "Average Log Probability: -0.0582\n", 296 | "Average Linear Probability: 94.34%\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "response = query_engine.query(\"What is Nvidia's largest product?\")\n", 302 | "print(response)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 7, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "Response: False\n", 315 | "Average Log Probability: -0.0081\n", 316 | "Average Linear Probability: 99.19%\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "response = query_engine.query(\"True or False: Nvidia's Professional Visualization division is performing better than their Gaming division in terms of percent change in revenue compared to the previous quarter.\")\n", 322 | "print(response)" 323 | ] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "ag", 329 | "language": "python", 330 | "name": "ag" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.8.10" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 4 347 | } 348 | -------------------------------------------------------------------------------- /jigsaw_ai_safety_keras/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | aiohttp==3.9.3 3 | aiosignal==1.3.1 4 | alabaster==0.7.16 5 | albumentations==1.3.1 6 | altair==4.2.2 7 | annotated-types==0.6.0 8 | anyio==3.7.1 9 | appdirs==1.4.4 10 | argon2-cffi==23.1.0 11 | argon2-cffi-bindings==21.2.0 12 | array-record==0.5.0 13 | arrow==1.3.0 14 | arviz==0.15.1 15 | astropy==5.3.4 16 | astunparse==1.6.3 17 | async-timeout==4.0.3 18 | atpublic==4.0 19 | attrs==23.2.0 20 | audioread==3.0.1 21 | autograd==1.6.2 22 | Babel==2.14.0 23 | backcall==0.2.0 24 | beautifulsoup4==4.12.3 25 | bidict==0.23.1 26 | bigframes==0.22.0 27 | bleach==6.1.0 28 | blinker==1.4 29 | blis==0.7.11 30 | blosc2==2.0.0 31 | bokeh==3.3.4 32 | bqplot==0.12.43 33 | branca==0.7.1 34 | build==1.1.1 35 | CacheControl==0.14.0 36 | cachetools==5.3.3 37 | catalogue==2.0.10 38 | certifi==2024.2.2 39 | cffi==1.16.0 
chardet==5.2.0 40 | charset-normalizer==3.3.2 41 | chex==0.1.85 42 | cleanlab-studio==1.2.4 43 | click==8.1.3 44 | click-plugins==1.1.1 45 | cligj==0.7.2 46 | cloudpathlib==0.16.0 47 | cloudpickle==2.2.1 48 | cmake==3.27.9 49 | cmdstanpy==1.2.1 50 | colorama==0.4.6 51 | colorcet==3.1.0 52 | colorlover==0.3.0 53 | colour==0.1.5 54 | community==1.0.0b1 55 | confection==0.1.4 56 | cons==0.4.6 57 | contextlib2==21.6.0 58 | contourpy==1.2.0 59 | cryptography==42.0.5 60 | cufflinks==0.17.3 61 | cupy-cuda12x==12.2.0 62 | cvxopt==1.3.2 63 | cvxpy==1.3.3 64 | cycler==0.12.1 65 | cymem==2.0.8 66 | Cython==3.0.9 67 | dask==2023.8.1 68 | datascience==0.17.6 69 | datasets==2.18.0 70 | db-dtypes==1.2.0 71 | dbus-python==1.2.18 72 | debugpy==1.6.6 73 | decorator==4.4.2 74 | defusedxml==0.7.1 75 | dill==0.3.8 76 | distributed==2023.8.1 77 | distro==1.7.0 78 | dlib==19.24.2 79 | dm-tree==0.1.8 80 | docutils==0.18.1 81 | dopamine-rl==4.0.6 82 | duckdb==0.9.2 83 | earthengine-api==0.1.392 84 | easydict==1.13 85 | ecos==2.0.13 86 | editdistance==0.6.2 87 | eerepr==0.0.4 88 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 89 | entrypoints==0.4 90 | et-xmlfile==1.1.0 91 | etils==1.7.0 92 | etuples==0.3.9 93 | exceptiongroup==1.2.0 94 | fastai==2.7.14 95 | fastcore==1.5.29 96 | fastdownload==0.0.7 97 | fastjsonschema==2.19.1 98 | fastprogress==1.0.3 99 | fastrlock==0.8.2 100 | filelock==3.13.1 101 | fiona==1.9.5 102 | firebase-admin==5.3.0 103 | Flask==2.2.5 104 | flatbuffers==23.5.26 105 | flax==0.8.1 106 | folium==0.14.0 107 | fonttools==4.49.0 108 | frozendict==2.4.0 109 | frozenlist==1.4.1 110 | fsspec==2023.6.0 111 | future==0.18.3 112 | gast==0.5.4 113 | gcsfs==2023.6.0 114 | GDAL==3.6.4 115 | gdown==4.7.3 116 | geemap==0.32.0 117 | gensim==4.3.2 118 | geocoder==1.38.1 119 | geographiclib==2.0 120 | geopandas==0.13.2 121 | geopy==2.3.0 122 | gin-config==0.5.0 123 | glob2==0.7 124 | google==2.0.3 125 | google-ai-generativelanguage==0.4.0 126 | google-api-core==2.11.1 127 | google-api-python-client==2.84.0 128 | google-auth==2.27.0 129 | google-auth-httplib2==0.1.1 130 | google-auth-oauthlib==1.2.0 131 | google-cloud-aiplatform==1.43.0 132 | google-cloud-bigquery==3.12.0 133 | google-cloud-bigquery-connection==1.12.1 134 | google-cloud-bigquery-storage==2.24.0 135 | google-cloud-core==2.3.3 136 | google-cloud-datastore==2.15.2 137 | google-cloud-firestore==2.11.1 138 | google-cloud-functions==1.13.3 139 | google-cloud-iam==2.14.3 140 | google-cloud-language==2.13.3 141 | google-cloud-resource-manager==1.12.3 142 | google-cloud-storage==2.8.0 143 | google-cloud-translate==3.11.3 144 | google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz#sha256=e916d4e7c3ba6158df864a2e03852211d8fab20abb3db5205b865eedf4be9799 145 | google-crc32c==1.5.0 146 | google-generativeai==0.3.2 147 | google-pasta==0.2.0 148 | google-resumable-media==2.7.0 149 | googleapis-common-protos==1.62.0 150 | googledrivedownloader==0.4 151 | graphviz==0.20.1 152 | greenlet==3.0.3 153 | grpc-google-iam-v1==0.13.0 154 | grpcio==1.62.0 155 | grpcio-status==1.48.2 156 | gspread==3.4.2 157 | gspread-dataframe==3.3.1 158 | gym==0.25.2 159 | gym-notices==0.0.8 160 | h5netcdf==1.3.0 161 | h5py==3.9.0 162 | holidays==0.44 163 | holoviews==1.17.1 164 | html5lib==1.1 165 | httpimport==1.3.1 166 | httplib2==0.22.0 167 | huggingface-hub==0.20.3 168 | humanize==4.7.0 169 | hyperopt==0.2.7 
170 | ibis-framework==7.1.0 171 | idna==3.6 172 | ijson==3.2.3 173 | imageio==2.31.6 174 | imageio-ffmpeg==0.4.9 175 | imagesize==1.4.1 176 | imbalanced-learn==0.10.1 177 | imgaug==0.4.0 178 | importlib-metadata==7.0.1 179 | importlib_resources==6.1.2 180 | imutils==0.5.4 181 | inflect==7.0.0 182 | iniconfig==2.0.0 183 | intel-openmp==2023.2.3 184 | ipyevents==2.0.2 185 | ipyfilechooser==0.6.0 186 | ipykernel==5.5.6 187 | ipyleaflet==0.18.2 188 | ipython==7.34.0 189 | ipython-genutils==0.2.0 190 | ipython-sql==0.5.0 191 | ipytree==0.2.2 192 | ipywidgets==7.7.1 193 | itsdangerous==2.1.2 194 | jax==0.4.23 195 | jaxlib @ https://storage.googleapis.com/jax-releases/cuda12/jaxlib-0.4.23+cuda12.cudnn89-cp310-cp310-manylinux2014_x86_64.whl#sha256=8e42000672599e7ec0ea7f551acfcc95dcdd0e22b05a1d1f12f97b56a9fce4a8 196 | jeepney==0.7.1 197 | jieba==0.42.1 198 | Jinja2==3.1.3 199 | joblib==1.3.2 200 | jsonpickle==3.0.3 201 | jsonschema==4.19.2 202 | jsonschema-specifications==2023.12.1 203 | jsonstreams==0.6.0 204 | jupyter-client==6.1.12 205 | jupyter-console==6.1.0 206 | jupyter-server==1.24.0 207 | jupyter_core==5.7.1 208 | jupyterlab_pygments==0.3.0 209 | jupyterlab_widgets==3.0.10 210 | kaggle==1.5.16 211 | kagglehub==0.2.0 212 | keras==3.0.5 213 | keras-core==0.1.7 214 | keras-nlp==0.8.2 215 | keyring==23.5.0 216 | kiwisolver==1.4.5 217 | langcodes==3.3.0 218 | launchpadlib==1.10.16 219 | lazr.restfulclient==0.14.4 220 | lazr.uri==1.0.6 221 | lazy_loader==0.3 222 | libclang==16.0.6 223 | librosa==0.10.1 224 | lightgbm==4.1.0 225 | linkify-it-py==2.0.3 226 | llvmlite==0.41.1 227 | lml==0.1.0 228 | locket==1.0.0 229 | logical-unification==0.4.6 230 | lxml==4.9.4 231 | malloy==2023.1067 232 | Markdown==3.5.2 233 | markdown-it-py==3.0.0 234 | MarkupSafe==2.1.5 235 | matplotlib==3.7.1 236 | matplotlib-inline==0.1.6 237 | matplotlib-venn==0.11.10 238 | mdit-py-plugins==0.4.0 239 | mdurl==0.1.2 240 | miniKanren==1.0.3 241 | missingno==0.5.2 242 | mistune==0.8.4 243 | mizani==0.9.3 244 | mkl==2023.2.0 245 | ml-dtypes==0.2.0 246 | mlxtend==0.22.0 247 | more-itertools==10.1.0 248 | moviepy==1.0.3 249 | mpmath==1.3.0 250 | msgpack==1.0.8 251 | multidict==6.0.5 252 | multipledispatch==1.0.0 253 | multiprocess==0.70.16 254 | multitasking==0.0.11 255 | murmurhash==1.0.10 256 | music21==9.1.0 257 | namex==0.0.7 258 | natsort==8.4.0 259 | nbclassic==1.0.0 260 | nbclient==0.9.0 261 | nbconvert==6.5.4 262 | nbformat==5.9.2 263 | nest-asyncio==1.6.0 264 | networkx==3.2.1 265 | nibabel==4.0.2 266 | nltk==3.8.1 267 | notebook==6.5.5 268 | notebook_shim==0.2.4 269 | numba==0.58.1 270 | numexpr==2.9.0 271 | numpy==1.25.2 272 | oauth2client==4.1.3 273 | oauthlib==3.2.2 274 | opencv-contrib-python==4.8.0.76 275 | opencv-python==4.8.0.76 276 | opencv-python-headless==4.9.0.80 277 | openpyxl==3.1.2 278 | opt-einsum==3.3.0 279 | optax==0.1.9 280 | orbax-checkpoint==0.4.4 281 | osqp==0.6.2.post8 282 | packaging==23.2 283 | pandas==2.2.1 284 | pandas-datareader==0.10.0 285 | pandas-gbq==0.19.2 286 | pandas-stubs==1.5.3.230304 287 | pandocfilters==1.5.1 288 | panel==1.3.8 289 | param==2.0.2 290 | parso==0.8.3 291 | parsy==2.1 292 | partd==1.4.1 293 | pathlib==1.0.1 294 | patsy==0.5.6 295 | peewee==3.17.1 296 | pexpect==4.9.0 297 | pickleshare==0.7.5 298 | Pillow==9.4.0 299 | pins==0.8.4 300 | pip-tools==6.13.0 301 | platformdirs==4.2.0 302 | plotly==5.15.0 303 | plotnine==0.12.4 304 | pluggy==1.4.0 305 | polars==0.20.2 306 | pooch==1.8.1 307 | portpicker==1.5.2 308 | prefetch-generator==1.0.3 309 | preshed==3.0.9 310 | 
prettytable==3.10.0 311 | proglog==0.1.10 312 | progressbar2==4.2.0 313 | prometheus_client==0.20.0 314 | promise==2.3 315 | prompt-toolkit==3.0.43 316 | prophet==1.1.5 317 | proto-plus==1.23.0 318 | protobuf==3.20.3 319 | psutil==5.9.5 320 | psycopg2==2.9.9 321 | ptyprocess==0.7.0 322 | py-cpuinfo==9.0.0 323 | py4j==0.10.9.7 324 | pyarrow==14.0.2 325 | pyarrow-hotfix==0.6 326 | pyasn1==0.5.1 327 | pyasn1-modules==0.3.0 328 | pycocotools==2.0.7 329 | pycparser==2.21 330 | pydantic==2.6.3 331 | pydantic_core==2.16.3 332 | pydata-google-auth==1.8.2 333 | pydot==1.4.2 334 | pydot-ng==2.0.0 335 | pydotplus==2.0.2 336 | PyDrive==1.3.1 337 | PyDrive2==1.6.3 338 | pyerfa==2.0.1.1 339 | pyexcel==0.7.0 340 | pyexcel-io==0.6.6 341 | pyexcel-xls==0.7.0 342 | pyexcel-xlsx==0.6.0 343 | pygame==2.5.2 344 | Pygments==2.16.1 345 | PyGObject==3.42.1 346 | PyJWT==2.3.0 347 | pymc==5.10.4 348 | pymystem3==0.2.0 349 | PyOpenGL==3.1.7 350 | pyOpenSSL==24.0.0 351 | pyparsing==3.1.1 352 | pyperclip==1.8.2 353 | pyproj==3.6.1 354 | pyproject_hooks==1.0.0 355 | pyshp==2.3.1 356 | PySocks==1.7.1 357 | pytensor==2.18.6 358 | pytest==7.4.4 359 | python-apt @ file:///backend-container/containers/python_apt-0.0.0-cp310-cp310-linux_x86_64.whl#sha256=b209c7165d6061963abe611492f8c91c3bcef4b7a6600f966bab58900c63fefa 360 | python-box==7.1.1 361 | python-dateutil==2.8.2 362 | python-louvain==0.16 363 | python-slugify==8.0.4 364 | python-utils==3.8.2 365 | pytz==2023.4 366 | pyviz_comms==3.0.1 367 | PyWavelets==1.5.0 368 | PyYAML==6.0.1 369 | pyzmq==23.2.1 370 | qdldl==0.1.7.post0 371 | qudida==0.0.4 372 | ratelim==0.1.6 373 | referencing==0.33.0 374 | regex==2023.12.25 375 | requests==2.31.0 376 | requests-oauthlib==1.3.1 377 | requirements-parser==0.5.0 378 | rich==13.7.1 379 | rpds-py==0.18.0 380 | rpy2==3.4.2 381 | rsa==4.9 382 | safetensors==0.4.2 383 | scikit-image==0.19.3 384 | scikit-learn==1.2.2 385 | scipy==1.11.4 386 | scooby==0.9.2 387 | scs==3.2.4.post1 388 | seaborn==0.13.1 389 | SecretStorage==3.3.1 390 | semver==2.13.0 391 | Send2Trash==1.8.2 392 | sentencepiece==0.1.99 393 | shapely==2.0.3 394 | six==1.16.0 395 | sklearn-pandas==2.2.0 396 | smart-open==6.4.0 397 | sniffio==1.3.1 398 | snowballstemmer==2.2.0 399 | sortedcontainers==2.4.0 400 | soundfile==0.12.1 401 | soupsieve==2.5 402 | soxr==0.3.7 403 | spacy==3.7.4 404 | spacy-legacy==3.0.12 405 | spacy-loggers==1.0.5 406 | Sphinx==5.0.2 407 | sphinxcontrib-applehelp==1.0.8 408 | sphinxcontrib-devhelp==1.0.6 409 | sphinxcontrib-htmlhelp==2.0.5 410 | sphinxcontrib-jsmath==1.0.1 411 | sphinxcontrib-qthelp==1.0.7 412 | sphinxcontrib-serializinghtml==1.1.10 413 | SQLAlchemy==2.0.28 414 | sqlglot==19.9.0 415 | sqlparse==0.4.4 416 | srsly==2.4.8 417 | stanio==0.3.0 418 | statsmodels==0.14.1 419 | sympy==1.12 420 | tables==3.8.0 421 | tabulate==0.9.0 422 | tbb==2021.11.0 423 | tblib==3.0.0 424 | tenacity==8.2.3 425 | tensorboard==2.15.2 426 | tensorboard-data-server==0.7.2 427 | tensorflow==2.15.0 428 | tensorflow-datasets==4.9.4 429 | tensorflow-estimator==2.15.0 430 | tensorflow-gcs-config==2.15.0 431 | tensorflow-hub==0.16.1 432 | tensorflow-io-gcs-filesystem==0.36.0 433 | tensorflow-metadata==1.14.0 434 | tensorflow-probability==0.23.0 435 | tensorflow-text==2.15.0 436 | tensorstore==0.1.45 437 | termcolor==2.4.0 438 | terminado==0.18.0 439 | text-unidecode==1.3 440 | textblob==0.17.1 441 | texttable==1.7.0 442 | tf-keras==2.15.0 443 | tf-slim==1.1.0 444 | thinc==8.2.3 445 | threadpoolctl==3.3.0 446 | tifffile==2024.2.12 447 | tinycss2==1.2.1 448 | 
tokenizers==0.15.2 449 | toml==0.10.2 450 | tomli==2.0.1 451 | toolz==0.12.1 452 | torch @ https://download.pytorch.org/whl/cu121/torch-2.1.0%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=0d4e8c52a1fcf5ed6cfc256d9a370fcf4360958fc79d0b08a51d55e70914df46 453 | torchaudio @ https://download.pytorch.org/whl/cu121/torchaudio-2.1.0%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=676bda4042734eda99bc59b2d7f761f345d3cde0cad492ad34e3aefde688c6d8 454 | torchdata==0.7.0 455 | torchsummary==1.5.1 456 | torchtext==0.16.0 457 | torchvision @ https://download.pytorch.org/whl/cu121/torchvision-0.16.0%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=e76e78d0ad43636c9884b3084ffaea8a8b61f21129fbfa456a5fe734f0affea9 458 | tornado==6.3.3 459 | tqdm==4.66.2 460 | traitlets==5.7.1 461 | traittypes==0.2.1 462 | transformers==4.38.2 463 | triton==2.1.0 464 | tweepy==4.14.0 465 | typer==0.9.0 466 | types-pytz==2024.1.0.20240203 467 | types-setuptools==69.1.0.20240302 468 | typing_extensions==4.10.0 469 | tzdata==2024.1 470 | tzlocal==5.2 471 | uc-micro-py==1.0.3 472 | uritemplate==4.1.1 473 | urllib3==2.0.7 474 | validators==0.22.0 475 | vega-datasets==0.9.0 476 | wadllib==1.3.6 477 | wasabi==1.1.2 478 | wcwidth==0.2.13 479 | weasel==0.3.4 480 | webcolors==1.13 481 | webencodings==0.5.1 482 | websocket-client==1.7.0 483 | Werkzeug==3.0.1 484 | widgetsnbextension==3.6.6 485 | wordcloud==1.9.3 486 | wrapt==1.14.1 487 | xarray==2023.7.0 488 | xarray-einstats==0.7.0 489 | xgboost==2.0.3 490 | xlrd==2.0.1 491 | xlwt==1.3.0 492 | xxhash==3.4.1 493 | xyzservices==2023.10.1 494 | yarl==1.9.4 495 | yellowbrick==1.5 496 | yfinance==0.2.37 497 | zict==3.0.0 498 | zipp==3.17.0 499 | -------------------------------------------------------------------------------- /time_series_automl/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.3 2 | aiosignal==1.3.1 3 | anyio==4.2.0 4 | appnope==0.1.4 5 | argon2-cffi==23.1.0 6 | argon2-cffi-bindings==21.2.0 7 | arrow==1.3.0 8 | asttokens==2.4.1 9 | async-lru==2.0.4 10 | attrs==23.2.0 11 | Babel==2.14.0 12 | beautifulsoup4==4.12.3 13 | bleach==6.1.0 14 | certifi==2024.2.2 15 | cffi==1.16.0 16 | chardet==5.2.0 17 | charset-normalizer==3.3.2 18 | cleanlab-studio==1.1.29 19 | click==8.1.3 20 | cloudpickle==3.0.0 21 | cmdstanpy==1.2.1 22 | colorama==0.4.6 23 | comm==0.2.1 24 | contourpy==1.2.0 25 | cycler==0.12.1 26 | debugpy==1.8.1 27 | decorator==5.1.1 28 | defusedxml==0.7.1 29 | et-xmlfile==1.1.0 30 | executing==2.0.1 31 | fastjsonschema==2.19.1 32 | fonttools==4.49.0 33 | fqdn==1.5.1 34 | frozenlist==1.4.1 35 | h11==0.14.0 36 | holidays==0.42 37 | httpcore==1.0.3 38 | httpx==0.26.0 39 | idna==3.6 40 | ijson==3.2.3 41 | importlib-resources==6.1.1 42 | ipykernel==6.29.2 43 | ipython==8.21.0 44 | isoduration==20.11.0 45 | jedi==0.19.1 46 | Jinja2==3.1.3 47 | joblib==1.3.2 48 | json5==0.9.14 49 | jsonpointer==2.4 50 | jsonschema==4.21.1 51 | jsonschema-specifications==2023.12.1 52 | jsonstreams==0.6.0 53 | jupyter-events==0.9.0 54 | jupyter-lsp==2.2.2 55 | jupyter_client==8.6.0 56 | jupyter_core==5.7.1 57 | jupyter_server==2.12.5 58 | jupyter_server_terminals==0.5.2 59 | jupyterlab==4.1.1 60 | jupyterlab_pygments==0.3.0 61 | jupyterlab_server==2.25.3 62 | kiwisolver==1.4.5 63 | llvmlite==0.42.0 64 | lml==0.1.0 65 | MarkupSafe==2.1.5 66 | matplotlib==3.8.3 67 | matplotlib-inline==0.1.6 68 | mistune==3.0.2 69 | multidict==6.0.5 70 | nbclient==0.9.0 71 | nbconvert==7.16.0 72 | nbformat==5.9.2 73 | nest-asyncio==1.6.0 74 | notebook_shim==0.2.4 
75 | numba==0.59.0 76 | numpy==1.26.4 77 | openpyxl==3.1.2 78 | overrides==7.7.0 79 | packaging==23.2 80 | pandas==2.1.4 81 | pandocfilters==1.5.1 82 | parso==0.8.3 83 | patsy==0.5.6 84 | pexpect==4.9.0 85 | pillow==10.2.0 86 | platformdirs==4.2.0 87 | prometheus_client==0.20.0 88 | prompt-toolkit==3.0.43 89 | prophet==1.1.5 90 | psutil==5.9.8 91 | ptyprocess==0.7.0 92 | pure-eval==0.2.2 93 | pycparser==2.21 94 | pyexcel==0.7.0 95 | pyexcel-io==0.6.6 96 | pyexcel-xls==0.7.0 97 | pyexcel-xlsx==0.6.0 98 | Pygments==2.17.2 99 | pyparsing==3.1.1 100 | python-dateutil==2.8.2 101 | python-json-logger==2.0.7 102 | pytz==2024.1 103 | PyYAML==6.0.1 104 | pyzmq==25.1.2 105 | referencing==0.33.0 106 | requests==2.31.0 107 | rfc3339-validator==0.1.4 108 | rfc3986-validator==0.1.1 109 | rpds-py==0.18.0 110 | scikit-base==0.7.2 111 | scikit-learn==1.4.1.post1 112 | scipy==1.12.0 113 | semver==2.13.0 114 | Send2Trash==1.8.2 115 | six==1.16.0 116 | sktime==0.26.0 117 | sniffio==1.3.0 118 | soupsieve==2.5 119 | stack-data==0.6.3 120 | stanio==0.3.0 121 | statsmodels==0.14.1 122 | stumpy==1.12.0 123 | terminado==0.18.0 124 | texttable==1.7.0 125 | threadpoolctl==3.3.0 126 | tinycss2==1.2.1 127 | tornado==6.4 128 | tqdm==4.66.2 129 | traitlets==5.14.1 130 | tsfel==0.1.6 131 | tsfresh==0.20.2 132 | types-python-dateutil==2.8.19.20240106 133 | typing_extensions==4.9.0 134 | tzdata==2024.1 135 | uri-template==1.3.0 136 | urllib3==2.2.1 137 | validators==0.22.0 138 | wcwidth==0.2.13 139 | webcolors==1.13 140 | webencodings==0.5.1 141 | websocket-client==1.7.0 142 | xlrd==2.0.1 143 | xlwt==1.3.0 144 | yarl==1.9.4 145 | -------------------------------------------------------------------------------- /tlm_call_api_directly/tlm_api_directly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# How to call the TLM REST API directly\n", 8 | "\n", 9 | "Although the Trustworthy Language Model officially offers a Python client library and can be used via OpenAI's Python client library, you can still use TLM with another programming language (eg. Typescript) by directly calling TLM's backend REST API.\n", 10 | "\n", 11 | "Here we demonstrate how to call the REST API using Python, just for reference. Our code here is simply making http requests, you can use any other programming language with http lib/tools by providing the necessary payload and headers." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 30, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Define the TLM API key, model, and quality preset\n", 21 | "# More details on models supported and quality presets can be found here: https://help.cleanlab.ai/reference/python/trustworthy_language_model/#class-tlmoptions\n", 22 | "API_KEY = ''\n", 23 | "MODEL = \"gpt-4o-mini\"\n", 24 | "QUALITY_PRESET = \"medium\"" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Make Prompt API request to TLM to get back a response and trustworthiness score\n", 32 | "\n", 33 | "Note: The `confidence_score` parameter returned in the REST API response is the same as `trustworthiness_score` returned by the Python client library.\n", 34 | "\n", 35 | "You can check out the API documentation for more details on inputs and outputs: https://help.cleanlab.ai/tlm/api/python/tlm/" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "vscode": { 42 | "languageId": "bat" 43 | } 44 | }, 45 | "source": [ 46 | "### JSON payload structure" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 7, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "{'task': 'default',\n", 58 | " 'quality': 'medium',\n", 59 | " 'prompt': \"What's the first month of the year?\",\n", 60 | " 'options': {'model': 'gpt-4o-mini', 'log': ['explanation']}}" 61 | ] 62 | }, 63 | "execution_count": 7, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "{\n", 70 | " \"task\": \"default\",\n", 71 | " \"quality\": \"medium\",\n", 72 | " \"prompt\": \"What's the first month of the year?\",\n", 73 | " \"options\": {\n", 74 | " \"model\": \"gpt-4o-mini\",\n", 75 | " \"log\": [\"explanation\"]\n", 76 | " }\n", 77 | "}" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### Inputs:\n", 85 | "- `prompt` (required): prompt (or list of prompts) for the TLM to evaluate, inclusive of the user's query and any system instructions.\n", 86 | "- `task` (optional): determines details of the algorithm used for scoring LLM response trustworthiness, i.e. `default`, `classification`, or `code_generation`.\n", 87 | "- `quality` (optional): controls the quality of TLM responses and trustworthiness scores vs. latency/costs.\n", 88 | "- `options` (optional):\n", 89 | " - `model` (optional): underlying base LLM to use (better models yield better results, faster models yield faster/cheaper results).\n", 90 | " - `log` (optional): optionally specify additional logs or metadata that TLM should return. For instance, include “explanation” here to get explanations of why a response is scored with low trustworthiness.\n", 91 | "\n", 92 | "\n", 93 | "See more [here](https://help.cleanlab.ai/tlm/api/python/tlm/)\n", 94 | "\n", 95 | "### Outputs:\n", 96 | "- `response`: The response from the model.\n", 97 | "- `confidence_score`: score between 0-1 corresponding to the trustworthiness of the response. 
A higher score indicates a higher confidence that the response is correct/good.\n", 98 | "- `explanation`: explanation of why a response is scored with low trustworthiness, if `log` includes `explanation`.\n", 99 | "\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 9, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "{'confidence_score': 0.9895625234542792,\n", 111 | " 'log': {'explanation': 'Did not find a reason to doubt trustworthiness.'},\n", 112 | " 'response': 'The first month of the year is January.'}" 113 | ] 114 | }, 115 | "execution_count": 9, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "import requests\n", 122 | "import json\n", 123 | "\n", 124 | "url = \"https://api.cleanlab.ai/api/v0/trustworthy_llm/prompt\"\n", 125 | "\n", 126 | "\n", 127 | "def make_prompt_api_request(prompt):\n", 128 | " payload = json.dumps({\n", 129 | " \"task\": \"default\",\n", 130 | " \"quality\": QUALITY_PRESET,\n", 131 | " \"prompt\": prompt,\n", 132 | " \"options\": {\n", 133 | " \"model\": MODEL,\n", 134 | " \"log\": [\"explanation\"]\n", 135 | " }\n", 136 | " })\n", 137 | " headers = {\n", 138 | " 'authorization': f'Bearer {API_KEY}',\n", 139 | " 'Content-Type': 'application/json'\n", 140 | " }\n", 141 | "\n", 142 | " response_json = requests.request(\"POST\", url, headers=headers, data=payload).json() \n", 143 | " # This field is not useful\n", 144 | " del response_json['deberta_success']\n", 145 | " return response_json\n", 146 | "\n", 147 | "make_prompt_api_request(\"What's the first month of the year?\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Make Trusworthiness Score API request to TLM to get back a trustworthiness score\n", 155 | "\n", 156 | "Note: The `confidence_score` parameter returned in the REST API response is the same as `trustworthiness_score` returned by the Python client library." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 30, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "{'confidence_score': 0.018654437417664177,\n", 168 | " 'log': {'explanation': \"The text expresses a strong positive sentiment towards the product, indicating love for it. Therefore, classifying it as negative is incorrect. A better response would be to classify it as positive. \\nThis response is untrustworthy due to lack of consistency in possible responses from the model. 
Here's one inconsistent alternate response that the model considered (which may not be accurate either): \\nPositive.\"}}" 169 | ] 170 | }, 171 | "execution_count": 30, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "url = \"https://api.cleanlab.ai/api/v0/trustworthy_llm/get_confidence_score\"\n", 178 | "\n", 179 | "def make_score_api_request(prompt, response):\n", 180 | " payload = json.dumps({\n", 181 | " \"task\": \"classification\",\n", 182 | " \"quality\": QUALITY_PRESET,\n", 183 | " \"prompt\": prompt,\n", 184 | " \"response\": response,\n", 185 | " \"options\": {\n", 186 | " \"model\": MODEL,\n", 187 | " \"log\": [\"explanation\"]\n", 188 | " }\n", 189 | " })\n", 190 | " headers = {\n", 191 | " 'authorization': f'Bearer {API_KEY}',\n", 192 | " 'Content-Type': 'application/json'\n", 193 | " }\n", 194 | "\n", 195 | " response_json = requests.request(\"POST\", url, headers=headers, data=payload).json()\n", 196 | " # This field is not useful\n", 197 | " del response_json['deberta_success']\n", 198 | " return response_json\n", 199 | "\n", 200 | "make_score_api_request(\"Classify this text as positive or negative: 'I love this product!'\", \"negative\")" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Make Trustworthy RAG API request to evaluate your RAG system" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "The following evals are the default, optimized evals Cleanlab uses to evaluate your RAG system. You can choose to use a subset of these evals or even define your own custom eval(s).\n", 215 | "\n", 216 | "More details on defining your own custom evals can be found here: https://help.cleanlab.ai/tlm/use-cases/tlm_rag/#custom-evals" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 53, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "DEFAULT_EVALS = [\n", 226 | " {\n", 227 | " \"name\": \"context_sufficiency\",\n", 228 | " \"criteria\": \"Determine if the Document contains 100% of the information needed to answer the Question. If any external knowledge or assumptions are required, it does not meet the criteria. Each Question component must have explicit support in the Document.\",\n", 229 | " \"query_identifier\": \"Question\",\n", 230 | " \"context_identifier\": \"Document\",\n", 231 | " \"response_identifier\": None,\n", 232 | " },\n", 233 | " {\n", 234 | " \"name\": \"response_groundedness\",\n", 235 | " \"criteria\": \"Review the Response to the Query and assess whether every factual claim in the Response is explicitly supported by the provided Context. A Response meets the criteria if all information is directly backed by evidence in the Context, without relying on assumptions, external knowledge, or unstated inferences. The focus is on whether the Response is fully grounded in the Context, rather than whether it fully addresses the Query. If any claim in the Response lacks direct support or introduces information not present in the Context, the Response is bad and does not meet the criteria.\",\n", 236 | " \"query_identifier\": \"Query\",\n", 237 | " \"context_identifier\": \"Context\",\n", 238 | " \"response_identifier\": \"Response\",\n", 239 | " },\n", 240 | " {\n", 241 | " \"name\": \"response_helpfulness\",\n", 242 | " \"criteria\": \"Assess whether the AI Assistant Response is a helpful answer to the User Query. 
A Response is considered helpful if it makes a genuine attempt to answer the question, even if the answer is incorrect or incomplete. Factual inaccuracies should not affect the assessment. The only thing that matters is whether the Assistant tries to answer the question. A Response is considered not helpful if it avoids answering the question. For example, by saying or implying things like \\\"I don't know\\\", \\\"Sorry\\\", \\\"No information available\\\", or any other form of refusal or deflection.\",\n", 243 | " \"query_identifier\": \"User Query\",\n", 244 | " \"context_identifier\": None,\n", 245 | " \"response_identifier\": \"AI Assistant Response\",\n", 246 | " },\n", 247 | " {\n", 248 | " \"name\": \"query_ease\",\n", 249 | " \"criteria\": \"Determine whether the above User Request appears simple and straightforward. A bad User Request will appear either: ambiguous in intent, complex, purposefully tricky, abnormal, or disgruntled. A good User Request is phrased clearly and expresses an achievable intent. Basic conversational and non-propositional statements are also considered good. Should an AI Assistant be able to properly answer the User Request, it is considered good. The AI Assistant handling this User Request has additional knowledge about: the user, domain-specific terms and abbreviations, and any necessary factual information. So a User Request missing information could still be good; vagueness due to undefined pronouns/terms or references to unknown context does not make a User Request bad.\",\n", 250 | " \"query_identifier\": \"User Request\",\n", 251 | " \"context_identifier\": None,\n", 252 | " \"response_identifier\": None,\n", 253 | " },\n", 254 | "]" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "### Inputs:\n", 262 | "- `context` (required): retrieved context for the given query\n", 263 | "- `query` (required): user's query, this is just the question itself\n", 264 | "- `response` (required): response from the LLM\n", 265 | "- `prompt` (required): final prompt used to generate the response, inclusive of the context, user's query and any system instructions, in the same format originally used to generate the response\n", 266 | "- `quality` (optional): controls the quality of TLM responses and trustworthiness scores vs. latency/costs.\n", 267 | "- `options` (optional):\n", 268 | " - `model` (optional): underlying base LLM to use (better models yield better results, faster models yield faster/cheaper results).\n", 269 | " - `log` (optional): optionally specify additional logs or metadata that TLM should return. For instance, include “explanation” here to get explanations of why a response is scored with low trustworthiness.\n", 270 | "\n", 271 | "### Outputs:\n", 272 | "- `trustworthiness`: \n", 273 | " - `score`: score between 0-1 corresponding to the trustworthiness of the response. A higher score indicates a higher confidence that the response is correct/good.\n", 274 | " - `explanation`: explanation of why a response is scored with low trustworthiness, if `log` includes `explanation`.\n", 275 | "- Selected `evals`:\n", 276 | " - `context_sufficiency`: score between 0-1 corresponding to the sufficiency of the context provided to the LLM. Evaluates whether the retrieved context contains sufficient information to answer the query. 
A low score indicates that key information is missing from the context (useful to diagnose search/retrieval failures or knowledge gaps).\n", 277 | " \n", 278 | " - `response_groundedness`: score between 0-1 corresponding to the groundedness of the response to the context. Evaluates whether claims/information stated in the response are explicitly supported by the provided context (useful to diagnose when your LLM is fabricating claims or relying on its internal world knowledge over the information retrieved from your knowledge base).\n", 279 | " - `response_helpfulness`: score between 0-1 corresponding to the helpfulness of the response to the user's query. Evaluates whether the response attempts to answer the user's query or instead abstain from answering (useful to detect responses unlikely to satisfy the user like generic fallbacks).\n", 280 | " - `query_ease`: score between 0-1 corresponding to the ease of the user's query. Evaluates whether the user query seems easy for an AI system to properly handle (useful to diagnose queries that are: complex, vague, tricky, or disgruntled-sounding).\n", 281 | "\n", 282 | "See more details [here](https://help.cleanlab.ai/tlm/use-cases/tlm_rag/).\n", 283 | "\n" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 57, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "{'context_sufficiency': {'score': 0.9975124377784942},\n", 295 | " 'query_ease': {'score': 0.9975124377422458},\n", 296 | " 'response_groundedness': {'score': 0.0024875622025382848},\n", 297 | " 'response_helpfulness': {'score': 0.00696355355661231},\n", 298 | " 'trustworthiness': {'log': {'explanation': \"Cannot verify that this response is correct.\\nThis response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): \\nThe first month of the year is January.\"},\n", 299 | " 'score': 0.0}}" 300 | ] 301 | }, 302 | "execution_count": 57, 303 | "metadata": {}, 304 | "output_type": "execute_result" 305 | } 306 | ], 307 | "source": [ 308 | "import requests\n", 309 | "import json\n", 310 | "\n", 311 | "url = \"https://api.cleanlab.ai/api/v1/rag_trustworthy_llm/score\"\n", 312 | "\n", 313 | "def make_score_rag_api_request(context, query, response, prompt):\n", 314 | " payload = json.dumps({ \n", 315 | " \"context\": context,\n", 316 | " \"query\": query,\n", 317 | " \"prompt\": prompt,\n", 318 | " \"response\": {\"response\": response},\n", 319 | " \"quality\": QUALITY_PRESET,\n", 320 | " \"options\": {\n", 321 | " \"model\": MODEL,\n", 322 | " \"log\": [\"explanation\"]\n", 323 | " },\n", 324 | " \"evals\": DEFAULT_EVALS\n", 325 | " })\n", 326 | " headers = {\n", 327 | " 'authorization': f'Bearer {API_KEY}',\n", 328 | " 'Content-Type': 'application/json'\n", 329 | " }\n", 330 | "\n", 331 | " response_json = requests.request(\"POST\", url, headers=headers, data=payload).json() \n", 332 | " del response_json['deberta_success']\n", 333 | " return response_json\n", 334 | "\n", 335 | "context = \"The first month of the year is January.\"\n", 336 | "query = \"What's the first month of the year?\"\n", 337 | "response = \"February\"\n", 338 | "prompt = f\"Given the context provided, answer the question. 
Context: {context} Question: {query} Response:\"\n", 339 | "\n", 340 | "make_score_rag_api_request(context, query, response, prompt)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "## Make Trustworthy RAG API request to generate a response for and evaluate your RAG system" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 58, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "{'context_sufficiency': {'score': 0.9975124377990071},\n", 359 | " 'query_ease': {'score': 0.9975124377422458},\n", 360 | " 'response': 'The first month of the year is January.',\n", 361 | " 'response_groundedness': {'score': 0.9975124378109894},\n", 362 | " 'response_helpfulness': {'score': 0.9975124378110278},\n", 363 | " 'trustworthiness': {'log': {'explanation': 'Did not find a reason to doubt trustworthiness.'},\n", 364 | " 'score': 0.9998661362818028}}" 365 | ] 366 | }, 367 | "execution_count": 58, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "import requests\n", 374 | "import json\n", 375 | "\n", 376 | "url = \"https://api.cleanlab.ai/api/v1/rag_trustworthy_llm/generate\"\n", 377 | "\n", 378 | "def make_generate_rag_api_request(context, query, prompt):\n", 379 | " payload = json.dumps({ \n", 380 | " \"context\": context,\n", 381 | " \"query\": query,\n", 382 | " \"prompt\": prompt,\n", 383 | " \"quality\": QUALITY_PRESET,\n", 384 | " \"options\": {\n", 385 | " \"model\": MODEL,\n", 386 | " \"log\": [\"explanation\"]\n", 387 | " },\n", 388 | " \"evals\": DEFAULT_EVALS\n", 389 | " })\n", 390 | " headers = {\n", 391 | " 'authorization': f'Bearer {API_KEY}',\n", 392 | " 'Content-Type': 'application/json'\n", 393 | " }\n", 394 | "\n", 395 | " response_json = requests.request(\"POST\", url, headers=headers, data=payload).json() \n", 396 | " del response_json['deberta_success']\n", 397 | " return response_json\n", 398 | "\n", 399 | "context = \"The first month of the year is January.\"\n", 400 | "query = \"What's the first month of the year?\"\n", 401 | "prompt = f\"Given the context provided, answer the question. Context: {context} Question: {query} Response:\"\n", 402 | "\n", 403 | "make_generate_rag_api_request(context, query, prompt)" 404 | ] 405 | } 406 | ], 407 | "metadata": { 408 | "kernelspec": { 409 | "display_name": ".env", 410 | "language": "python", 411 | "name": "python3" 412 | }, 413 | "language_info": { 414 | "codemirror_mode": { 415 | "name": "ipython", 416 | "version": 3 417 | }, 418 | "file_extension": ".py", 419 | "mimetype": "text/x-python", 420 | "name": "python", 421 | "nbconvert_exporter": "python", 422 | "pygments_lexer": "ipython3", 423 | "version": "3.12.0" 424 | } 425 | }, 426 | "nbformat": 4, 427 | "nbformat_minor": 2 428 | } 429 | --------------------------------------------------------------------------------