"""Extract structured data from a scraped web page with OpenAI Structured Outputs.

The script scrapes https://mendable.ai with Firecrawl, then asks an OpenAI
chat model to return the page's headline and description as a JSON object
that conforms to a strict JSON schema, and prints that JSON string.

Setup:
    pip install firecrawl openai
    Set the FIRECRAWL_API_KEY and OPENAI_API_KEY environment variables.

Learn more about using OpenAI's Structured Outputs (from the project README):
    "Using Structured Output and JSON Strict Mode with OpenAI"
    https://www.firecrawl.dev/blog/using-structured-output-and-json-strict-mode-openai

    The article covers the benefits of using structured outputs, how to
    implement JSON Strict Mode with OpenAI's models, and best practices for
    combining web scraping with AI-powered data extraction.
"""

import os

from firecrawl import FirecrawlApp
from openai import OpenAI

# Initialize the FirecrawlApp with your API key.
# A missing environment variable raises KeyError here, before any request.
firecrawl_app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])

# Scrape data from mendable.ai
url = 'https://mendable.ai'
scraped_data = firecrawl_app.scrape_url(url)

# Initialize OpenAI client
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY']
)

# Define the OpenAI API request
# NOTE(review): the 'content' key depends on the installed Firecrawl SDK
# version (other versions may expose 'markdown' / 'html' instead) — confirm
# against the SDK release this example is pinned to.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that extracts structured data from web pages."
    },
    {
        "role": "user",
        "content": f"Extract the headline and description from the following HTML content: {scraped_data['content']}"
    }
]

# JSON schema the model's reply must follow.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "extracted_data",
        # `strict` defaults to false in the API; it must be enabled explicitly
        # for the model output to be guaranteed to match the schema below.
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "headline": {
                    "type": "string"
                },
                "description": {
                    "type": "string"
                }
            },
            # Strict mode requires every property to be listed as required
            # and additionalProperties to be False.
            "required": ["headline", "description"],
            "additionalProperties": False
        }
    }
}

# Call the OpenAI API to extract structured data
chat_completion = client.chat.completions.create(
    model="gpt-4o-mini-2024-07-18",
    messages=messages,
    response_format=response_format
)

# Access the message of the first choice in the response
message = chat_completion.choices[0].message

# With Structured Outputs the model can decline the request; the reason is
# then reported in `refusal` rather than as schema-conforming content.
# getattr keeps this working on SDK versions whose message type has no
# `refusal` attribute.
refusal = getattr(message, "refusal", None)
if refusal:
    raise SystemExit(f"Model refused to extract data: {refusal}")

# Extracted data: a JSON string with "headline" and "description" keys
extracted_data = message.content

# Print the extracted data
print(extracted_data)