├── .gitignore
├── .env_example
├── requirements.txt
├── README.md
└── app.py

/.gitignore:
--------------------------------------------------------------------------------
main.py
venv/
.env
--------------------------------------------------------------------------------
/.env_example:
--------------------------------------------------------------------------------
OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # Replace with your OpenAI API key
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.8.4
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
beautifulsoup4==4.12.2
Brotli==1.0.9
cachetools==5.3.0
certifi==2022.12.7
charset-normalizer==3.1.0
dataclasses-json==0.5.7
faiss-cpu==1.7.3
frozenlist==1.3.3
gptcache==0.1.11
greenlet==2.0.2
idna==3.4
langchain==0.0.141
marshmallow==3.19.0
marshmallow-enum==1.5.1
multidict==6.0.4
mypy-extensions==1.0.0
numpy==1.24.2
openai==0.27.4
openapi-schema-pydantic==1.2.4
packaging==23.1
pydantic==1.10.7
PyPDF2==3.0.1
python-dotenv==1.0.0
PyYAML==6.0
regex==2023.3.23
requests==2.28.2
soupsieve==2.4
SQLAlchemy==1.4.47
tenacity==8.2.2
tiktoken==0.3.3
tqdm==4.65.0
typing-inspect==0.8.0
typing_extensions==4.5.0
urllib3==1.26.15
yarl==1.8.2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
LLM Data Parser
==================

This proof-of-concept script demonstrates how to use a language model (LLM) such as GPT-3 or GPT-4 to find and extract meaningful data from raw HTML without extensive HTML parsing; in this case, it extracts data related to product stock and availability from a list of web pages. It uses the LangChain library to achieve this.

Installation
------------

To run this script, you need Python 3.11.3 and the following third-party libraries (`json` and `time` are part of the standard library):

* requests
* openai
* beautifulsoup4
* langchain
* python-dotenv
* faiss-cpu

The recommended approach is to use a Python virtual environment:

```bash
python3 -m venv venv
. venv/bin/activate
```

Then, install the required libraries using pip:

`pip install -r requirements.txt`

Additionally, you need an OpenAI API key, which can be obtained from the OpenAI website. Copy `.env_example` to `.env` and set `OPENAI_API_KEY` there; the script loads it via `python-dotenv`.

Usage
-----

To use this script, run `python app.py`. The script will scrape product links from the target website (rpilocator.com) and then use the LLM to query each product's availability.

The results are printed to the console, one line per product.
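Each result line follows the response format requested in the prompt inside `app.py`; the product name and quantity below are purely illustrative:

```
Product Name: Raspberry Pi 4 model B WiFi DualBand Bluetooth 2GB RAM 1,5GHz Availability: Yes, 10
```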
As an alternative to OpenAI, you can use GPT4All, a free and open-source model, with the same LangChain library. For more information on using GPT4All with LangChain, refer to the following link:

[https://python.langchain.com/en/latest/modules/models/llms/integrations/gpt4all.html](https://python.langchain.com/en/latest/modules/models/llms/integrations/gpt4all.html)
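A minimal sketch of that swap, assuming the `GPT4All` wrapper shipped with this LangChain version and a model file you have already downloaded (the path below is hypothetical):

```python
# Minimal sketch (not part of app.py): replace the OpenAI LLM used in
# get_stock_info() with a local GPT4All model. The model path is
# hypothetical; download a compatible model file first (see the docs above).
from langchain.llms import GPT4All
from langchain.chains.question_answering import load_qa_chain

llm = GPT4All(model="./models/gpt4all-lora-quantized.bin")  # hypothetical path
chain = load_qa_chain(llm, chain_type="stuff")
```

Note that `get_stock_info()` would still call the OpenAI API for `OpenAIEmbeddings`; a fully offline run would also need a local embedding model.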
For a detailed explanation of the code, you can visit the following resources:

* Colab Notebook: [https://colab.research.google.com/drive/181BSOH6KF_1o2lFG8DQ6eJd2MZyiSBNt?usp=sharing#scrollTo=3mtAth2jXNKO](https://colab.research.google.com/drive/181BSOH6KF_1o2lFG8DQ6eJd2MZyiSBNt?usp=sharing#scrollTo=3mtAth2jXNKO)
* YouTube Video: [https://www.youtube.com/watch?v=TLf90ipMzfE](https://www.youtube.com/watch?v=TLf90ipMzfE)

Credits
-------

This script was extended by repollo from the original code by Prompt Engineering, as shown in this YouTube video: [https://www.youtube.com/watch?v=TLf90ipMzfE](https://www.youtube.com/watch?v=TLf90ipMzfE)
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import os
import requests
import json
import bs4
import time
from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI as OpenAILLM
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")


# Fetch the JSON product table from rpilocator.com.
def get_data():
    current_time_milliseconds = int(time.time() * 1000)
    url = "https://rpilocator.com/data.cfm"
    querystring = {"method": "getProductTable", "token": "9pxkgncqyjpsfg2v6ekmsbkw1pfwf0929skq28si", "_": current_time_milliseconds}
    # Request headers copied from a browser session; the token and cookie
    # values may need refreshing if the site rejects the request.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Sec-Fetch-Site": "same-origin",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Sec-Fetch-Mode": "cors",
        "Host": "rpilocator.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4.1 Safari/605.1.15",
        "Connection": "keep-alive",
        "Referer": "https://rpilocator.com/",
        "Cookie": "_ga_JWVD0LRP64=GS1.1.1681654548.5.0.1681654548.0.0.0; _ga=GA1.1.136564858.1681170765; RPILOCATOR=0; CFID=f3a9a0f9-d1de-4533-a1e6-c75fce5e177e; CFTOKEN=0; cfid=f3a9a0f9-d1de-4533-a1e6-c75fce5e177e; cftoken=0",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.get(url, headers=headers, params=querystring)
    return response.content


# Extract all the product links from the JSON data.
def get_price_links(data):
    json_data = json.loads(data)
    data_list = json_data["data"]
    return [entry["link"] for entry in data_list]


# Ask the LLM for stock info from a single product link.
def get_stock_info(link):
    response = requests.get(link)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    body = soup.find("body")

    # Split the page body into overlapping 1000-character chunks, embed them,
    # and index them in an in-memory FAISS store for similarity search.
    text_splitter = CharacterTextSplitter(separator="", chunk_size=1000, chunk_overlap=200, length_function=len)
    texts = text_splitter.split_text(str(body))

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    docsearch = FAISS.from_texts(texts, embeddings)

    # "stuff" chain: the retrieved chunks are stuffed into a single prompt.
    chain = load_qa_chain(OpenAILLM(openai_api_key=openai_api_key), chain_type="stuff")
    query = "Is the main product on the website in stock and how many are in stock? Only respond with the following format: `Product Name: Raspberry Pi 4 model B WiFi DualBand Bluetooth 2GB RAM 1,5GHz Availability: Yes, 10 or No, 0.` Do not include the backticks in your response and do not deviate from the given format even if you don't know the answer; put 0 when the product is not in stock, and put Unknown when it is in stock but no quantity is given."
    docs = docsearch.similarity_search(query)
    answer = chain.run(input_documents=docs, question=query)
    print(answer)


def main():
    data = get_data()
    price_links = get_price_links(data)
    print(len(price_links), "price links found")

    for link in price_links:
        get_stock_info(link)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------