├── .DS_Store ├── get_training_models.py ├── get_text_embeddings.py ├── LICENSE ├── create_fine_tuning_job.py ├── chat.py ├── get_context.py ├── README.md ├── save_to_dynamo.py ├── pineconeClient.py ├── requirements.txt ├── dynamo └── dynamo_manager.py ├── .gitignore ├── create_training_data.py └── scraper.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Eversmile12/Mihir/HEAD/.DS_Store -------------------------------------------------------------------------------- /get_training_models.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | 4 | # Set your OpenAI API key as an environment variable 5 | openai_api_key = os.environ['OPEN_AI'] 6 | 7 | # Set the API endpoint URL 8 | url = 'https://api.openai.com/v1/fine-tunes' 9 | 10 | # Set the headers, including the authorization token 11 | headers = { 12 | 'Authorization': f'Bearer {openai_api_key}', 13 | } 14 | 15 | # Make the API request and print the response to the console 16 | response = requests.get(url, headers=headers) 17 | print(response.json()) 18 | -------------------------------------------------------------------------------- /get_text_embeddings.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import os 4 | 5 | def get_text_embeddings(text): 6 | url = "https://api.openai.com/v1/embeddings" 7 | headers = { 8 | "Authorization": f"Bearer {os.environ['OPEN_AI']}", 9 | "Content-Type": "application/json" 10 | } 11 | payload = { 12 | "input": text, 13 | "model": "text-embedding-ada-002" 14 | } 15 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 16 | if response.status_code != 200: 17 | raise ValueError(f"Failed to get embeddings for text: {response.text}") 18 | embeddings = response.json()["data"][0]["embedding"] 19 | return embeddings 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 VItto Rivabella 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /create_fine_tuning_job.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import openai 4 | import os 5 | openai.api_key = os.environ["OPEN_AI"] 6 | 7 | 8 | def upload_jsonl_file( file_path): 9 | url = "https://api.openai.com/v1/files" 10 | headers = { 11 | "Authorization": f"Bearer {openai.api_key}" 12 | } 13 | data = { 14 | "purpose": "fine-tune", 15 | } 16 | with open(file_path, "rb") as file: 17 | response = requests.post(url, headers=headers, 18 | data=data, files={"file": file}) 19 | response_json = response.json() 20 | file_id = response_json["id"] 21 | print(f"File uploaded with ID: {file_id}") 22 | return file_id 23 | 24 | 25 | def create_fine_tune_model(id): 26 | url = "https://api.openai.com/v1/fine-tunes" 27 | headers = { 28 | "Content-Type": "application/json", 29 | "Authorization": f"Bearer {openai.api_key}" 30 | } 31 | data = { 32 | "training_file": id, 33 | "model": "curie", 34 | "batch_size": 512 35 | } 36 | response = requests.post(url, headers=headers, data=json.dumps(data)) 37 | if response.ok: 38 | print("Fine tune model created successfully") 39 | return response.json()["id"] 40 | else: 41 | raise Exception(f"Failed to create fine tune model: {response.text}") 42 | -------------------------------------------------------------------------------- /chat.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | from get_context import get_context 4 | openai.api_key = os.environ["OPEN_AI"] 5 | system_prompt = "You are a very enthusiastic Alchemy representative named Mihir who loves to help people! Given the following Context section provided from the Alchemy documentation and the Previous messages you've sent, answer the Question using only that information. Output must be in markdown format. If you are unsure and the answer is not explicitly written in the documentation, output must only be {success:false}. Include links to relevant docs pages when possible." 
6 | latest_responses = []
7 | 
8 | while True:
9 |     user_input = input("> ")
10 |     if user_input == "\x1b":  # check for escape key
11 |         break
12 | 
13 |     context = get_context(user_input)
14 |     prompt = f" \n\n Context sections:\n{context} \n\n Question '''{user_input}'''"
15 |     merged_latest_responses = '\n'.join(latest_responses)
16 | 
17 |     messages = [
18 |         {"role": "system", "content": system_prompt},
19 |         {"role": "user",
20 |          "content": f"Previous messages: {merged_latest_responses}"
21 |          },
22 |         {"role": "user", "content": prompt},
23 |     ]
24 |     # print(messages)
25 |     # print("\n\n")
26 |     response = openai.ChatCompletion.create(
27 |         model="gpt-3.5-turbo",
28 |         messages=messages,
29 |         temperature=0.3,
30 |     )
31 |     print(response.choices[0]["message"]["content"])
32 |     latest_responses.append(response.choices[0]["message"]["content"])
33 | 
--------------------------------------------------------------------------------
/get_context.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | from pineconeClient import PineconeClient
4 | from get_text_embeddings import get_text_embeddings
5 | import html
6 | import os
7 | 
8 | def get_moderation_flagged(input_text, model="text-moderation-latest"):
9 |     url = "https://api.openai.com/v1/moderations"
10 |     headers = {
11 |         "Content-Type": "application/json",
12 |         "Authorization": f"Bearer {os.environ['OPEN_AI']}",
13 |     }
14 |     payload = {
15 |         "input": input_text,
16 |         "model": model
17 |     }
18 |     response = requests.post(url, headers=headers, data=json.dumps(payload))
19 |     if response.status_code != 200:
20 |         raise ValueError(f"Failed to get moderation score: {response.text}")
21 |     flagged = response.json()["results"][0]["flagged"]
22 |     return flagged
23 | 
24 | 
25 | def sanitize_input(user_input):
26 |     # remove leading/trailing whitespace
27 |     user_input = user_input.strip()
28 |     # replace potentially harmful characters with HTML entities
29 |     user_input = user_input.replace('<', '&lt;').replace('>', '&gt;')
30 |     user_input = user_input.replace('"', '&quot;').replace("'", '&#39;')
31 |     # escape special characters
32 |     user_input = html.escape(user_input)
33 |     return user_input
34 | 
35 | 
36 | def get_context(text):
37 |     text = sanitize_input(text)
38 |     isFlagged = get_moderation_flagged(text)
39 |     if not isFlagged:
40 |         embeddings = get_text_embeddings(text)
41 |         client = PineconeClient()
42 |         top_neighbours = client.query("my-index", embeddings)
43 |         context = ""
44 |         for neighbour in top_neighbours:
45 |             context += "\n" + neighbour["metadata"]["message"]
46 | 
47 |         return context
48 |     else:
49 |         return ""  # flagged input: hand back an empty context instead of None
50 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python Chatbot with GPT-3.5
2 | 
3 | 👋 Hey there! Welcome to this awesome repository, which contains code for spinning up a personal chatbot powered by GPT-3.5 and grounded in any documentation you point it at. And guess what? 90% of the code has been developed using ChatGPT, a large language model built on the GPT-3.5 architecture! 💻
4 | 
5 | So, are you ready to create your own personal chatbot and start chatting with it? Here are the steps you need to follow:
6 | 
7 | ## Getting Started
8 | 
9 | 1. Clone the repository:
10 | 
11 | ```
12 | git clone https://github.com/Eversmile12/Mihir
13 | ```
14 | 
15 | 2. Create a new virtual environment and activate it:
16 | 
17 | ```
18 | python3 -m venv env
19 | source env/bin/activate
20 | ```
21 | 
22 | 3. Install the requirements:
23 | 
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 | 
28 | 4. In the `scraper.py` file, change the documentation URL and the maximum number of pages passed to `findFirstPage` and `multiProcessPages` in the `__main__` block at the bottom of the file:
29 | 
30 | ```python
31 | if __name__ == "__main__":
32 |     links = findFirstPage("https://your_documentation_website.com")
33 |     visited = multiProcessPages(links, 15, "https://your_documentation_website.com/")
34 |     save_text_csv(visited)
35 |     page_text_to_embeddings("page_text.csv")
36 | ```
37 | 
38 | 5. Set up the environment variables for Pinecone and OpenAI: the scripts read the API keys from `PINECONE` and `OPEN_AI` (the optional DynamoDB helper additionally expects `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_REGION`). You can find guides on how to get your API keys here: [Pinecone](https://www.pinecone.io/docs/quickstart/) and [OpenAI](https://beta.openai.com/docs/quickstart).
39 | 
40 | 6. Once the two API keys are set, run `scraper.py`:
41 | 
42 | ```
43 | python3 scraper.py
44 | ```
45 | 
46 | 7. Wait for the process to complete.
47 | 
48 | 8. Run the `chat.py` file:
49 | 
50 | ```
51 | python3 chat.py
52 | ```
53 | 
54 | 9. And voila! You can now start chatting with your personal chatbot and ask it anything you want. It's like having your own personal assistant! 🤖💬
55 | 
56 | ## Badges
57 | 
58 | [![Placeholder](https://img.shields.io/badge/repository-placeholder-green.svg)](https://github.com/your_username/your_repo)
59 | 
60 | Hope you enjoy using this repository as much as we enjoyed creating it. Happy chatting! 😄
61 | 
--------------------------------------------------------------------------------
/save_to_dynamo.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import math
3 | from multiprocessing import Pool
4 | from dynamo.dynamo_manager import DynamoManager
5 | 
6 | nlp = spacy.load("en_core_web_sm")
7 | dynamo_manager = DynamoManager("Alchemy")
8 | 
9 | def process_link(link_text_tuple):
10 |     link, text = link_text_tuple
11 |     if text is not None:
12 |         print(f"Processing text from {link}")
13 |         text = text.replace("\t", " ")
14 |         doc = nlp(text)
15 |         max_chunk_tokens = 300
16 |         chunks = []
17 |         chunk = []
18 |         num_tokens = 0
19 |         for sent in doc.sents:
20 |             sent_tokens = len(sent)
21 |             if num_tokens + sent_tokens > max_chunk_tokens and len(chunk) > 0:
22 |                 chunks.append(" ".join(chunk))
23 |                 chunk = []
24 |                 num_tokens = 0
25 |             chunk.append(sent.text)
26 |             num_tokens += sent_tokens
27 |         if len(chunk) > 0:
28 |             chunks.append(" ".join(chunk))
29 |         # Create the list of items to be written to DynamoDB
30 |         items = []
31 |         for i, chunk_text in enumerate(chunks):
32 |             item = {
33 |                 "id": f"{link}_{i+1}",
34 |                 "page_url": link,
35 |                 "page_text": chunk_text,
36 |             }
37 |             items.append(item)
38 |         print(f"Processed text from {link}")
39 |         return items
40 |     else:
41 |         print(f"Skipping {link} due to empty text")
42 |         return []
43 | 
44 | def save_text_dynamodb(link_text_tuples):
45 |     num_processes = 4  # Number of worker processes to use
46 |     chunksize = max(1, math.ceil(len(link_text_tuples) / num_processes))
47 |     with Pool(processes=num_processes) as pool:
48 |         # process_link handles a single (link, text) tuple; chunksize only
49 |         # controls how many tuples each worker receives per task
50 |         results = pool.map(process_link, link_text_tuples, chunksize=chunksize)
51 |     # Combine the per-page results into a single list of items
52 |     items = [item for sublist in results for item in sublist]
53 |     # Write the items to DynamoDB
54 |     for item in items:
55 |         dynamo_manager.put_item(item)
56 |     print("Saved all text to DynamoDB")
57 | 
--------------------------------------------------------------------------------
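Note: `save_to_dynamo.py` is not wired into the `scraper.py` pipeline above, which writes its chunks to `page_text.csv` and Pinecone instead. Below is a minimal sketch of how it could be driven, assuming the environment variables the code reads (`OPEN_AI`, plus `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_REGION` for `dynamo/dynamo_manager.py`) are set; the wiring and the five-page cap are illustrative, not part of the repository.

```python
# Hypothetical wiring (not in the repo): push scraped pages into DynamoDB
# instead of the CSV/Pinecone path used by scraper.py's __main__ block.
from scraper import findFirstPage, scrape_text   # importing scraper loads spaCy
from save_to_dynamo import save_text_dynamodb    # importing this touches DynamoDB

if __name__ == "__main__":
    links = findFirstPage("https://docs.alchemy.com/")
    # scrape_text returns a (page_url, page_text) tuple, which is the shape
    # process_link in save_to_dynamo.py expects to unpack.
    pages = [scrape_text(link) for link in links[:5]]   # 5 pages is an arbitrary cap
    save_text_dynamodb(pages)
```

--------------------------------------------------------------------------------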
/pineconeClient.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import pinecone 3 | import os 4 | 5 | 6 | class PineconeClient: 7 | def __init__(self): 8 | api_key = os.environ["PINECONE"] 9 | if api_key is None: 10 | raise ValueError( 11 | "Pinecone API key not found in environment variables") 12 | pinecone.init(api_key, environment="us-west1-gcp-free") 13 | 14 | def create_index(self, index_name): 15 | if index_name not in pinecone.list_indexes(): 16 | pinecone.create_index( 17 | index_name, dimension=1536, metric="cosine") 18 | 19 | def chunks(_, iterable, batch_size=100): 20 | """A helper function to break an iterable into chunks of size batch_size.""" 21 | it = iter(iterable) 22 | chunk = tuple(itertools.islice(it, batch_size)) 23 | while chunk: 24 | yield chunk 25 | chunk = tuple(itertools.islice(it, batch_size)) 26 | 27 | # def check_index_present(index_name): 28 | # active_indexes = pinecone.list_indexes() 29 | # if index_name in active_indexes: 30 | # return True 31 | # else: 32 | # return False 33 | 34 | def upsert_in_chunks(self, index_name, ids_vectors): 35 | # Upsert data with 100 vectors per upsert request 36 | responses = [] 37 | print(ids_vectors) 38 | for ids_vectors_chunk in self.chunks(ids_vectors): 39 | upsert_response = pinecone.Index( 40 | index_name).upsert(vectors=ids_vectors_chunk) 41 | if upsert_response: 42 | responses.append(upsert_response) 43 | 44 | return responses 45 | 46 | def upsert_standard(self, index_name, ids_vectors): 47 | upsert_response = pinecone.Index(index_name).upsert( 48 | vectors=ids_vectors) 49 | return upsert_response 50 | 51 | def upsert(self, index_name, ids_vectors): 52 | if len(ids_vectors) >= 100: 53 | print("data too big, dividing in chunks") 54 | upsert_response = self.upsert_in_chunks(index_name, ids_vectors) 55 | else: 56 | print("upserting data without chunking") 57 | 58 | upsert_response = self.upsert_standard(index_name, ids_vectors) 59 | return upsert_response 60 | 61 | def query(_, index_name, vector): 62 | index = pinecone.Index(index_name) 63 | top_neighbours = index.query( 64 | top_k=3, 65 | include_metadata=True, 66 | vector=vector, 67 | ) 68 | if not top_neighbours: 69 | raise ValueError("No neighbors found") 70 | 71 | return top_neighbours["matches"] 72 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-generator==1.10 4 | async-timeout==4.0.2 5 | asynctest==0.13.0 6 | attrs==22.2.0 7 | autopep8==2.0.2 8 | beautifulsoup4==4.12.1 9 | black==23.3.0 10 | blis==0.7.9 11 | boto3==1.26.124 12 | botocore==1.29.124 13 | cachetools==5.3.0 14 | catalogue==2.0.8 15 | certifi==2022.12.7 16 | cfgv==3.3.1 17 | chardet==5.1.0 18 | charset-normalizer==3.1.0 19 | click==8.1.3 20 | colorama==0.4.6 21 | confection==0.0.4 22 | coverage==7.2.3 23 | crytic-compile==0.2.4 24 | cssselect==1.2.0 25 | cymem==2.0.7 26 | distlib==0.3.5 27 | dnspython==2.3.0 28 | docker==6.0.1 29 | duckduckgo-search==2.8.5 30 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl 31 | et-xmlfile==1.1.0 32 | exceptiongroup==1.1.1 33 | filelock==3.8.0 34 | flake8==6.0.0 35 | frozenlist==1.3.3 36 | gitdb==4.0.10 37 | GitPython==3.1.31 38 | google-api-core==2.11.0 39 | google-api-python-client==2.84.0 40 | google-auth==2.17.2 41 | 
google-auth-httplib2==0.1.0 42 | googleapis-common-protos==1.59.0 43 | gTTS==2.3.1 44 | h11==0.14.0 45 | httplib2==0.22.0 46 | identify==2.5.22 47 | idna==3.4 48 | iniconfig==2.0.0 49 | isort==5.12.0 50 | Jinja2==3.1.2 51 | jmespath==1.0.1 52 | joblib==1.2.0 53 | jsonschema==4.17.3 54 | langcodes==3.3.0 55 | loguru==0.7.0 56 | lxml==4.9.2 57 | MarkupSafe==2.1.2 58 | mccabe==0.7.0 59 | multidict==6.0.4 60 | murmurhash==1.0.9 61 | mypy-extensions==1.0.0 62 | nltk==3.8.1 63 | nodeenv==1.7.0 64 | numpy==1.24.2 65 | oauthlib==3.2.2 66 | openai==0.27.2 67 | openpyxl==3.1.2 68 | orjson==3.8.10 69 | outcome==1.2.0 70 | packaging==23.0 71 | panda==0.3.1 72 | pandas==2.0.1 73 | pandas-stubs==2.0.0.230412 74 | pathspec==0.11.1 75 | pathy==0.10.1 76 | Pillow==9.5.0 77 | pinecone-client==2.2.1 78 | platformdirs==2.5.2 79 | playsound==1.2.2 80 | pluggy==1.0.0 81 | pre-commit==3.2.2 82 | preshed==3.0.8 83 | prettytable==3.4.1 84 | protobuf==4.22.1 85 | py-cpuinfo==9.0.0 86 | pyasn1==0.4.8 87 | pyasn1-modules==0.2.8 88 | pycodestyle==2.10.0 89 | pydantic==1.10.7 90 | pyflakes==3.0.1 91 | pyparsing==3.0.9 92 | pyrsistent==0.19.3 93 | pysha3==1.0.2 94 | PySocks==1.7.1 95 | pytest==7.3.1 96 | pytest-asyncio==0.21.0 97 | pytest-benchmark==4.0.0 98 | pytest-cov==4.0.0 99 | pytest-integration==0.2.3 100 | pytest-mock==3.10.0 101 | python-dateutil==2.8.2 102 | python-dotenv==1.0.0 103 | pytz==2023.3 104 | PyYAML==6.0 105 | readability-lxml==0.8.1 106 | redis==4.5.4 107 | regex==2023.3.23 108 | requests==2.28.2 109 | requests-oauthlib==1.3.1 110 | rsa==4.9 111 | s3transfer==0.6.0 112 | selenium==4.8.3 113 | six==1.16.0 114 | slither-analyzer==0.9.0 115 | smart-open==6.3.0 116 | smmap==5.0.0 117 | sniffio==1.3.0 118 | sortedcontainers==2.4.0 119 | soupsieve==2.4 120 | sourcery==1.2.0 121 | spacy==3.5.2 122 | spacy-legacy==3.0.12 123 | spacy-loggers==1.0.4 124 | srsly==2.4.6 125 | thinc==8.1.9 126 | tiktoken==0.3.3 127 | tomli==2.0.1 128 | tqdm==4.65.0 129 | trio==0.22.0 130 | trio-websocket==0.10.2 131 | tweepy==4.13.0 132 | typer==0.7.0 133 | types-pytz==2023.3.0.0 134 | typing_extensions==4.5.0 135 | tzdata==2023.3 136 | uritemplate==4.1.1 137 | urllib3==1.26.15 138 | virtualenv==20.16.3 139 | wasabi==1.1.1 140 | wcwidth==0.2.5 141 | webdriver-manager==3.8.6 142 | websocket-client==1.5.1 143 | wsproto==1.2.0 144 | yarl==1.8.2 145 | -------------------------------------------------------------------------------- /dynamo/dynamo_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | 4 | class DynamoManager: 5 | def __init__(self, table_name): 6 | self.access_key = os.environ.get("AWS_ACCESS_KEY_ID") 7 | self.secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY") 8 | self.region_name = os.environ.get("AWS_REGION") 9 | self.session = boto3.Session( 10 | aws_access_key_id=self.access_key, 11 | aws_secret_access_key=self.secret_key, 12 | region_name=self.region_name 13 | ) 14 | self.dynamodb = self.session.client('dynamodb') 15 | self.table_name = table_name 16 | self.table = self.retrieve_or_create_table() 17 | 18 | def retrieve_or_create_table(self): 19 | try: 20 | table = self.dynamodb.describe_table(TableName=self.table_name)['Table'] 21 | print(f"Table '{self.table_name}' found") 22 | except self.dynamodb.exceptions.ResourceNotFoundException: 23 | print(f"Table '{self.table_name}' does not exist, creating it...") 24 | table = self.dynamodb.create_table( 25 | TableName=self.table_name, 26 | KeySchema=[ 27 | { 28 | 'AttributeName': 'id', 29 | 
                    'KeyType': 'HASH'
30 |                 }
31 |             ],
32 |             # Only attributes that appear in a key schema (the table key or a GSI
33 |             # key) may be declared here; declaring 'page_text' too makes CreateTable fail
34 |             AttributeDefinitions=[
35 |                 {
36 |                     'AttributeName': 'id',
37 |                     'AttributeType': 'S'
38 |                 },
39 |                 {
40 |                     'AttributeName': 'page_url',
41 |                     'AttributeType': 'S'
42 |                 }
43 |             ],
44 |             BillingMode='PAY_PER_REQUEST',
45 |             GlobalSecondaryIndexes=[
46 |                 {
47 |                     'IndexName': 'page_url_index',
48 |                     'KeySchema': [
49 |                         {
50 |                             'AttributeName': 'page_url',
51 |                             'KeyType': 'HASH'
52 |                         }
53 |                     ],
54 |                     'Projection': {
55 |                         'ProjectionType': 'ALL'
56 |                     }
57 |                 }
58 |             ]
59 |         )
60 |         waiter = self.dynamodb.get_waiter('table_exists')
61 |         waiter.wait(TableName=self.table_name)
62 |         print(f"Table '{self.table_name}' created")
63 |     return table
64 | 
65 |     def put_item(self, item):
66 |         # Check that the item has the correct structure
67 |         if 'id' not in item or 'page_url' not in item or 'page_text' not in item:
68 |             print("Error: item is missing one or more required attributes")
69 |             return
70 | 
71 |         # Check that the table exists
72 |         try:
73 |             self.dynamodb.describe_table(TableName=self.table_name)
74 |         except self.dynamodb.exceptions.ResourceNotFoundException:
75 |             print(f"Error: table '{self.table_name}' does not exist")
76 |             return
77 | 
78 |         # Add the item to the table (the low-level client needs typed attribute values)
79 |         self.dynamodb.put_item(
80 |             TableName=self.table_name,
81 |             Item={
82 |                 'id': {'S': item['id']},
83 |                 'page_url': {'S': item['page_url']},
84 |                 'page_text': {'S': item['page_text']},
85 |             }
86 |         )
87 |         print(f"Item with ID '{item['id']}' added to table '{self.table_name}'")
88 |         return item
89 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 | 
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 | 
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 | 
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 | 
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 | 
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 | 
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /create_training_data.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import csv 3 | import uuid 4 | import multiprocessing as mp 5 | from get_text_embeddings import get_text_embeddings 6 | from pineconeClient import PineconeClient 7 | import os 8 | # Set up OpenAI API credentials 9 | openai.api_key = os.environ['OPEN_AI'] 10 | openai.api_base = "https://api.openai.com/v1/" 11 | 12 | 13 | # def create_training_data(prompt, examples): 14 | # max_tokens = 3500 15 | # while max_tokens > 0: 16 | # try: 17 | # print(f"Requesting with max_tokens={max_tokens}") 18 | # response = openai.ChatCompletion.create( 19 | # model="gpt-3.5-turbo", 20 | # messages=[{"role":"user", "content":prompt}], 21 | # temperature=0.4, 22 | # ) 23 | # print("Request completed") 24 | # print(response) 25 | # return response.choices[0].message.content.strip() 26 | # except openai.error.OpenAIError as e: 27 | # print(f"OpenAI error: {e}") 28 | # print("Retry with lower max_tokens") 29 | # max_tokens -= 200 30 | 31 | # raise Exception("Failed to generate response after multiple retries") 32 | 33 | 34 | # def extract_json(text): 35 | # # Initialize an empty list to store parsed JSON objects 36 | # json_objects = [] 37 | # # Loop through the string, looking for curly braces 38 | # start_index = 0 39 | # while True: 40 | # start_index = text.find("{", start_index) 41 | # if start_index == -1: 42 | # break 43 | # end_index = text.find("}", start_index) 44 | # if end_index == -1: 45 | # break 46 | # # Extract the JSON substring from the text 47 | # json_string = text[start_index:end_index+1] 48 | # try: 49 | # # Attempt to parse the JSON string 50 | # json_data = json.loads(json_string) 51 | 52 | # # Check if the prompt and completion keys are present in the JSON object 53 | # if "prompt" in json_data and "completion" in json_data: 54 | # prompt = json_data["prompt"] + " \n\n###\n\n" 55 | # completion = " " + json_data["completion"] + "###" 56 | # json_objects.append( 57 | # {"prompt": prompt, "completion": completion}) 58 | 59 | # except ValueError: 60 | # # If the JSON string is invalid, ignore it 61 | # pass 62 | # # Move the start index to the next character after the end index 63 | # start_index = end_index + 1 64 | 65 | # return json_objects 66 | 67 | 68 | def process_row(row): 69 | # # Replace any newlines or tabs in the page text with spaces 70 | # page_text = row["page_text"].replace("\n", " ").replace("\t", " ") 71 | # prompt = 'Generate one or more strings convertible to JSON containing the training data relative to this page' + \ 72 | # row["page_url"] + '.\n' 73 | # prompt += 'The training data must be in the format of: {"prompt": "", "completion": ""}\n' 74 | # prompt += 'Prompt: question or statement related to the page content. Keep it <280 characters and as generic as possible. Must include question marks only 20% of times.\n' 75 | # prompt += 'Response: provide a specific answer to the prompt in <1000 characters. The completion should include relevant code and reference the appropriate section in the documentation page.' 
76 | # prompt += row["page_text"] 77 | # examples = [ 78 | # ["Input:", page_text], 79 | # ["Output:", ""] 80 | # ] 81 | # if response: 82 | # json_data = extract_json(response) 83 | # if len(json_data): 84 | # return json_data 85 | embedding = get_text_embeddings(row['page_text']) 86 | if embedding: 87 | return embedding, row['page_text'] 88 | 89 | 90 | def page_text_to_embeddings(input_file): 91 | with open(input_file, "r", newline="", encoding="utf-8") as infile: 92 | reader = csv.DictReader(infile) 93 | pool = mp.Pool(processes=mp.cpu_count()) 94 | vectors_map = pool.map(process_row, reader) 95 | pool.close() 96 | pool.join() 97 | ids_vectors = [] 98 | # training_data = [ 99 | # json_data for json_list in training_data for json_data in json_list] 100 | for vector in vectors_map: 101 | id = str(uuid.uuid4()) 102 | 103 | 104 | ids_vectors.append({"id": id, "values": vector[0], 105 | "metadata": {"message": vector[1]}}) 106 | 107 | client = PineconeClient() 108 | client.create_index("my-index") 109 | response = client.upsert("my-index", ids_vectors) 110 | print(response) 111 | 112 | 113 | # Example usage 114 | if __name__ == "__main__": 115 | page_text_to_embeddings("page_text.csv") 116 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urlparse, urljoin 5 | from multiprocessing import Pool 6 | import csv 7 | from create_training_data import page_text_to_embeddings 8 | from create_fine_tuning_job import create_fine_tune_model, upload_jsonl_file 9 | import datetime 10 | import spacy 11 | nlp = spacy.load("en_core_web_sm") 12 | 13 | 14 | def findFirstPage(url): 15 | 16 | # Send a request to the URL and get its content 17 | response = requests.get(url) 18 | content = response.content 19 | 20 | # Create a BeautifulSoup object from the webpage content 21 | soup = BeautifulSoup(content, "html.parser") 22 | 23 | # Get the domain of the webpage URL 24 | domain = urlparse(url).netloc 25 | 26 | # Find all the links in the webpage and extract their href attributes 27 | links = [] 28 | for link in soup.find_all("a"): 29 | href = link.get("href") 30 | if not href: 31 | # Skip links with empty href attribute 32 | continue 33 | # Check if the link is absolute or relative 34 | if urlparse(href).scheme: 35 | # Absolute link, no need to modify 36 | absolute_link = href 37 | else: 38 | # Relative link, prepend the domain of the webpage URL 39 | absolute_link = urljoin(url, href) 40 | # Check if the link is a duplicate or has already been found 41 | if absolute_link in links: 42 | continue 43 | # Check if the link has the same root domain as the webpage URL 44 | if urlparse(absolute_link).netloc == domain: 45 | # Add the absolute link to the list of links 46 | links.append(absolute_link) 47 | return links 48 | 49 | 50 | def scrape_link(link, domain): 51 | # Send a request to the link and get its content 52 | response = requests.get(link) 53 | content = response.content 54 | # Create a BeautifulSoup object from the content 55 | soup = BeautifulSoup(content, "html.parser") 56 | # Find all the links in the webpage and extract their href attributes 57 | absolute_links = [] 58 | 59 | for a in soup.find_all("a"): 60 | href = a.get("href") 61 | if not href: 62 | # Skip links with empty href attribute 63 | continue 64 | # Check if the link is absolute or relative 65 | if urlparse(href).scheme: 66 | # 
Absolute link, no need to modify
67 |             absolute_link = href
68 |         else:
69 |             # Relative link, prepend the domain of the webpage URL
70 |             absolute_link = urljoin(link, href)
71 | 
72 |         # Check if the link has the same root domain as the webpage URL
73 |         if urlparse(absolute_link).netloc == domain:
74 |             # Add the absolute link to the list of links
75 |             absolute_links.append(absolute_link)
76 |     return absolute_links
77 | 
78 | # Scrape the pages contained in the links in parallel using the Pool class
79 | 
80 | 
81 | def multiProcessPages(links, max_pages, url):
82 |     visited = set(links)
83 |     visited_pages = 0
84 |     with Pool() as pool:
85 |         while links and len(visited) < max_pages:
86 |             # Scrape the links in batches of 50
87 |             batch = links[:50]
88 |             links = links[50:]
89 |             visited_pages += 50
90 |             print(f"Scraping a batch of {len(batch)} pages")
91 |             # Map scrape_link over the batch; pass only the netloc so its same-domain check matches
92 |             results = pool.starmap(
93 |                 scrape_link, [(link, urlparse(url).netloc) for link in batch])
94 |             # Flatten the list of lists of absolute links to a list of absolute links
95 |             absolute_links = [link for sublist in results for link in sublist]
96 |             # Remove duplicates and links that have already been visited
97 |             absolute_links = set(absolute_links) - visited
98 |             # Add the absolute links to the list of visited links
99 |             visited |= absolute_links
100 |             print(f"Visited {len(visited)} pages so far")
101 |             # Add the absolute links to the list of links to scrape next
102 |             links += list(absolute_links)
103 |     return list(visited)[:max_pages]
104 | 
105 | 
106 | # def save_links_sorted(links):
107 | #     # Sort the links in alphabetical order
108 | #     sorted_links = sorted(links)
109 | #     # Save all the found absolute links in the same text file
110 | #     with open("links.txt", "w") as file:
111 | #         for link in sorted_links:
112 | #             file.write(link + "\n")
113 | 
114 | 
115 | def save_text_csv(links):
116 |     with open("page_text.csv", "w", newline="", encoding="utf-8") as file:
117 |         writer = csv.writer(file)
118 |         writer.writerow(["Index", "page_url", "page_text"])
119 |         with Pool() as pool:
120 |             results = pool.map(scrape_text, links)
121 |         for link, text in results:
122 |             if text is not None:
123 |                 print(f"Processing text from {link}")
124 |                 text = text.replace("\t", " ")
125 |                 doc = nlp(text)
126 |                 max_chunk_tokens = 300
127 |                 chunks = []
128 |                 chunk = []
129 |                 num_tokens = 0
130 |                 for sent in doc.sents:
131 |                     sent_tokens = len(sent)
132 |                     if num_tokens + sent_tokens > max_chunk_tokens and len(chunk) > 0:
133 |                         chunks.append(" ".join(chunk))
134 |                         chunk = []
135 |                         num_tokens = 0
136 |                     chunk.append(sent.text)
137 |                     num_tokens += sent_tokens
138 |                 if len(chunk) > 0:
139 |                     chunks.append(" ".join(chunk))
140 |                 for i, chunk_text in enumerate(chunks):
141 |                     writer.writerow([i+1, link, chunk_text])
142 |                 print(f"Saved text from {link} to CSV file")
143 |             else:
144 |                 print(f"Skipping {link} due to empty text")
145 | 
146 | 
147 | def scrape_text(link):
148 |     print(f"Scraping text from {link}")
149 |     response = requests.get(link)
150 |     content = response.content
151 |     soup = BeautifulSoup(content, "html.parser")
152 | 
153 |     # Find the body of the page
154 |     body = soup.find("body")
155 |     if body is None:
156 |         return link, None
157 | 
158 |     # Find the header and footer tags, if they exist, and remove them and their contents
159 |     for tag in body(["header", "footer", "nav"]):
160 |         tag.extract()
161 | 
162 |     # Extract the remaining text in the body and remove double new lines and random spaces
163 |     text = body.get_text(separator="\n").replace("\n\n", "\n").strip()
164 |     text = ' '.join(text.split())
165 | 
166 |     return link, text
167 | 
168 | 
169 | if __name__ == "__main__":
170 |     links = findFirstPage("https://docs.alchemy.com/")
171 |     visited = multiProcessPages(links, 15, "https://docs.alchemy.com/")
172 |     save_text_csv(visited)
173 |     page_text_to_embeddings("page_text.csv")
174 |     # id = upload_jsonl_file("training_data.jsonl")
175 |     # id = upload_jsonl_file("training_data.jsonl")
176 |     # fine_tune_model_id = create_fine_tune_model(id)
177 | 
178 |     # # Store the fine tune model ID and the current date in a CSV file
179 |     # with open("model_ids.csv", "a", newline="", encoding="utf-8") as outfile:
180 |     #     writer = csv.DictWriter(outfile, fieldnames=["date", "id"])
181 |     #     if outfile.tell() == 0:
182 |     #         writer.writeheader()
183 |     #     writer.writerow({"date": datetime.datetime.now().strftime(
184 |     #         "%Y-%m-%d %H:%M:%S"), "id": fine_tune_model_id})
185 |     #     print(
186 |     #         f"Fine tune model ID ({fine_tune_model_id}) and date ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}) stored in 'model_ids.csv'")
187 |     # create_fine_tuning_job("training_data.jsonlw")
188 | 
--------------------------------------------------------------------------------
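Once `scraper.py` has finished, a quick way to sanity-check what actually landed in Pinecone before starting `chat.py` is to query the index with the repo's own helpers. This is an illustrative sketch, assuming `OPEN_AI` and `PINECONE` are set and that the `my-index` index created by `create_training_data.py` is in place; the test question is arbitrary.

```python
# Illustrative sanity check of the "my-index" Pinecone index built by scraper.py.
from get_text_embeddings import get_text_embeddings
from pineconeClient import PineconeClient

if __name__ == "__main__":
    question = "How do I create an Alchemy API key?"  # arbitrary test question
    embedding = get_text_embeddings(question)
    matches = PineconeClient().query("my-index", embedding)
    for match in matches:
        # Each match carries the original documentation chunk in its metadata;
        # get_context.py concatenates these chunks into the chat prompt.
        print(round(match["score"], 3), match["metadata"]["message"][:120])
```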