├── .DS_Store ├── get_training_models.py ├── get_text_embeddings.py ├── LICENSE ├── create_fine_tuning_job.py ├── chat.py ├── get_context.py ├── README.md ├── save_to_dynamo.py ├── pineconeClient.py ├── requirements.txt ├── dynamo └── dynamo_manager.py ├── .gitignore ├── create_training_data.py └── scraper.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Eversmile12/Mihir/HEAD/.DS_Store -------------------------------------------------------------------------------- /get_training_models.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | 4 | # Set your OpenAI API key as an environment variable 5 | openai_api_key = os.environ['OPEN_AI'] 6 | 7 | # Set the API endpoint URL 8 | url = 'https://api.openai.com/v1/fine-tunes' 9 | 10 | # Set the headers, including the authorization token 11 | headers = { 12 | 'Authorization': f'Bearer {openai_api_key}', 13 | } 14 | 15 | # Make the API request and print the response to the console 16 | response = requests.get(url, headers=headers) 17 | print(response.json()) 18 | -------------------------------------------------------------------------------- /get_text_embeddings.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import os 4 | 5 | def get_text_embeddings(text): 6 | url = "https://api.openai.com/v1/embeddings" 7 | headers = { 8 | "Authorization": f"Bearer {os.environ['OPEN_AI']}", 9 | "Content-Type": "application/json" 10 | } 11 | payload = { 12 | "input": text, 13 | "model": "text-embedding-ada-002" 14 | } 15 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 16 | if response.status_code != 200: 17 | raise ValueError(f"Failed to get embeddings for text: {response.text}") 18 | embeddings = response.json()["data"][0]["embedding"] 19 | return embeddings 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 VItto Rivabella 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /create_fine_tuning_job.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import openai 4 | import os 5 | openai.api_key = os.environ["OPEN_AI"] 6 | 7 | 8 | def upload_jsonl_file( file_path): 9 | url = "https://api.openai.com/v1/files" 10 | headers = { 11 | "Authorization": f"Bearer {openai.api_key}" 12 | } 13 | data = { 14 | "purpose": "fine-tune", 15 | } 16 | with open(file_path, "rb") as file: 17 | response = requests.post(url, headers=headers, 18 | data=data, files={"file": file}) 19 | response_json = response.json() 20 | file_id = response_json["id"] 21 | print(f"File uploaded with ID: {file_id}") 22 | return file_id 23 | 24 | 25 | def create_fine_tune_model(id): 26 | url = "https://api.openai.com/v1/fine-tunes" 27 | headers = { 28 | "Content-Type": "application/json", 29 | "Authorization": f"Bearer {openai.api_key}" 30 | } 31 | data = { 32 | "training_file": id, 33 | "model": "curie", 34 | "batch_size": 512 35 | } 36 | response = requests.post(url, headers=headers, data=json.dumps(data)) 37 | if response.ok: 38 | print("Fine tune model created successfully") 39 | return response.json()["id"] 40 | else: 41 | raise Exception(f"Failed to create fine tune model: {response.text}") 42 | -------------------------------------------------------------------------------- /chat.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | from get_context import get_context 4 | openai.api_key = os.environ["OPEN_AI"] 5 | system_prompt = "You are a very enthusiastic Alchemy representative named Mihir who loves to help people! Given the following Context section provided from the Alchemy documentation and the Previous messages you've sent, answer the Question using only that information. Output must be in markdown format. If you are unsure and the answer is not explicitly written in the documentation, output must only be {success:false}. Include links to relevant docs pages when possible." 
6 | latest_responses = []
7 | 
8 | while True:
9 |     user_input = input("> ")
10 |     if user_input == "\x1b":  # check for escape key
11 |         break
12 | 
13 |     context = get_context(user_input)
14 |     prompt = f" \n\n Context sections:\n{context} \n\n Question '''{user_input}'''"
15 |     merged_latest_responses = '\n'.join(latest_responses)
16 | 
17 |     messages = [
18 |         {"role": "system", "content": system_prompt},
19 |         {"role": "user",
20 |          "content": f"Previous messages: {merged_latest_responses}"
21 |          },
22 |         {"role": "user", "content": prompt},
23 |     ]
24 |     # print(messages)
25 |     # print("\n\n")
26 |     response = openai.ChatCompletion.create(
27 |         model="gpt-3.5-turbo",
28 |         messages=messages,
29 |         temperature=0.3,
30 |     )
31 |     print(response.choices[0]["message"]["content"])
32 |     latest_responses.append(response.choices[0]["message"]["content"])
33 | 
--------------------------------------------------------------------------------
/get_context.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | from pineconeClient import PineconeClient
4 | from get_text_embeddings import get_text_embeddings
5 | import html
6 | import os
7 | 
8 | def get_moderation_flagged(input_text, model="text-moderation-latest"):
9 |     url = "https://api.openai.com/v1/moderations"
10 |     headers = {
11 |         "Content-Type": "application/json",
12 |         "Authorization": f"Bearer {os.environ['OPEN_AI']}",
13 |     }
14 |     payload = {
15 |         "input": input_text,
16 |         "model": model
17 |     }
18 |     response = requests.post(url, headers=headers, data=json.dumps(payload))
19 |     if response.status_code != 200:
20 |         raise ValueError(f"Failed to get moderation score: {response.text}")
21 |     flagged = response.json()["results"][0]["flagged"]
22 |     return flagged
23 | 
24 | 
25 | def sanitize_input(user_input):
26 |     # remove leading/trailing whitespace
27 |     user_input = user_input.strip()
28 |     # replace potentially harmful characters with HTML entities
29 |     user_input = user_input.replace('<', '&lt;').replace('>', '&gt;')
30 |     user_input = user_input.replace('"', '&quot;').replace("'", '&#39;')
31 |     # escape special characters
32 |     user_input = html.escape(user_input)
33 |     return user_input
34 | 
35 | 
36 | def get_context(text):
37 |     text = sanitize_input(text)
38 |     isFlagged = get_moderation_flagged(text)
39 |     if not isFlagged:
40 |         embeddings = get_text_embeddings(text)
41 |         client = PineconeClient()
42 |         top_neighbours = client.query("my-index", embeddings)
43 |         context = ""
44 |         for neighbour in top_neighbours:
45 |             context += "\n" + neighbour["metadata"]["message"]
46 | 
47 |         return context
48 |     else:
49 |         return ""  # flagged input: hand back an empty context instead of None
50 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python Chatbot with GPT-3.5
2 | 
3 | 👋 Hey there! Welcome to this awesome repository, which contains code for spinning up a personal chatbot powered by GPT-3.5 and grounded in any documentation you point it at. And guess what? 90% of the code has been developed using ChatGPT, a large language model built on the GPT-3.5 architecture! 💻
4 | 
5 | So, are you ready to create your own personal chatbot and start chatting with it? Here are the steps you need to follow:
6 | 
7 | ## Getting Started
8 | 
9 | 1. Clone the repository:
10 | 
11 | ```
12 | git clone https://github.com/Eversmile12/Mihir
13 | ```
14 | 
15 | 2. Create a new virtual environment and activate it:
16 | 
17 | ```
18 | python3 -m venv env
19 | source env/bin/activate
20 | ```
21 | 
22 | 3. Install the requirements:
23 | 
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 | 
28 | 4. In the `scraper.py` file, change the documentation URL and the maximum number of pages passed to `findFirstPage` and `multiProcessPages` in the `__main__` block at the bottom of the file:
29 | 
30 | ```python
31 | if __name__ == "__main__":
32 |     links = findFirstPage("https://your_documentation_website.com")
33 |     visited = multiProcessPages(links, 15, "https://your_documentation_website.com/")
34 |     save_text_csv(visited)
35 |     page_text_to_embeddings("page_text.csv")
36 | ```
37 | 
38 | 5. Set up the environment variables for Pinecone and OpenAI: the scripts read the API keys from `PINECONE` and `OPEN_AI` (the optional DynamoDB helper additionally expects `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_REGION`). You can find guides on how to get your API keys here: [Pinecone](https://www.pinecone.io/docs/quickstart/) and [OpenAI](https://beta.openai.com/docs/quickstart).
39 | 
40 | 6. Once the two API keys are set, run `scraper.py`:
41 | 
42 | ```
43 | python3 scraper.py
44 | ```
45 | 
46 | 7. Wait for the process to complete.
47 | 
48 | 8. Run the `chat.py` file:
49 | 
50 | ```
51 | python3 chat.py
52 | ```
53 | 
54 | 9. And voila! You can now start chatting with your personal chatbot and ask it anything you want. It's like having your own personal assistant! 🤖💬
55 | 
56 | ## Badges
57 | 
58 | [![Placeholder](https://img.shields.io/badge/repository-placeholder-green.svg)](https://github.com/your_username/your_repo)
59 | 
60 | Hope you enjoy using this repository as much as we enjoyed creating it. Happy chatting! 😄
61 | 
--------------------------------------------------------------------------------
/save_to_dynamo.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import math
3 | from multiprocessing import Pool
4 | from dynamo.dynamo_manager import DynamoManager
5 | 
6 | nlp = spacy.load("en_core_web_sm")
7 | dynamo_manager = DynamoManager("Alchemy")
8 | 
9 | def process_link(link_text_tuple):
10 |     link, text = link_text_tuple
11 |     if text is not None:
12 |         print(f"Processing text from {link}")
13 |         text = text.replace("\t", " ")
14 |         doc = nlp(text)
15 |         max_chunk_tokens = 300
16 |         chunks = []
17 |         chunk = []
18 |         num_tokens = 0
19 |         for sent in doc.sents:
20 |             sent_tokens = len(sent)
21 |             if num_tokens + sent_tokens > max_chunk_tokens and len(chunk) > 0:
22 |                 chunks.append(" ".join(chunk))
23 |                 chunk = []
24 |                 num_tokens = 0
25 |             chunk.append(sent.text)
26 |             num_tokens += sent_tokens
27 |         if len(chunk) > 0:
28 |             chunks.append(" ".join(chunk))
29 |         # Create the list of items to be written to DynamoDB
30 |         items = []
31 |         for i, chunk_text in enumerate(chunks):
32 |             item = {
33 |                 "id": f"{link}_{i+1}",
34 |                 "page_url": link,
35 |                 "page_text": chunk_text,
36 |             }
37 |             items.append(item)
38 |         print(f"Processed text from {link}")
39 |         return items
40 |     else:
41 |         print(f"Skipping {link} due to empty text")
42 |         return []
43 | 
44 | def save_text_dynamodb(link_text_tuples):
45 |     num_processes = 4  # Number of worker processes to use
46 |     chunksize = max(1, math.ceil(len(link_text_tuples) / num_processes))
47 |     with Pool(processes=num_processes) as pool:
48 |         # process_link handles a single (link, text) tuple; chunksize only
49 |         # controls how many tuples each worker receives per task
50 |         results = pool.map(process_link, link_text_tuples, chunksize=chunksize)
51 |     # Combine the per-page results into a single list of items
52 |     items = [item for sublist in results for item in sublist]
53 |     # Write the items to DynamoDB
54 |     for item in items:
55 |         dynamo_manager.put_item(item)
56 |     print("Saved all text to DynamoDB")
57 | 
--------------------------------------------------------------------------------
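Note: `save_to_dynamo.py` is not wired into the `scraper.py` pipeline above, which writes its chunks to `page_text.csv` and Pinecone instead. Below is a minimal sketch of how it could be driven, assuming the environment variables the code reads (`OPEN_AI`, plus `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_REGION` for `dynamo/dynamo_manager.py`) are set; the wiring and the five-page cap are illustrative, not part of the repository.

```python
# Hypothetical wiring (not in the repo): push scraped pages into DynamoDB
# instead of the CSV/Pinecone path used by scraper.py's __main__ block.
from scraper import findFirstPage, scrape_text   # importing scraper loads spaCy
from save_to_dynamo import save_text_dynamodb    # importing this touches DynamoDB

if __name__ == "__main__":
    links = findFirstPage("https://docs.alchemy.com/")
    # scrape_text returns a (page_url, page_text) tuple, which is the shape
    # process_link in save_to_dynamo.py expects to unpack.
    pages = [scrape_text(link) for link in links[:5]]   # 5 pages is an arbitrary cap
    save_text_dynamodb(pages)
```

--------------------------------------------------------------------------------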
/pineconeClient.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import pinecone 3 | import os 4 | 5 | 6 | class PineconeClient: 7 | def __init__(self): 8 | api_key = os.environ["PINECONE"] 9 | if api_key is None: 10 | raise ValueError( 11 | "Pinecone API key not found in environment variables") 12 | pinecone.init(api_key, environment="us-west1-gcp-free") 13 | 14 | def create_index(self, index_name): 15 | if index_name not in pinecone.list_indexes(): 16 | pinecone.create_index( 17 | index_name, dimension=1536, metric="cosine") 18 | 19 | def chunks(_, iterable, batch_size=100): 20 | """A helper function to break an iterable into chunks of size batch_size.""" 21 | it = iter(iterable) 22 | chunk = tuple(itertools.islice(it, batch_size)) 23 | while chunk: 24 | yield chunk 25 | chunk = tuple(itertools.islice(it, batch_size)) 26 | 27 | # def check_index_present(index_name): 28 | # active_indexes = pinecone.list_indexes() 29 | # if index_name in active_indexes: 30 | # return True 31 | # else: 32 | # return False 33 | 34 | def upsert_in_chunks(self, index_name, ids_vectors): 35 | # Upsert data with 100 vectors per upsert request 36 | responses = [] 37 | print(ids_vectors) 38 | for ids_vectors_chunk in self.chunks(ids_vectors): 39 | upsert_response = pinecone.Index( 40 | index_name).upsert(vectors=ids_vectors_chunk) 41 | if upsert_response: 42 | responses.append(upsert_response) 43 | 44 | return responses 45 | 46 | def upsert_standard(self, index_name, ids_vectors): 47 | upsert_response = pinecone.Index(index_name).upsert( 48 | vectors=ids_vectors) 49 | return upsert_response 50 | 51 | def upsert(self, index_name, ids_vectors): 52 | if len(ids_vectors) >= 100: 53 | print("data too big, dividing in chunks") 54 | upsert_response = self.upsert_in_chunks(index_name, ids_vectors) 55 | else: 56 | print("upserting data without chunking") 57 | 58 | upsert_response = self.upsert_standard(index_name, ids_vectors) 59 | return upsert_response 60 | 61 | def query(_, index_name, vector): 62 | index = pinecone.Index(index_name) 63 | top_neighbours = index.query( 64 | top_k=3, 65 | include_metadata=True, 66 | vector=vector, 67 | ) 68 | if not top_neighbours: 69 | raise ValueError("No neighbors found") 70 | 71 | return top_neighbours["matches"] 72 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-generator==1.10 4 | async-timeout==4.0.2 5 | asynctest==0.13.0 6 | attrs==22.2.0 7 | autopep8==2.0.2 8 | beautifulsoup4==4.12.1 9 | black==23.3.0 10 | blis==0.7.9 11 | boto3==1.26.124 12 | botocore==1.29.124 13 | cachetools==5.3.0 14 | catalogue==2.0.8 15 | certifi==2022.12.7 16 | cfgv==3.3.1 17 | chardet==5.1.0 18 | charset-normalizer==3.1.0 19 | click==8.1.3 20 | colorama==0.4.6 21 | confection==0.0.4 22 | coverage==7.2.3 23 | crytic-compile==0.2.4 24 | cssselect==1.2.0 25 | cymem==2.0.7 26 | distlib==0.3.5 27 | dnspython==2.3.0 28 | docker==6.0.1 29 | duckduckgo-search==2.8.5 30 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl 31 | et-xmlfile==1.1.0 32 | exceptiongroup==1.1.1 33 | filelock==3.8.0 34 | flake8==6.0.0 35 | frozenlist==1.3.3 36 | gitdb==4.0.10 37 | GitPython==3.1.31 38 | google-api-core==2.11.0 39 | google-api-python-client==2.84.0 40 | google-auth==2.17.2 41 | 
google-auth-httplib2==0.1.0 42 | googleapis-common-protos==1.59.0 43 | gTTS==2.3.1 44 | h11==0.14.0 45 | httplib2==0.22.0 46 | identify==2.5.22 47 | idna==3.4 48 | iniconfig==2.0.0 49 | isort==5.12.0 50 | Jinja2==3.1.2 51 | jmespath==1.0.1 52 | joblib==1.2.0 53 | jsonschema==4.17.3 54 | langcodes==3.3.0 55 | loguru==0.7.0 56 | lxml==4.9.2 57 | MarkupSafe==2.1.2 58 | mccabe==0.7.0 59 | multidict==6.0.4 60 | murmurhash==1.0.9 61 | mypy-extensions==1.0.0 62 | nltk==3.8.1 63 | nodeenv==1.7.0 64 | numpy==1.24.2 65 | oauthlib==3.2.2 66 | openai==0.27.2 67 | openpyxl==3.1.2 68 | orjson==3.8.10 69 | outcome==1.2.0 70 | packaging==23.0 71 | panda==0.3.1 72 | pandas==2.0.1 73 | pandas-stubs==2.0.0.230412 74 | pathspec==0.11.1 75 | pathy==0.10.1 76 | Pillow==9.5.0 77 | pinecone-client==2.2.1 78 | platformdirs==2.5.2 79 | playsound==1.2.2 80 | pluggy==1.0.0 81 | pre-commit==3.2.2 82 | preshed==3.0.8 83 | prettytable==3.4.1 84 | protobuf==4.22.1 85 | py-cpuinfo==9.0.0 86 | pyasn1==0.4.8 87 | pyasn1-modules==0.2.8 88 | pycodestyle==2.10.0 89 | pydantic==1.10.7 90 | pyflakes==3.0.1 91 | pyparsing==3.0.9 92 | pyrsistent==0.19.3 93 | pysha3==1.0.2 94 | PySocks==1.7.1 95 | pytest==7.3.1 96 | pytest-asyncio==0.21.0 97 | pytest-benchmark==4.0.0 98 | pytest-cov==4.0.0 99 | pytest-integration==0.2.3 100 | pytest-mock==3.10.0 101 | python-dateutil==2.8.2 102 | python-dotenv==1.0.0 103 | pytz==2023.3 104 | PyYAML==6.0 105 | readability-lxml==0.8.1 106 | redis==4.5.4 107 | regex==2023.3.23 108 | requests==2.28.2 109 | requests-oauthlib==1.3.1 110 | rsa==4.9 111 | s3transfer==0.6.0 112 | selenium==4.8.3 113 | six==1.16.0 114 | slither-analyzer==0.9.0 115 | smart-open==6.3.0 116 | smmap==5.0.0 117 | sniffio==1.3.0 118 | sortedcontainers==2.4.0 119 | soupsieve==2.4 120 | sourcery==1.2.0 121 | spacy==3.5.2 122 | spacy-legacy==3.0.12 123 | spacy-loggers==1.0.4 124 | srsly==2.4.6 125 | thinc==8.1.9 126 | tiktoken==0.3.3 127 | tomli==2.0.1 128 | tqdm==4.65.0 129 | trio==0.22.0 130 | trio-websocket==0.10.2 131 | tweepy==4.13.0 132 | typer==0.7.0 133 | types-pytz==2023.3.0.0 134 | typing_extensions==4.5.0 135 | tzdata==2023.3 136 | uritemplate==4.1.1 137 | urllib3==1.26.15 138 | virtualenv==20.16.3 139 | wasabi==1.1.1 140 | wcwidth==0.2.5 141 | webdriver-manager==3.8.6 142 | websocket-client==1.5.1 143 | wsproto==1.2.0 144 | yarl==1.8.2 145 | -------------------------------------------------------------------------------- /dynamo/dynamo_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | 4 | class DynamoManager: 5 | def __init__(self, table_name): 6 | self.access_key = os.environ.get("AWS_ACCESS_KEY_ID") 7 | self.secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY") 8 | self.region_name = os.environ.get("AWS_REGION") 9 | self.session = boto3.Session( 10 | aws_access_key_id=self.access_key, 11 | aws_secret_access_key=self.secret_key, 12 | region_name=self.region_name 13 | ) 14 | self.dynamodb = self.session.client('dynamodb') 15 | self.table_name = table_name 16 | self.table = self.retrieve_or_create_table() 17 | 18 | def retrieve_or_create_table(self): 19 | try: 20 | table = self.dynamodb.describe_table(TableName=self.table_name)['Table'] 21 | print(f"Table '{self.table_name}' found") 22 | except self.dynamodb.exceptions.ResourceNotFoundException: 23 | print(f"Table '{self.table_name}' does not exist, creating it...") 24 | table = self.dynamodb.create_table( 25 | TableName=self.table_name, 26 | KeySchema=[ 27 | { 28 | 'AttributeName': 'id', 29 | 
                    'KeyType': 'HASH'
30 |                 }
31 |             ],
32 |             # Only attributes that appear in a key schema (the table key or a GSI
33 |             # key) may be declared here; declaring 'page_text' too makes CreateTable fail
34 |             AttributeDefinitions=[
35 |                 {
36 |                     'AttributeName': 'id',
37 |                     'AttributeType': 'S'
38 |                 },
39 |                 {
40 |                     'AttributeName': 'page_url',
41 |                     'AttributeType': 'S'
42 |                 }
43 |             ],
44 |             BillingMode='PAY_PER_REQUEST',
45 |             GlobalSecondaryIndexes=[
46 |                 {
47 |                     'IndexName': 'page_url_index',
48 |                     'KeySchema': [
49 |                         {
50 |                             'AttributeName': 'page_url',
51 |                             'KeyType': 'HASH'
52 |                         }
53 |                     ],
54 |                     'Projection': {
55 |                         'ProjectionType': 'ALL'
56 |                     }
57 |                 }
58 |             ]
59 |         )
60 |         waiter = self.dynamodb.get_waiter('table_exists')
61 |         waiter.wait(TableName=self.table_name)
62 |         print(f"Table '{self.table_name}' created")
63 |     return table
64 | 
65 |     def put_item(self, item):
66 |         # Check that the item has the correct structure
67 |         if 'id' not in item or 'page_url' not in item or 'page_text' not in item:
68 |             print("Error: item is missing one or more required attributes")
69 |             return
70 | 
71 |         # Check that the table exists
72 |         try:
73 |             self.dynamodb.describe_table(TableName=self.table_name)
74 |         except self.dynamodb.exceptions.ResourceNotFoundException:
75 |             print(f"Error: table '{self.table_name}' does not exist")
76 |             return
77 | 
78 |         # Add the item to the table (the low-level client needs typed attribute values)
79 |         self.dynamodb.put_item(
80 |             TableName=self.table_name,
81 |             Item={
82 |                 'id': {'S': item['id']},
83 |                 'page_url': {'S': item['page_url']},
84 |                 'page_text': {'S': item['page_text']},
85 |             }
86 |         )
87 |         print(f"Item with ID '{item['id']}' added to table '{self.table_name}'")
88 |         return item
89 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 | 
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 | 
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 | 
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 | 
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 | 
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 | 
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /create_training_data.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import csv 3 | import uuid 4 | import multiprocessing as mp 5 | from get_text_embeddings import get_text_embeddings 6 | from pineconeClient import PineconeClient 7 | import os 8 | # Set up OpenAI API credentials 9 | openai.api_key = os.environ['OPEN_AI'] 10 | openai.api_base = "https://api.openai.com/v1/" 11 | 12 | 13 | # def create_training_data(prompt, examples): 14 | # max_tokens = 3500 15 | # while max_tokens > 0: 16 | # try: 17 | # print(f"Requesting with max_tokens={max_tokens}") 18 | # response = openai.ChatCompletion.create( 19 | # model="gpt-3.5-turbo", 20 | # messages=[{"role":"user", "content":prompt}], 21 | # temperature=0.4, 22 | # ) 23 | # print("Request completed") 24 | # print(response) 25 | # return response.choices[0].message.content.strip() 26 | # except openai.error.OpenAIError as e: 27 | # print(f"OpenAI error: {e}") 28 | # print("Retry with lower max_tokens") 29 | # max_tokens -= 200 30 | 31 | # raise Exception("Failed to generate response after multiple retries") 32 | 33 | 34 | # def extract_json(text): 35 | # # Initialize an empty list to store parsed JSON objects 36 | # json_objects = [] 37 | # # Loop through the string, looking for curly braces 38 | # start_index = 0 39 | # while True: 40 | # start_index = text.find("{", start_index) 41 | # if start_index == -1: 42 | # break 43 | # end_index = text.find("}", start_index) 44 | # if end_index == -1: 45 | # break 46 | # # Extract the JSON substring from the text 47 | # json_string = text[start_index:end_index+1] 48 | # try: 49 | # # Attempt to parse the JSON string 50 | # json_data = json.loads(json_string) 51 | 52 | # # Check if the prompt and completion keys are present in the JSON object 53 | # if "prompt" in json_data and "completion" in json_data: 54 | # prompt = json_data["prompt"] + " \n\n###\n\n" 55 | # completion = " " + json_data["completion"] + "###" 56 | # json_objects.append( 57 | # {"prompt": prompt, "completion": completion}) 58 | 59 | # except ValueError: 60 | # # If the JSON string is invalid, ignore it 61 | # pass 62 | # # Move the start index to the next character after the end index 63 | # start_index = end_index + 1 64 | 65 | # return json_objects 66 | 67 | 68 | def process_row(row): 69 | # # Replace any newlines or tabs in the page text with spaces 70 | # page_text = row["page_text"].replace("\n", " ").replace("\t", " ") 71 | # prompt = 'Generate one or more strings convertible to JSON containing the training data relative to this page' + \ 72 | # row["page_url"] + '.\n' 73 | # prompt += 'The training data must be in the format of: {"prompt": "", "completion": ""}\n' 74 | # prompt += 'Prompt: question or statement related to the page content. Keep it <280 characters and as generic as possible. Must include question marks only 20% of times.\n' 75 | # prompt += 'Response: provide a specific answer to the prompt in <1000 characters. The completion should include relevant code and reference the appropriate section in the documentation page.' 
76 | # prompt += row["page_text"] 77 | # examples = [ 78 | # ["Input:", page_text], 79 | # ["Output:", ""] 80 | # ] 81 | # if response: 82 | # json_data = extract_json(response) 83 | # if len(json_data): 84 | # return json_data 85 | embedding = get_text_embeddings(row['page_text']) 86 | if embedding: 87 | return embedding, row['page_text'] 88 | 89 | 90 | def page_text_to_embeddings(input_file): 91 | with open(input_file, "r", newline="", encoding="utf-8") as infile: 92 | reader = csv.DictReader(infile) 93 | pool = mp.Pool(processes=mp.cpu_count()) 94 | vectors_map = pool.map(process_row, reader) 95 | pool.close() 96 | pool.join() 97 | ids_vectors = [] 98 | # training_data = [ 99 | # json_data for json_list in training_data for json_data in json_list] 100 | for vector in vectors_map: 101 | id = str(uuid.uuid4()) 102 | 103 | 104 | ids_vectors.append({"id": id, "values": vector[0], 105 | "metadata": {"message": vector[1]}}) 106 | 107 | client = PineconeClient() 108 | client.create_index("my-index") 109 | response = client.upsert("my-index", ids_vectors) 110 | print(response) 111 | 112 | 113 | # Example usage 114 | if __name__ == "__main__": 115 | page_text_to_embeddings("page_text.csv") 116 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urlparse, urljoin 5 | from multiprocessing import Pool 6 | import csv 7 | from create_training_data import page_text_to_embeddings 8 | from create_fine_tuning_job import create_fine_tune_model, upload_jsonl_file 9 | import datetime 10 | import spacy 11 | nlp = spacy.load("en_core_web_sm") 12 | 13 | 14 | def findFirstPage(url): 15 | 16 | # Send a request to the URL and get its content 17 | response = requests.get(url) 18 | content = response.content 19 | 20 | # Create a BeautifulSoup object from the webpage content 21 | soup = BeautifulSoup(content, "html.parser") 22 | 23 | # Get the domain of the webpage URL 24 | domain = urlparse(url).netloc 25 | 26 | # Find all the links in the webpage and extract their href attributes 27 | links = [] 28 | for link in soup.find_all("a"): 29 | href = link.get("href") 30 | if not href: 31 | # Skip links with empty href attribute 32 | continue 33 | # Check if the link is absolute or relative 34 | if urlparse(href).scheme: 35 | # Absolute link, no need to modify 36 | absolute_link = href 37 | else: 38 | # Relative link, prepend the domain of the webpage URL 39 | absolute_link = urljoin(url, href) 40 | # Check if the link is a duplicate or has already been found 41 | if absolute_link in links: 42 | continue 43 | # Check if the link has the same root domain as the webpage URL 44 | if urlparse(absolute_link).netloc == domain: 45 | # Add the absolute link to the list of links 46 | links.append(absolute_link) 47 | return links 48 | 49 | 50 | def scrape_link(link, domain): 51 | # Send a request to the link and get its content 52 | response = requests.get(link) 53 | content = response.content 54 | # Create a BeautifulSoup object from the content 55 | soup = BeautifulSoup(content, "html.parser") 56 | # Find all the links in the webpage and extract their href attributes 57 | absolute_links = [] 58 | 59 | for a in soup.find_all("a"): 60 | href = a.get("href") 61 | if not href: 62 | # Skip links with empty href attribute 63 | continue 64 | # Check if the link is absolute or relative 65 | if urlparse(href).scheme: 66 | # 
Absolute link, no need to modify
67 |             absolute_link = href
68 |         else:
69 |             # Relative link, prepend the domain of the webpage URL
70 |             absolute_link = urljoin(link, href)
71 | 
72 |         # Check if the link has the same root domain as the webpage URL
73 |         if urlparse(absolute_link).netloc == domain:
74 |             # Add the absolute link to the list of links
75 |             absolute_links.append(absolute_link)
76 |     return absolute_links
77 | 
78 | # Scrape the pages contained in the links in parallel using the Pool class
79 | 
80 | 
81 | def multiProcessPages(links, max_pages, url):
82 |     visited = set(links)
83 |     visited_pages = 0
84 |     with Pool() as pool:
85 |         while links and len(visited) < max_pages:
86 |             # Scrape the links in batches of 50
87 |             batch = links[:50]
88 |             links = links[50:]
89 |             visited_pages += 50
90 |             print(f"Scraping a batch of {len(batch)} pages")
91 |             # Map scrape_link over the batch; pass only the netloc so its same-domain check matches
92 |             results = pool.starmap(
93 |                 scrape_link, [(link, urlparse(url).netloc) for link in batch])
94 |             # Flatten the list of lists of absolute links to a list of absolute links
95 |             absolute_links = [link for sublist in results for link in sublist]
96 |             # Remove duplicates and links that have already been visited
97 |             absolute_links = set(absolute_links) - visited
98 |             # Add the absolute links to the list of visited links
99 |             visited |= absolute_links
100 |             print(f"Visited {len(visited)} pages so far")
101 |             # Add the absolute links to the list of links to scrape next
102 |             links += list(absolute_links)
103 |     return list(visited)[:max_pages]
104 | 
105 | 
106 | # def save_links_sorted(links):
107 | #     # Sort the links in alphabetical order
108 | #     sorted_links = sorted(links)
109 | #     # Save all the found absolute links in the same text file
110 | #     with open("links.txt", "w") as file:
111 | #         for link in sorted_links:
112 | #             file.write(link + "\n")
113 | 
114 | 
115 | def save_text_csv(links):
116 |     with open("page_text.csv", "w", newline="", encoding="utf-8") as file:
117 |         writer = csv.writer(file)
118 |         writer.writerow(["Index", "page_url", "page_text"])
119 |         with Pool() as pool:
120 |             results = pool.map(scrape_text, links)
121 |         for link, text in results:
122 |             if text is not None:
123 |                 print(f"Processing text from {link}")
124 |                 text = text.replace("\t", " ")
125 |                 doc = nlp(text)
126 |                 max_chunk_tokens = 300
127 |                 chunks = []
128 |                 chunk = []
129 |                 num_tokens = 0
130 |                 for sent in doc.sents:
131 |                     sent_tokens = len(sent)
132 |                     if num_tokens + sent_tokens > max_chunk_tokens and len(chunk) > 0:
133 |                         chunks.append(" ".join(chunk))
134 |                         chunk = []
135 |                         num_tokens = 0
136 |                     chunk.append(sent.text)
137 |                     num_tokens += sent_tokens
138 |                 if len(chunk) > 0:
139 |                     chunks.append(" ".join(chunk))
140 |                 for i, chunk_text in enumerate(chunks):
141 |                     writer.writerow([i+1, link, chunk_text])
142 |                 print(f"Saved text from {link} to CSV file")
143 |             else:
144 |                 print(f"Skipping {link} due to empty text")
145 | 
146 | 
147 | def scrape_text(link):
148 |     print(f"Scraping text from {link}")
149 |     response = requests.get(link)
150 |     content = response.content
151 |     soup = BeautifulSoup(content, "html.parser")
152 | 
153 |     # Find the body of the page
154 |     body = soup.find("body")
155 |     if body is None:
156 |         return link, None
157 | 
158 |     # Find the header and footer tags, if they exist, and remove them and their contents
159 |     for tag in body(["header", "footer", "nav"]):
160 |         tag.extract()
161 | 
162 |     # Extract the remaining text in the body and remove double new lines and random spaces
163 |     text = body.get_text(separator="\n").replace("\n\n", "\n").strip()
164 |     text = ' '.join(text.split())
165 | 
166 |     return link, text
167 | 
168 | 
169 | if __name__ == "__main__":
170 |     links = findFirstPage("https://docs.alchemy.com/")
171 |     visited = multiProcessPages(links, 15, "https://docs.alchemy.com/")
172 |     save_text_csv(visited)
173 |     page_text_to_embeddings("page_text.csv")
174 |     # id = upload_jsonl_file("training_data.jsonl")
175 |     # id = upload_jsonl_file("training_data.jsonl")
176 |     # fine_tune_model_id = create_fine_tune_model(id)
177 | 
178 |     # # Store the fine tune model ID and the current date in a CSV file
179 |     # with open("model_ids.csv", "a", newline="", encoding="utf-8") as outfile:
180 |     #     writer = csv.DictWriter(outfile, fieldnames=["date", "id"])
181 |     #     if outfile.tell() == 0:
182 |     #         writer.writeheader()
183 |     #     writer.writerow({"date": datetime.datetime.now().strftime(
184 |     #         "%Y-%m-%d %H:%M:%S"), "id": fine_tune_model_id})
185 |     #     print(
186 |     #         f"Fine tune model ID ({fine_tune_model_id}) and date ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}) stored in 'model_ids.csv'")
187 |     # create_fine_tuning_job("training_data.jsonlw")
188 | 
--------------------------------------------------------------------------------
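Once `scraper.py` has finished, a quick way to sanity-check what actually landed in Pinecone before starting `chat.py` is to query the index with the repo's own helpers. This is an illustrative sketch, assuming `OPEN_AI` and `PINECONE` are set and that the `my-index` index created by `create_training_data.py` is in place; the test question is arbitrary.

```python
# Illustrative sanity check of the "my-index" Pinecone index built by scraper.py.
from get_text_embeddings import get_text_embeddings
from pineconeClient import PineconeClient

if __name__ == "__main__":
    question = "How do I create an Alchemy API key?"  # arbitrary test question
    embedding = get_text_embeddings(question)
    matches = PineconeClient().query("my-index", embedding)
    for match in matches:
        # Each match carries the original documentation chunk in its metadata;
        # get_context.py concatenates these chunks into the chat prompt.
        print(round(match["score"], 3), match["metadata"]["message"][:120])
```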