├── .gitignore
├── README.md
├── requirements.txt
├── setup.py
└── src
    └── url_to_llm_text
        ├── __init__.py
        ├── get_html_text.py
        └── get_llm_input_text.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Webpage to LLM ready input text
Pre-processing the HTML source text before giving it to the LLM as input improves extraction/scraping accuracy, especially if you want to extract webpage and image links, which most scraping jobs (such as scraping an e-commerce website) require.

Use this library to turn any HTML source text into LLM-friendly text. It is a fully open-source alternative to the Jina Reader API and the Firecrawl API.

You can also refer to my other repo [AI-web_scraper](https://github.com/m92vyas/AI-web_scraper) for ready-made scraping tools that scrape multiple links or combine web search and scraping from a single query. It supports multiple LLMs and web search, and extracts data as per your written instructions.

### Install:
```bash
pip install git+https://github.com/m92vyas/llm-reader.git
```

### Import:
```python
from url_to_llm_text.get_html_text import get_page_source  # you can also use your own code or other services to get the page source
from url_to_llm_text.get_llm_input_text import get_processed_text  # pass the html source text to get LLM-ready text
```

### Get processed LLM input text:

```python
url =

# get the html source text
# the first call to the function below takes some time because it loads the web driver; subsequent runs are faster
# you can use your own function to get the html source text

page_source = await get_page_source(url)

# get LLM-ready input text from the html source text

llm_text = await get_processed_text(page_source, url)
print(llm_text)
```
### Example Usage:
Suppose we want to scrape the product name, main product page link, image link and price from the URL "https://www.ikea.com/in/en/cat/corner-sofas-10671/" using any OpenAI model.
```python
import requests
from url_to_llm_text.get_html_text import get_page_source
from url_to_llm_text.get_llm_input_text import get_processed_text

url = "https://www.ikea.com/in/en/cat/corner-sofas-10671/"

# get the page html source text using this library's function or any other means
page_source = await get_page_source(url)

# get LLM-ready text and pass it to your LLM prompt template
llm_text = await get_processed_text(page_source, url)

# prompt template
prompt_format = """extract the product name, product link, image link and price for all the products given in the below webpage. The format should be:
{{
  "1": {{
    "Product Name": ,
    "Product Link": ,
    "Image Link": ,
    "Price":
  }},
  "2": {{
    "Product Name": ,
    ...
  }},
}}

webpage:
{llm_friendly_webpage_text}
"""

# Calculate tokens and truncate llm_text to fit your model's context length and your requirements; sometimes you may need only the initial part of the webpage.
# Below we manually truncate to 40000 characters. Create a separate function as per your needs.
prompt = prompt_format.format(llm_friendly_webpage_text=llm_text[:40000])

api_key =
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}
payload = {
    "model": "gpt-4o-mini",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                }
            ]}],
    'seed': 0,
    "temperature": 0,
    "top_p": 0.001,
    # "max_tokens": 1024,  # uncomment to limit the output tokens; note this may leave the output JSON structure incomplete
    "n": 1,
    "frequency_penalty": 0, "presence_penalty": 0
}

response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

print(response.json()['choices'][0]['message']['content'])
```
Output:
```
{
  "1": {
    "Product Name": "SÖDERHAMN Corner sofa, 6-seat",
    "Product Link": "https://www.ikea.com/in/en/p/soederhamn-corner-sofa-6-seat-viarp-beige-brown-s69305895/",
    "Image Link": "https://www.ikea.com/in/en/images/products/soederhamn-corner-sofa-6-seat-viarp-beige-brown__0802771_pe768584_s5.jpg?f=xxs",
    "Price": "Rs.1,40,080"
  },
  "2": {
    "Product Name": "HOLMSUND Corner sofa-bed",
    "Product Link": "https://www.ikea.com/in/en/p/holmsund-corner-sofa-bed-borgunda-dark-grey-s49516894/",
    "Image Link": "https://www.ikea.com/in/en/images/products/holmsund-corner-sofa-bed-borgunda-dark-grey__1212713_pe910718_s5.jpg?f=xxs",
    "Price": "Rs.69,990"
  },
  "3": {
    "Product Name": "JÄTTEBO U-shaped sofa, 7-seat",
    "Product Link": "https://www.ikea.com/in/en/p/jaettebo-u-shaped-sofa-7-seat-with-chaise-longue-right-with-headrests-tonerud-grey-s39510618/",
    "Image Link": "https://www.ikea.com/in/en/images/products/jaettebo-u-shaped-sofa-7-seat-with-chaise-longue-right-with-headrests-tonerud-grey__1179836_pe896109_s5.jpg?f=xxs",
    "Price": "Rs.2,60,000"
  },
  "4": {
    "Product Name": "SÖDERHAMN Corner sofa, 4-seat",
    "Product Link": "https://www.ikea.com/in/en/p/soederhamn-corner-sofa-4-seat-with-open-end-tonerud-red-s09514420/",
    "Image Link": "https://www.ikea.com/in/en/images/products/soederhamn-corner-sofa-4-seat-with-open-end-tonerud-red__1213815_pe911323_s5.jpg?f=xxs",
"Price": "Rs.98,540" 127 | }, 128 | "5": { 129 | "Product Name": "JÄTTEBO Mod crnr sofa 2,5-seat w chaise lng", 130 | "Product Link": "https://www.ikea.com/in/en/p/jaettebo-mod-crnr-sofa-2-5-seat-w-chaise-lng-right-samsala-grey-beige-s09485173/", 131 | "Image Link": "https://www.ikea.com/in/en/images/products/jaettebo-mod-crnr-sofa-2-5-seat-w-chaise-lng-right-samsala-grey-beige__1109627_pe870119_s5.jpg?f=xxs", 132 | "Price": "Rs.1,32,000" 133 | }, 134 | "6": { 135 | "Product Name": "JÄTTEBO Modular corner sofa, 6 seat", 136 | "Product Link": "https://www.ikea.com/in/en/p/jaettebo-modular-corner-sofa-6-seat-samsala-dark-yellow-green-s09485248/", 137 | "Image Link": "https://www.ikea.com/in/en/images/products/jaettebo-modular-corner-sofa-6-seat-samsala-dark-yellow-green__1109619_pe870109_s5.jpg?f=xxs", 138 | "Price": "Rs.2,06,000" 139 | }, 140 | "7": { 141 | "Product Name": "SÖDERHAMN Corner sofa, 3-seat", 142 | "Product Link": "https://www.ikea.com/in/en/p/soederhamn-corner-sofa-3-seat-viarp-beige-brown-s09305884/", 143 | "Image Link": "https://www.ikea.com/in/en/images/products/soederhamn-corner-sofa-3-seat-viarp-beige-brown__0802711_pe768555_s5.jpg?f=xxs", 144 | "Price": "Rs.91,000" 145 | }, 146 | ......} 147 | ``` 148 | 149 | ### Documentation: 150 | https://github.com/m92vyas/llm-reader/wiki/Documentation 151 | 152 | 153 | ### To Scrape without getting Blocked: 154 | - Apart from the open source option shared here, i am in the process of creating a paid API service that handles website blocking, dynamic content etc. 155 | - If you are interested you can connect with me (view contact details in my profile) for API trial or for any feature request. 156 | - It will also have parsing solutions for various documents like pdfs/docx, full website crawling & parsing, video, audio with complete RAG solution which can handle and retrieve complex layouts, images, tables, math equations etc. (RAG as a service) 157 | - I can develop a custom solution as per your requirements. 158 | 159 | 160 | ### What if the extracted results are inaccurate: 161 | - Some websites' structure can cause the LLM to misinterpret certain fields like it may assign the image link of the next product to the previous product while extractions. 162 | - You can connect with me to resolve such issues. The HTML cleaning code has to be modified as per the inaccuracy and then things will work for that website. 163 | - As the code is open sourced you can modify the code and handle such issues which is not possible for closed sourced options. If you are using any paid solution to avoid getting blocked you can get only the source HTML from the paid provider and use the modified cleaning code to avoid such inaccuracies. 164 | - If you understand web scraping script you can modify the `get_processed_text` function. It generally involves finding the css selector or xpath that will help you to separate out sections of the webpage that have issues (like separate out product wise) and then use some delimiter between them and merge them to get the page content. 165 | 166 | ### Support & Feedback: 167 | - Share and consider giving a Star if you found this repo helpful. 168 | - I am available for freelance work: maharishi92vyas@gmail.com / https://www.linkedin.com/in/maharishi-vyas 169 | - Also try out the other repo [AI-web_scraper](https://github.com/m92vyas/AI-web_scraper) and leave a Star there if you find it useful. 170 | - Open any issues or feature request. 

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.12.3
inscriptis==2.5.0
minify_html==0.15.0
selenium==4.23.1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(
    name = 'llm-reader'
    , version = '1'
    , license = 'MIT License'
    , author = "Maharishi Vyas"
    , author_email = 'maharishi92vyas@gmail.com'
    , packages = find_packages('src')
    , package_dir = {'': 'src'}
    , url = 'https://github.com/m92vyas/llm-reader.git'
    , keywords = 'url to llm ready input text'
    , install_requires = [
        'selenium',
        'beautifulsoup4',
        'inscriptis',
        'minify_html'
    ]
    , include_package_data=True
)
--------------------------------------------------------------------------------
/src/url_to_llm_text/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m92vyas/llm-reader/e25203ac9eeea2ed715b968927f4fc3ef366b004/src/url_to_llm_text/__init__.py
--------------------------------------------------------------------------------
/src/url_to_llm_text/get_html_text.py:
--------------------------------------------------------------------------------
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

async def get_page_source(url: str,
                          wait: float = 1.5,
                          headless: bool = True,
                          user_agent: str = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"
                          ) -> str:
    """
    Get html text using selenium.

    Args:
        url (str): the url from which html content is to be extracted
        wait (float): time in seconds to wait after the page loads so that dynamically rendered content can appear. Default is 1.5 sec.
        headless (bool): use a headless browser or not. Default True
        user_agent (str): user agent string. Default is "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"
default "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166" 18 | 19 | Returns (str): 20 | html text 21 | """ 22 | try: 23 | # check if using google colab 24 | using_colab = False 25 | try: 26 | import google.colab 27 | using_colab = True 28 | except: 29 | using_colab = False 30 | # add driver options 31 | options = webdriver.ChromeOptions() 32 | if headless: 33 | options.add_argument('--headless') 34 | options.add_argument(f'--user-agent={user_agent}') 35 | if using_colab: 36 | options.add_argument('--no-sandbox') 37 | options.add_argument('--disable-dev-shm-usage') 38 | 39 | driver = webdriver.Chrome(options=options) 40 | driver.get(url) 41 | driver.implicitly_wait(wait) 42 | 43 | return driver.page_source 44 | except Exception as e: 45 | print('Error while getting page source: ', e) 46 | return '' 47 | -------------------------------------------------------------------------------- /src/url_to_llm_text/get_llm_input_text.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | from urllib.parse import urljoin 4 | from minify_html import minify 5 | from inscriptis import get_text 6 | 7 | async def get_processed_text(page_source: str, base_url: str, 8 | html_parser: str ='lxml', 9 | keep_images: bool =True, remove_svg_image: bool =True, remove_gif_image: bool =True, remove_image_types: list =[], 10 | keep_webpage_links: bool =True, 11 | remove_script_tag: bool =True, remove_style_tag: bool =True, remove_tags: list =[] 12 | ) -> str: 13 | """ 14 | process html text. This helps the LLM to easily extract/scrape data especially image links and web links. 15 | 16 | Args: 17 | page_source (str): html source text 18 | base_url (str): url of the html source. 19 | html_parser (str): which beautifulsoup html parser to use, defaults to 'lxml' 20 | keep_images (bool): keep image links. If False will remove image links from the text saving tokens to be processed by LLM. Default True 21 | remove_svg_image (bool): remove .svg image. usually not useful while scraping. default True 22 | remove_gif_image (bool): remove .gif image. usually not useful while scraping. default True 23 | remove_image_types (list): add any image extensions which you want to remove inside a list. eg: [.png]. Default [] 24 | keep_webpage_links (bool): keep webpage links. if scraping job does not require links then can remove them to reduce input token count to LLM. Default True 25 | remove_script_tag (bool): True 26 | remove_style_tag (bool): =True 27 | remove_tags (list): = list of tags to be remove. 

    Returns (str):
        LLM ready input web page text
    """
    try:
        soup = BeautifulSoup(page_source, html_parser)

        # -------remove tags----------
        remove_tag = []
        if remove_script_tag:
            remove_tag.append('script')
        if remove_style_tag:
            remove_tag.append('style')
        remove_tag.extend(remove_tags)
        remove_tag = list(set(remove_tag))
        for tag in soup.find_all(remove_tag):
            try:
                tag.extract()
            except Exception as e:
                print('Error while removing tag: ', e)
                continue

        # --------process image links--------
        remove_image_type = []
        if remove_svg_image:
            remove_image_type.append('.svg')
        if remove_gif_image:
            remove_image_type.append('.gif')
        remove_image_type.extend(remove_image_types)
        remove_image_type = list(set(remove_image_type))
        for image in soup.find_all('img'):
            try:
                if not keep_images:
                    image.replace_with('')
                else:
                    image_link = image.get('src')
                    type_replaced = False
                    if isinstance(image_link, str):
                        if remove_image_type:
                            for image_type in remove_image_type:
                                if not type_replaced and image_type in image_link:
                                    image.replace_with('')
                                    type_replaced = True
                        if not type_replaced:
                            # keep the image as an absolute url so the LLM can return it directly
                            image.replace_with('\n' + urljoin(base_url, image_link) + ' ')
            except Exception as e:
                print('Error while getting image link: ', e)
                continue

        # ----------process website links-----------
        for link in soup.find_all('a', href=True):
            try:
                if not keep_webpage_links:
                    link.replace_with('')
                else:
                    # keep the anchor text together with its absolute url
                    link.replace_with(link.text + ': ' + urljoin(base_url, link['href']) + ' ')
            except Exception as e:
                print('Error while getting webpage link: ', e)
                continue

        # -----------change text structure-----------
        body_content = soup.find('body')
        if body_content:
            try:
                minimized_body = minify(str(body_content))
                text = get_text(minimized_body)
            except Exception:
                text = get_text(str(body_content))
        else:
            text = soup.get_text()
        return text

    except Exception as e:
        print('Error while getting processed text: ', e)
        return ''
--------------------------------------------------------------------------------