├── .gitignore
├── README.md
├── requirements.txt
├── setup.py
└── src
    └── url_to_llm_text
        ├── __init__.py
        ├── get_html_text.py
        └── get_llm_input_text.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Webpage to LLM ready input text
Pre-processing the HTML source text before giving it to the LLM as input improves extraction/scraping accuracy, especially if you want to extract webpage and image links, which most scraping jobs (such as scraping an e-commerce website) require.

Use this library to turn any HTML source text into LLM-friendly text. It is a fully open-source alternative to the Jina Reader API and the Firecrawl API.

You can also refer to my other repo [AI-web_scraper](https://github.com/m92vyas/AI-web_scraper) for ready-made scraping tools that scrape multiple links or combine web search and scraping from a single query. It supports multiple LLMs and web search, and extracts data as per your written instructions.

### Install:
```bash
pip install git+https://github.com/m92vyas/llm-reader.git
```

### Import:
```python
from url_to_llm_text.get_html_text import get_page_source  # you can also use your own code or other services to get the page source
from url_to_llm_text.get_llm_input_text import get_processed_text  # pass the html source text to get LLM-ready text
```

### Get processed LLM input text:

```python
url =

# get the html source text
# the first call to the function below takes some time because it loads the web driver; subsequent runs are faster
# you can use your own function to get the html source text

page_source = await get_page_source(url)

# get LLM-ready input text from the html source text

llm_text = await get_processed_text(page_source, url)
print(llm_text)
```
### Example Usage:
Suppose we want to scrape the product name, main product page link, image link and price from the URL "https://www.ikea.com/in/en/cat/corner-sofas-10671/" using any OpenAI model.
```python
import requests
from url_to_llm_text.get_html_text import get_page_source
from url_to_llm_text.get_llm_input_text import get_processed_text

url = "https://www.ikea.com/in/en/cat/corner-sofas-10671/"

# get the page html source text using this library's function or any other means
page_source = await get_page_source(url)

# get LLM-ready text and pass it to your LLM prompt template
llm_text = await get_processed_text(page_source, url)

# prompt template
prompt_format = """extract the product name, product link, image link and price for all the products given in the below webpage. The format should be:
{{
  "1": {{
    "Product Name": ,
    "Product Link": ,
    "Image Link": ,
    "Price":
  }},
  "2": {{
    "Product Name": ,
    ...
  }},
}}

webpage:
{llm_friendly_webpage_text}
"""

# Calculate tokens and truncate llm_text to fit your model's context length and your requirements; sometimes you may need only the initial part of the webpage.
# Below we manually truncate to 40000 characters. Create a separate function as per your needs.
prompt = prompt_format.format(llm_friendly_webpage_text=llm_text[:40000])

api_key =
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}
payload = {
    "model": "gpt-4o-mini",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                }
            ]}],
    'seed': 0,
    "temperature": 0,
    "top_p": 0.001,
    # "max_tokens": 1024,  # uncomment to limit the output tokens; note this may leave the output JSON structure incomplete
    "n": 1,
    "frequency_penalty": 0, "presence_penalty": 0
}

response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

print(response.json()['choices'][0]['message']['content'])
```
Output:
```
{
  "1": {
    "Product Name": "SÖDERHAMN Corner sofa, 6-seat",
    "Product Link": "https://www.ikea.com/in/en/p/soederhamn-corner-sofa-6-seat-viarp-beige-brown-s69305895/",
    "Image Link": "https://www.ikea.com/in/en/images/products/soederhamn-corner-sofa-6-seat-viarp-beige-brown__0802771_pe768584_s5.jpg?f=xxs",
    "Price": "Rs.1,40,080"
  },
  "2": {
    "Product Name": "HOLMSUND Corner sofa-bed",
    "Product Link": "https://www.ikea.com/in/en/p/holmsund-corner-sofa-bed-borgunda-dark-grey-s49516894/",
    "Image Link": "https://www.ikea.com/in/en/images/products/holmsund-corner-sofa-bed-borgunda-dark-grey__1212713_pe910718_s5.jpg?f=xxs",
    "Price": "Rs.69,990"
  },
  "3": {
    "Product Name": "JÄTTEBO U-shaped sofa, 7-seat",
    "Product Link": "https://www.ikea.com/in/en/p/jaettebo-u-shaped-sofa-7-seat-with-chaise-longue-right-with-headrests-tonerud-grey-s39510618/",
    "Image Link": "https://www.ikea.com/in/en/images/products/jaettebo-u-shaped-sofa-7-seat-with-chaise-longue-right-with-headrests-tonerud-grey__1179836_pe896109_s5.jpg?f=xxs",
    "Price": "Rs.2,60,000"
  },
  "4": {
    "Product Name": "SÖDERHAMN Corner sofa, 4-seat",
    "Product Link": "https://www.ikea.com/in/en/p/soederhamn-corner-sofa-4-seat-with-open-end-tonerud-red-s09514420/",
    "Image Link": "https://www.ikea.com/in/en/images/products/soederhamn-corner-sofa-4-seat-with-open-end-tonerud-red__1213815_pe911323_s5.jpg?f=xxs",
"Price": "Rs.98,540" 127 | }, 128 | "5": { 129 | "Product Name": "JÄTTEBO Mod crnr sofa 2,5-seat w chaise lng", 130 | "Product Link": "https://www.ikea.com/in/en/p/jaettebo-mod-crnr-sofa-2-5-seat-w-chaise-lng-right-samsala-grey-beige-s09485173/", 131 | "Image Link": "https://www.ikea.com/in/en/images/products/jaettebo-mod-crnr-sofa-2-5-seat-w-chaise-lng-right-samsala-grey-beige__1109627_pe870119_s5.jpg?f=xxs", 132 | "Price": "Rs.1,32,000" 133 | }, 134 | "6": { 135 | "Product Name": "JÄTTEBO Modular corner sofa, 6 seat", 136 | "Product Link": "https://www.ikea.com/in/en/p/jaettebo-modular-corner-sofa-6-seat-samsala-dark-yellow-green-s09485248/", 137 | "Image Link": "https://www.ikea.com/in/en/images/products/jaettebo-modular-corner-sofa-6-seat-samsala-dark-yellow-green__1109619_pe870109_s5.jpg?f=xxs", 138 | "Price": "Rs.2,06,000" 139 | }, 140 | "7": { 141 | "Product Name": "SÖDERHAMN Corner sofa, 3-seat", 142 | "Product Link": "https://www.ikea.com/in/en/p/soederhamn-corner-sofa-3-seat-viarp-beige-brown-s09305884/", 143 | "Image Link": "https://www.ikea.com/in/en/images/products/soederhamn-corner-sofa-3-seat-viarp-beige-brown__0802711_pe768555_s5.jpg?f=xxs", 144 | "Price": "Rs.91,000" 145 | }, 146 | ......} 147 | ``` 148 | 149 | ### Documentation: 150 | https://github.com/m92vyas/llm-reader/wiki/Documentation 151 | 152 | 153 | ### To Scrape without getting Blocked: 154 | - Apart from the open source option shared here, i am in the process of creating a paid API service that handles website blocking, dynamic content etc. 155 | - If you are interested you can connect with me (view contact details in my profile) for API trial or for any feature request. 156 | - It will also have parsing solutions for various documents like pdfs/docx, full website crawling & parsing, video, audio with complete RAG solution which can handle and retrieve complex layouts, images, tables, math equations etc. (RAG as a service) 157 | - I can develop a custom solution as per your requirements. 158 | 159 | 160 | ### What if the extracted results are inaccurate: 161 | - Some websites' structure can cause the LLM to misinterpret certain fields like it may assign the image link of the next product to the previous product while extractions. 162 | - You can connect with me to resolve such issues. The HTML cleaning code has to be modified as per the inaccuracy and then things will work for that website. 163 | - As the code is open sourced you can modify the code and handle such issues which is not possible for closed sourced options. If you are using any paid solution to avoid getting blocked you can get only the source HTML from the paid provider and use the modified cleaning code to avoid such inaccuracies. 164 | - If you understand web scraping script you can modify the `get_processed_text` function. It generally involves finding the css selector or xpath that will help you to separate out sections of the webpage that have issues (like separate out product wise) and then use some delimiter between them and merge them to get the page content. 165 | 166 | ### Support & Feedback: 167 | - Share and consider giving a Star if you found this repo helpful. 168 | - I am available for freelance work: maharishi92vyas@gmail.com / https://www.linkedin.com/in/maharishi-vyas 169 | - Also try out the other repo [AI-web_scraper](https://github.com/m92vyas/AI-web_scraper) and leave a Star there if you find it useful. 170 | - Open any issues or feature request. 

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.12.3
inscriptis==2.5.0
minify_html==0.15.0
selenium==4.23.1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(
    name = 'llm-reader'
    , version = '1'
    , license = 'MIT License'
    , author = "Maharishi Vyas"
    , author_email = 'maharishi92vyas@gmail.com'
    , packages = find_packages('src')
    , package_dir = {'': 'src'}
    , url = 'https://github.com/m92vyas/llm-reader.git'
    , keywords = 'url to llm ready input text'
    , install_requires = [
        'selenium',
        'beautifulsoup4',
        'inscriptis',
        'minify_html'
    ]
    , include_package_data=True
)
--------------------------------------------------------------------------------
/src/url_to_llm_text/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m92vyas/llm-reader/e25203ac9eeea2ed715b968927f4fc3ef366b004/src/url_to_llm_text/__init__.py
--------------------------------------------------------------------------------
/src/url_to_llm_text/get_html_text.py:
--------------------------------------------------------------------------------
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

async def get_page_source(url: str,
                          wait: float = 1.5,
                          headless: bool = True,
                          user_agent: str = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"
                          ) -> str:
    """
    Get html text using selenium.

    Args:
        url (str): the url from which html content is to be extracted
        wait (float): time in seconds to wait after the page loads so that dynamically rendered content can appear. Default is 1.5 sec.
        headless (bool): use a headless browser or not. Default True
        user_agent (str): user agent string. Default is "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"
default "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166" 18 | 19 | Returns (str): 20 | html text 21 | """ 22 | try: 23 | # check if using google colab 24 | using_colab = False 25 | try: 26 | import google.colab 27 | using_colab = True 28 | except: 29 | using_colab = False 30 | # add driver options 31 | options = webdriver.ChromeOptions() 32 | if headless: 33 | options.add_argument('--headless') 34 | options.add_argument(f'--user-agent={user_agent}') 35 | if using_colab: 36 | options.add_argument('--no-sandbox') 37 | options.add_argument('--disable-dev-shm-usage') 38 | 39 | driver = webdriver.Chrome(options=options) 40 | driver.get(url) 41 | driver.implicitly_wait(wait) 42 | 43 | return driver.page_source 44 | except Exception as e: 45 | print('Error while getting page source: ', e) 46 | return '' 47 | -------------------------------------------------------------------------------- /src/url_to_llm_text/get_llm_input_text.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | from urllib.parse import urljoin 4 | from minify_html import minify 5 | from inscriptis import get_text 6 | 7 | async def get_processed_text(page_source: str, base_url: str, 8 | html_parser: str ='lxml', 9 | keep_images: bool =True, remove_svg_image: bool =True, remove_gif_image: bool =True, remove_image_types: list =[], 10 | keep_webpage_links: bool =True, 11 | remove_script_tag: bool =True, remove_style_tag: bool =True, remove_tags: list =[] 12 | ) -> str: 13 | """ 14 | process html text. This helps the LLM to easily extract/scrape data especially image links and web links. 15 | 16 | Args: 17 | page_source (str): html source text 18 | base_url (str): url of the html source. 19 | html_parser (str): which beautifulsoup html parser to use, defaults to 'lxml' 20 | keep_images (bool): keep image links. If False will remove image links from the text saving tokens to be processed by LLM. Default True 21 | remove_svg_image (bool): remove .svg image. usually not useful while scraping. default True 22 | remove_gif_image (bool): remove .gif image. usually not useful while scraping. default True 23 | remove_image_types (list): add any image extensions which you want to remove inside a list. eg: [.png]. Default [] 24 | keep_webpage_links (bool): keep webpage links. if scraping job does not require links then can remove them to reduce input token count to LLM. Default True 25 | remove_script_tag (bool): True 26 | remove_style_tag (bool): =True 27 | remove_tags (list): = list of tags to be remove. 

    Returns (str):
        LLM ready input web page text
    """
    try:
        soup = BeautifulSoup(page_source, html_parser)

        # -------remove tags----------
        remove_tag = []
        if remove_script_tag:
            remove_tag.append('script')
        if remove_style_tag:
            remove_tag.append('style')
        remove_tag.extend(remove_tags)
        remove_tag = list(set(remove_tag))
        for tag in soup.find_all(remove_tag):
            try:
                tag.extract()
            except Exception as e:
                print('Error while removing tag: ', e)
                continue

        # --------process image links--------
        remove_image_type = []
        if remove_svg_image:
            remove_image_type.append('.svg')
        if remove_gif_image:
            remove_image_type.append('.gif')
        remove_image_type.extend(remove_image_types)
        remove_image_type = list(set(remove_image_type))
        for image in soup.find_all('img'):
            try:
                if not keep_images:
                    image.replace_with('')
                else:
                    image_link = image.get('src')
                    type_replaced = False
                    if isinstance(image_link, str):
                        if remove_image_type:
                            for image_type in remove_image_type:
                                if not type_replaced and image_type in image_link:
                                    image.replace_with('')
                                    type_replaced = True
                        if not type_replaced:
                            # keep the image as an absolute url so the LLM can return it directly
                            image.replace_with('\n' + urljoin(base_url, image_link) + ' ')
            except Exception as e:
                print('Error while getting image link: ', e)
                continue

        # ----------process website links-----------
        for link in soup.find_all('a', href=True):
            try:
                if not keep_webpage_links:
                    link.replace_with('')
                else:
                    # keep the anchor text together with its absolute url
                    link.replace_with(link.text + ': ' + urljoin(base_url, link['href']) + ' ')
            except Exception as e:
                print('Error while getting webpage link: ', e)
                continue

        # -----------change text structure-----------
        body_content = soup.find('body')
        if body_content:
            try:
                minimized_body = minify(str(body_content))
                text = get_text(minimized_body)
            except Exception:
                text = get_text(str(body_content))
        else:
            text = soup.get_text()
        return text

    except Exception as e:
        print('Error while getting processed text: ', e)
        return ''
--------------------------------------------------------------------------------