├── tests ├── __init__.py ├── files │ ├── example.ini │ ├── example.json │ ├── example.txt │ ├── example.csv │ ├── example.docx │ ├── example.h │ ├── example.jpg │ ├── example.mp3 │ ├── example.mp4 │ ├── example.pdf │ ├── example.png │ ├── example.pptx │ ├── example.xlsx │ ├── example.zip │ ├── example_pdf_with_no_extension │ ├── example.py │ ├── example.cpp │ ├── example.css │ ├── example.tsx │ ├── example.ipynb │ ├── example.html │ └── example.md ├── test_extractor.py ├── test_chunker.py ├── test_core.py └── test_scraper.py ├── .gitignore ├── requirements.txt ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── python-ci.yml ├── setup.py ├── thepipe ├── __init__.py ├── extract.py ├── chunker.py ├── core.py └── scraper.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/files/example.ini: -------------------------------------------------------------------------------- 1 | [Greetings] 2 | say = Hello, World! -------------------------------------------------------------------------------- /tests/files/example.json: -------------------------------------------------------------------------------- 1 | { 2 | "greeting": "Hello, World!" 3 | } -------------------------------------------------------------------------------- /tests/files/example.txt: -------------------------------------------------------------------------------- 1 | Just a plain text, 2 | saying Hello, World! -------------------------------------------------------------------------------- /tests/files/example.csv: -------------------------------------------------------------------------------- 1 | num,numtext 2 | 1,one 3 | 2,two 4 | 3,three 5 | 4,four 6 | 5,five -------------------------------------------------------------------------------- /tests/files/example.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.docx -------------------------------------------------------------------------------- /tests/files/example.h: -------------------------------------------------------------------------------- 1 | #ifndef GREET_H 2 | #define GREET_H 3 | 4 | void greet(); 5 | 6 | #endif -------------------------------------------------------------------------------- /tests/files/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.jpg -------------------------------------------------------------------------------- /tests/files/example.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.mp3 -------------------------------------------------------------------------------- /tests/files/example.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.mp4 -------------------------------------------------------------------------------- /tests/files/example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.pdf -------------------------------------------------------------------------------- /tests/files/example.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.png -------------------------------------------------------------------------------- /tests/files/example.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.pptx -------------------------------------------------------------------------------- /tests/files/example.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.xlsx -------------------------------------------------------------------------------- /tests/files/example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.zip -------------------------------------------------------------------------------- /tests/files/example_pdf_with_no_extension: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example_pdf_with_no_extension -------------------------------------------------------------------------------- /tests/files/example.py: -------------------------------------------------------------------------------- 1 | class ExampleClass: 2 | def greet(): 3 | print("Hello, World!") 4 | 5 | 6 | ExampleClass.greet() 7 | -------------------------------------------------------------------------------- /tests/files/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | int main() { 5 | cout << "Hello, World!"; 6 | return 0; 7 | } -------------------------------------------------------------------------------- /tests/files/example.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #f0f0f2; 3 | margin: 0; 4 | padding: 0; 5 | font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; 6 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | tests/__pycache__/ 3 | thepipeapi/__pycache__/ 4 | .pytest_cache/ 5 | .vscode/ 6 | outputs/ 7 | logs/ 8 | benchmarks/ 9 | node/node_modules/ 10 | thepipeapi/ctags.exe 11 | thepipe.bat 12 | *.egg-info/ 13 | build/ 14 | dist/ 15 | youtube_temp/ 16 | images/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.12 2 | markdownify==0.12.1 3 | magika>=0.5.0 4 | moviepy==1.0.3 5 | numpy>=1.23 6 | openai>=1.51.0 7 | openpyxl>=3.1 8 | pandas>=2.0 9 | pillow>=9.4 10 | playwright>=1.47.0 11 | pydantic>=2.7 12 | pymupdf4llm==0.0.21 13 | PyMuPDF==1.25.5 14 | python-docx>=1.1 15 | python-dotenv>=1.0 16 | python-pptx>=0.6 17 | pytube>=15.0 18 | requests>=2.31 19 | -------------------------------------------------------------------------------- /tests/files/example.tsx: -------------------------------------------------------------------------------- 1 | // pages/index.tsx 2 | import type { NextPage } from 'next'; 3 | import Head from 'next/head'; 4 | import Image from 'next/image'; 5 | import "../app/globals.css"; 6 | 7 | 
const Home: NextPage = () => { 8 | return ( 9 | <> 10 | The Pipe 11 | 12 | 13 | 14 | ); 15 | }; 16 | 17 | export default Home; -------------------------------------------------------------------------------- /tests/files/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ff774e22", 6 | "metadata": {}, 7 | "source": [ 8 | "**Hello**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "3e261faf", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAIAAAD8GO2jAAAAKElEQVR4nO3NMQEAAAjDMMC/ZzDBvlRA01vZJvwHAAAAAAAAAAAAbx2jxAE/i2AjOgAAAABJRU5ErkJggg==", 20 | "text/plain": [ 21 | "out" 22 | ] 23 | }, 24 | "metadata": {}, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "print('X')" 30 | ] 31 | } 32 | ], 33 | "metadata": {}, 34 | "nbformat": 4, 35 | "nbformat_minor": 5 36 | } 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Emmett McFarlane 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # GitHub recommends pinning actions to a commit SHA. 7 | # To get a newer version, you will need to update the SHA. 8 | # You can also reference a tag or branch, but the action may change without warning. 
9 | 10 | name: Upload Python Package 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | jobs: 17 | deploy: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.x' 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install build 29 | - name: Build package 30 | run: python -m build 31 | - name: Publish package 32 | uses: pypa/gh-action-pypi-publish@release/v1 33 | with: 34 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | def read_requirements(file): 5 | with open(file, encoding="utf-8") as f: 6 | return [line.strip() for line in f if line.strip() and not line.startswith("#")] 7 | 8 | 9 | EXTRAS = { 10 | "audio": ["openai-whisper>=20231117"], 11 | "semantic": ["sentence-transformers>=2.2.2"], 12 | "llama-index": ["llama-index>=0.10.50,<0.11"], 13 | "gpu": [ 14 | "torch>=2.5,<2.6", 15 | "torchvision>=0.20,<0.21", 16 | "torchaudio>=2.5,<2.6", 17 | "sentence-transformers>=2.2.2", 18 | "openai-whisper>=20231117", 19 | ], 20 | } 21 | EXTRAS["all"] = sorted({pkg for deps in EXTRAS.values() for pkg in deps}) 22 | 23 | 24 | setup( 25 | name="thepipe_api", 26 | version="1.7.1", 27 | author="Emmett McFarlane", 28 | author_email="emmett@thepi.pe", 29 | description="Get clean data from tricky documents, powered by VLMs.", 30 | long_description=open("README.md", encoding="utf-8").read(), 31 | long_description_content_type="text/markdown", 32 | url="https://github.com/emcf/thepipe", 33 | packages=find_packages(), 34 | classifiers=[ 35 | "Programming Language :: Python :: 3", 36 | "License :: OSI Approved :: MIT License", 37 | "Operating System :: OS Independent", 38 | ], 39 | python_requires=">=3.9", 40 | install_requires=read_requirements("requirements.txt"), 41 | extras_require=EXTRAS, 42 | include_package_data=True, 43 | entry_points={ 44 | "console_scripts": [ 45 | "thepipe=thepipe.__init__:main", 46 | ], 47 | }, 48 | ) 49 | -------------------------------------------------------------------------------- /.github/workflows/python-ci.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | timeout-minutes: 30 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-node@v4 20 | with: 21 | node-version: 18 22 | - name: Install playwright 23 | run: | 24 | npm install playwright 25 | npm install playwright-core 26 | - name: Install Playwright 27 | run: npx playwright install --with-deps 28 | - name: Set up Python 3.10 29 | uses: actions/setup-python@v3 30 | with: 31 | python-version: "3.10" 32 | - name: Install FFmpeg 33 | run: sudo apt-get update && sudo apt-get install -y ffmpeg 34 | - name: Install pip dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | pip install flake8 coverage 38 | pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu 39 | pip install -e .[all] 40 | - name: Install playwright browsers 41 | run: python -m playwright install 42 | - name: Lint with flake8 43 | run: | 44 
| # stop the build if there are Python syntax errors or undefined names 45 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 46 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 47 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 48 | - name: Test with unittest and generate coverage report 49 | env: 50 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 51 | run: | 52 | coverage run -m unittest discover 53 | coverage xml -i 54 | - name: Upload coverage reports to Codecov 55 | uses: codecov/codecov-action@v4.0.1 56 | with: 57 | token: ${{ secrets.CODECOV_TOKEN }} 58 | slug: emcf/thepipe 59 | -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | # test_extractor.py 2 | 3 | import unittest 4 | import sys 5 | import os 6 | import json 7 | from openai import OpenAI 8 | 9 | sys.path.append("..") 10 | from thepipe.extract import extract, extract_json_from_response 11 | from thepipe.core import Chunk 12 | 13 | 14 | class TestExtractor(unittest.TestCase): 15 | def setUp(self): 16 | self.example_receipt = """# Receipt 17 | Store Name: Grocery Mart 18 | ## Total 19 | Subtotal: $13.49 USD 20 | Tax (8%): $1.08 USD 21 | Total: $14.57 USD 22 | """ 23 | 24 | self.schema = { 25 | "store_name": "string", 26 | "subtotal_usd": "float", 27 | "tax_usd": "float", 28 | "total_usd": "float", 29 | } 30 | 31 | self.chunks = [Chunk(path="receipt.md", text=self.example_receipt)] 32 | 33 | def test_extract_json_from_response(self): 34 | # List of test cases with expected results 35 | test_cases = [ 36 | # Case 1: JSON enclosed in triple backticks 37 | { 38 | "input": '```json\n{"key1": "value1", "key2": 2}\n```', 39 | "expected": {"key1": "value1", "key2": 2}, 40 | }, 41 | # Case 2: JSON directly in the response 42 | { 43 | "input": '{"key1": "value1", "key2": 2}', 44 | "expected": {"key1": "value1", "key2": 2}, 45 | }, 46 | # Case 3: Response contains multiple JSON objects 47 | { 48 | "input": 'Random text {"key1": "value1"} and another {"key2": 2}', 49 | "expected": [{"key1": "value1"}, {"key2": 2}], 50 | }, 51 | # Case 4: Response contains incomplete JSON 52 | { 53 | "input": 'Random text {"key1": "value1"} and another {"key2": 2', 54 | "expected": {"key1": "value1"}, 55 | }, 56 | ] 57 | 58 | for i, case in enumerate(test_cases): 59 | with self.subTest(i=i): 60 | result = extract_json_from_response(case["input"]) 61 | self.assertEqual(result, case["expected"]) 62 | 63 | def test_extract(self): 64 | # provide an explicit client so we cover the new parameter 65 | client = OpenAI() 66 | 67 | results, total_tokens_used = extract( 68 | chunks=self.chunks, # receipt 69 | schema=self.schema, 70 | openai_client=client, 71 | ) 72 | 73 | # Check if we got a result 74 | self.assertEqual(len(results), 1) 75 | result = results[0] 76 | 77 | print("test_extract result:", json.dumps(result, indent=2)) 78 | 79 | # Check if all expected fields are present 80 | expected_fields = ["store_name", "subtotal_usd", "tax_usd", "total_usd"] 81 | for field in expected_fields: 82 | self.assertIn(field, result) 83 | 84 | # Check some specific values 85 | self.assertEqual(result["store_name"], "Grocery Mart") 86 | self.assertEqual(result["subtotal_usd"], 13.49) 87 | self.assertEqual(result["tax_usd"], 1.08) 88 | self.assertEqual(result["total_usd"], 14.57) 89 | 90 | # Check if tokens were used 91 | 
self.assertGreater(total_tokens_used, 0) 92 | 93 | 94 | if __name__ == "__main__": 95 | unittest.main() 96 | -------------------------------------------------------------------------------- /tests/files/example.html: -------------------------------------------------------------------------------- HTML with Tables and JavaScript

Heading 1

Heading 2

Heading 3

This is a paragraph with some bold text and some italic text. You can also use strong text for emphasis.

Here is a simple unordered list:

And an ordered list:

1. First step
2. Second step
3. Third step

Sample Table

Name            Age   Country
John Doe        28    USA
Jane Smith      34    Canada
Alice Johnson   29    UK

Another Table Example

Product    Price   Quantity
Apples     $2      10
Bananas    $1      15
Cherries   $5      7
125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /tests/files/example.md: -------------------------------------------------------------------------------- 1 | ## 4.1. Driven Hydrodynamic Turbulence 2 | 3 | Figure 1 shows the mean-Mach relation found from timeaveraged PDFs over the full domain for driven hydro and driven MHD turbulence. We find that a value of in the func- a p 0.5 tion gives the best linear relations for the hydro case. y(M) 4 | For the volume fraction we find 5 | 6 | $$\mu_{V}=\,-0.36\,\mathrm{ln}\,[1\,+0.5{\mathcal M}^{2}]\,+0.10,$$ 7 | 8 | while for the mass fraction we find 9 | 10 | $$\mu_{M}=0.32\ln\,[1+0.5{\mathcal{M}}^{2}]-0.10.$$ 11 | 12 | The rms residuals for these fits are and 3 8.9 \# 10 6.5 \# 13 | , respectively. Because the density fluctuations in subsonic 3 10 turbulence are not produced by shocks, we have no reason to expect these relations to approach zero with Mach number. The mean-Mach pairs from the time-averaged PDFs fall very close to these relations. Over the full range of Mach numbers tested, however, the time-averaged means are smaller than those found by P97 ( ). To determine the 2 m p 0.5 ln [1 - 0.25M ] V,M 14 | magnitude of the effect that the driving may have had on the relations, we also compare values determined from hydrodynamic turbulence with (not shown), finding that k L pk /2p p 4 these points fall very close to the relations as k L pk /2p p 2 well. 15 | 16 | Also shown in the figure are the instantaneous mean-Mach pairs found from each of the eight subboxes. The scatter in these points, illustrated by 1 j error bars on the plot, is quite significant. The ensemble average of the 56 points for each run falls at a slightly lower Mach number than the value found from the time average over the full domain since the Mach number typically decreases on smaller scales. These averages differ slightly from those computed from time-averaged fulldomain PDFs, although they still fall close to the relations found. The 1 j scatter in the Mach number is 4%–8%, while in the sub-PDF means it is 16%–17% for the lowest Mach number run and 10%–14% for the remaining runs. This scatter puts some of the instantaneous sub-PDF values in the vicinity of the P97 relation. 17 | 18 | The scatter in and can be converted to scatter in the m m V M 19 | median density within the medium. For the ratio of the median to mean density within a cloud, , we can define and r rr ˜ ˜˜ 20 | for clouds where m is 1 j above or below the mean. The ratios r˜˜ ˜ ˜ /r p exp (2j ) and then gen- r /r p exp (2j ) V,- V, V,S M,- M, M,S 21 | erally increase with Mach number, ranging from 1.0 to 1.3. 22 | 23 | $4.2\cdot A$ . 24 | 25 | ## 4.2. Driven Mhd Turbulence 26 | 27 | Figure 1 also includes the mean-Mach relation for driven strong-field MHD turbulence, where we continue to use a p 0.5. The mean-Mach pairs from the time-averaged PDFs again fall very close to these relations, 28 | 29 | $$\mu_{v}=\,-0.29\,\ln\,[1\,+\,0.5\mathcal{M}^{2}]-0.06$$ 30 | for the volume fraction and 31 | $$\mu_{M}=0.28\ln\,[1+0.5{\mathcal{M}}^{2}]+0.07$$ 32 | $$(S)$$ 33 | 34 | ![0_image_0.png](0_image_0.png) 35 | 36 | $$(6)$$ 37 | 38 | found by P97. The rms residuals for these fits are 1.1 \# 39 | and , respectively. 2 2 10 1.6 \# 10 The instantaneous mean-Mach pairs found from subboxes have more scatter with a strong magnetic field than they did in the purely hydrodynamic case. 
The 1 j scatter in the Mach number is 5%–8%, while for the sub-PDF means it is 22%– 40 | 24% for the lowest Mach number run and 13%–18% for the remaining runs. Converting this to scatter in the median density, we find that the ratios and range from 1.1 r˜˜ ˜ ˜ V,- /r r V, M,- /rM, 41 | to 1.3. This again puts some of the instantaneous sub-PDF values in the vicinity of the P97 mean-Mach relation. 42 | 43 | The time-averaged mean-Mach relations found for hydro and MHD differ, as one should expect due to differences in the shock-jump conditions. However, the sub-PDF values overlap substantially, making them difficult to distinguish observationally. The time-averaged relations fall a bit less than 1 j (computed from the sub-PDF values) apart. 44 | 45 | ## 4.3. Decaying Mhd Turbulence 46 | 47 | $$\left(7\right)$$ 48 | $$({\mathfrak{s}})$$ 49 | 50 | As it seems likely that molecular clouds are transient entities, it may be more appropriate to study the PDF of decaying turbulence. In Figure 2, we show the evolution of the PDF in a decaying strong-field MHD turbulence run, initialized from a snapshot of fully developed turbulence from our highest Mach number driven run. Although this snapshot has a full-domain PDF mean roughly 1 j more extreme than the time-averaged driven relations, this should not affect the results. At first the small change in mean as the Mach number decreases causes a shallower slope than that of the driven relation. Once the mean begins to change appreciably, however, the slope becomes much steeper, crossing the driven relation at roughly M p 4.5. Although the slope shallows as low Mach numbers are reached, it does so only as the means become very small. 51 | -------------------------------------------------------------------------------- /thepipe/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import os 5 | import warnings 6 | from typing import Optional 7 | 8 | from openai import OpenAI 9 | 10 | from .scraper import scrape_directory, scrape_file, scrape_url 11 | from .core import DEFAULT_AI_MODEL, save_outputs 12 | 13 | 14 | # Argument parsing 15 | def parse_arguments() -> argparse.Namespace: # noqa: D401 – imperative is fine here 16 | """ 17 | Parse CLI flags. 18 | 19 | Returns 20 | ------- 21 | argparse.Namespace 22 | Parsed arguments. 23 | """ 24 | parser = argparse.ArgumentParser( 25 | prog="thepipe", 26 | description="Universal document/Web scraper with optional OpenAI extraction.", 27 | ) 28 | 29 | # Required source (file, directory, or URL) 30 | parser.add_argument( 31 | "source", 32 | help="File path, directory, or URL to scrape.", 33 | ) 34 | 35 | # Optional flags 36 | parser.add_argument( 37 | "-i", 38 | "--inclusion-pattern", 39 | dest="inclusion_pattern", 40 | default=None, 41 | help="Regex pattern – only files whose *full path* matches are scraped " 42 | "(applies to directory/zip scraping).", 43 | ) 44 | parser.add_argument( 45 | "-v", 46 | "--verbose", 47 | action="store_true", 48 | help="Enable verbose logging.", 49 | ) 50 | parser.add_argument( 51 | "--text-only", 52 | dest="text_only", 53 | action="store_true", 54 | help="Suppress images – output only extracted text.", 55 | ) 56 | 57 | # OpenAI-related flags 58 | parser.add_argument( 59 | "--openai-api-key", 60 | dest="openai_api_key", 61 | default=os.getenv("OPENAI_API_KEY"), 62 | help="OpenAI API key. 
If omitted, env variable OPENAI_API_KEY is used.", 63 | ) 64 | parser.add_argument( 65 | "--openai-base-url", 66 | dest="openai_base_url", 67 | default="https://api.openai.com/v1", 68 | help="Base URL for the OpenAI API (default: https://api.openai.com/v1).", 69 | ) 70 | parser.add_argument( 71 | "--openai-model", 72 | dest="openai_model", 73 | default=DEFAULT_AI_MODEL, 74 | help=f"Chat/VLM model to use (default: {DEFAULT_AI_MODEL}).", 75 | ) 76 | 77 | # Legacy flag (will be removed in future versions) 78 | parser.add_argument( 79 | "--ai-extraction", 80 | action="store_true", 81 | help=argparse.SUPPRESS, # hidden but still accepted 82 | ) 83 | 84 | return parser.parse_args() 85 | 86 | 87 | # OpenAI client factory 88 | def create_openai_client( 89 | *, 90 | api_key: Optional[str], 91 | base_url: str, 92 | enable_vlm: bool, 93 | ) -> Optional[OpenAI]: 94 | if api_key: 95 | # Normal path – user gave an explicit key 96 | return OpenAI(api_key=api_key, base_url=base_url) 97 | 98 | if enable_vlm: 99 | # Old flag: fall back to env vars 100 | warnings.warn( 101 | "--ai-extraction is deprecated; " 102 | "please use --openai-api-key and --openai-model " 103 | "(and optionally --openai-base-url) instead.", 104 | DeprecationWarning, 105 | stacklevel=2, 106 | ) 107 | return OpenAI(base_url=base_url, api_key=os.getenv("OPENAI_API_KEY")) 108 | 109 | # AI extraction disabled 110 | return None 111 | 112 | 113 | def main() -> None: 114 | """CLI entry point""" 115 | args = parse_arguments() 116 | 117 | # Instantiate the OpenAI client if requested 118 | openai_client = create_openai_client( 119 | api_key=args.openai_api_key, 120 | base_url=args.openai_base_url, 121 | enable_vlm=args.ai_extraction, 122 | ) 123 | 124 | # Delegate scraping based on source type 125 | if args.source.startswith(("http://", "https://")): 126 | chunks = scrape_url( 127 | args.source, 128 | verbose=args.verbose, 129 | openai_client=openai_client, 130 | model=args.openai_model, 131 | ) 132 | elif os.path.isdir(args.source): 133 | chunks = scrape_directory( 134 | dir_path=args.source, 135 | inclusion_pattern=args.inclusion_pattern, 136 | verbose=args.verbose, 137 | openai_client=openai_client, 138 | ) 139 | elif os.path.isfile(args.source): 140 | chunks = scrape_file( 141 | filepath=args.source, 142 | verbose=args.verbose, 143 | openai_client=openai_client, 144 | model=args.openai_model, 145 | ) 146 | else: 147 | raise ValueError(f"Invalid source: {args.source}") 148 | 149 | # Persist results 150 | save_outputs( 151 | chunks=chunks, 152 | verbose=args.verbose, 153 | text_only=args.text_only, 154 | output_folder="thepipe_output", 155 | ) 156 | 157 | if args.verbose: 158 | print(f"Scraping complete. 
Outputs saved to 'thepipe_output/'.") 159 | 160 | 161 | # Entry-point shim 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /tests/test_chunker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import sys 4 | from typing import List, cast 5 | 6 | try: 7 | from openai import OpenAI 8 | except ImportError: # pragma: no cover - optional in tests 9 | OpenAI = None # type: ignore[assignment] 10 | 11 | try: 12 | import sentence_transformers # noqa: F401 # pragma: no cover - optional import 13 | 14 | HAS_SENTENCE_TRANSFORMERS = True 15 | except ImportError: # pragma: no cover - optional import 16 | HAS_SENTENCE_TRANSFORMERS = False 17 | 18 | sys.path.append("..") 19 | import thepipe.chunker as chunker 20 | from thepipe.core import Chunk, calculate_tokens 21 | 22 | 23 | class test_chunker(unittest.TestCase): 24 | def setUp(self): 25 | self.files_directory = os.path.join(os.path.dirname(__file__), "files") 26 | self.example_markdown_path = os.path.join(self.files_directory, "example.md") 27 | self.max_tokens_per_chunk = ( 28 | 10 # Define an arbitrary max tokens per chunk for testing 29 | ) 30 | 31 | def read_markdown_file(self, file_path: str) -> List[Chunk]: 32 | with open(file_path, "r") as f: 33 | text = f.read() 34 | return [Chunk(path=file_path, text=text)] 35 | 36 | def test_chunk_by_keywords(self): 37 | text = "Intro line\nfoo starts here\nmiddle\nbar next\nend" 38 | chunk = Chunk(path="doc.md", text=text) 39 | 40 | result = chunker.chunk_by_keywords([chunk], keywords=["foo", "bar"]) 41 | # There are 3 segments: before foo, between foo and bar, after bar 42 | self.assertEqual(len(result), 3) 43 | 44 | # 1st chunk: only the intro 45 | t0 = cast(str, result[0].text) 46 | self.assertIn("Intro line", t0) 47 | self.assertNotIn("foo", t0.lower()) 48 | 49 | # 2nd chunk: starts with foo, includes 'middle' 50 | t1 = cast(str, result[1].text).lower() 51 | self.assertIn("foo starts here", t1) 52 | self.assertIn("middle", t1) 53 | 54 | # 3rd chunk: starts with bar, includes 'end' 55 | t2 = cast(str, result[2].text).lower() 56 | self.assertIn("bar next", t2) 57 | self.assertIn("end", t2) 58 | 59 | @unittest.skipIf(OpenAI is None or not os.getenv("OPENAI_API_KEY"), "OpenAI API key required") 60 | def test_chunk_agentic(self): 61 | openai_client = OpenAI() 62 | chunks = self.read_markdown_file(self.example_markdown_path) 63 | chunked_agentic = chunker.chunk_agentic(chunks, openai_client=openai_client) 64 | # Verify the output 65 | self.assertIsInstance(chunked_agentic, list) 66 | self.assertGreater(len(chunked_agentic), 0) 67 | # verify there are 3 chunks corresponding to the sections in the example markdown 68 | self.assertEqual(len(chunked_agentic), 3) 69 | # Verify the output contains chunks with text or images 70 | for chunk in chunked_agentic: 71 | self.assertIsInstance(chunk, Chunk) 72 | self.assertTrue(chunk.text or chunk.images) 73 | 74 | def test_chunk_by_length(self): 75 | chunks = self.read_markdown_file(self.example_markdown_path) 76 | chunked_length = chunker.chunk_by_length( 77 | chunks, max_tokens=self.max_tokens_per_chunk 78 | ) 79 | # Verify the output 80 | self.assertIsInstance(chunked_length, list) 81 | self.assertGreater(len(chunked_length), 0) 82 | for chunk in chunked_length: 83 | self.assertIsInstance(chunk, Chunk) 84 | # Verify that the chunk text or images are not none 85 | self.assertTrue(chunk.text or chunk.images) 86 | # 
assert length of text is less than max_tokens 87 | if chunk.text or chunk.images: 88 | self.assertLessEqual( 89 | calculate_tokens([chunk]), self.max_tokens_per_chunk 90 | ) 91 | 92 | @unittest.skipUnless(HAS_SENTENCE_TRANSFORMERS, "sentence-transformers extra is not installed") 93 | def test_chunk_semantic(self): 94 | test_sentence = "Computational astrophysics. Numerical astronomy. Bananas." 95 | chunks = [Chunk(text=test_sentence)] 96 | chunked_semantic = chunker.chunk_semantic( 97 | chunks, 98 | model="sentence-transformers/all-MiniLM-L6-v2", 99 | buffer_size=2, 100 | similarity_threshold=0.5, 101 | ) 102 | # verify the output 103 | self.assertIsInstance(chunked_semantic, list) 104 | self.assertGreater(len(chunked_semantic), 0) 105 | # verify it split into ['Computational astrophysics.\nNumerical astronomy.', 'Bananas.'] 106 | self.assertEqual(len(chunked_semantic), 2) 107 | self.assertEqual( 108 | chunked_semantic[0].text, 109 | "Computational astrophysics.\nNumerical astronomy.", 110 | ) 111 | self.assertEqual( 112 | chunked_semantic[1].text, 113 | "Bananas.", 114 | ) 115 | 116 | def test_chunk_by_page(self): 117 | chunks = self.read_markdown_file(self.example_markdown_path) 118 | chunked_pages = chunker.chunk_by_page(chunks) 119 | # Verify the output 120 | self.assertIsInstance(chunked_pages, list) 121 | self.assertGreater(len(chunked_pages), 0) 122 | for chunk in chunked_pages: 123 | self.assertIsInstance(chunk, Chunk) 124 | self.assertTrue(chunk.text or chunk.images) 125 | 126 | def test_chunk_by_section(self): 127 | chunks = self.read_markdown_file(self.example_markdown_path) 128 | chunked_sections = chunker.chunk_by_section(chunks) 129 | self.assertIsInstance(chunked_sections, list) 130 | self.assertEqual(len(chunked_sections), 3) 131 | # Verify the output contains chunks with text or images 132 | for chunk in chunked_sections: 133 | self.assertIsInstance(chunk, Chunk) 134 | self.assertTrue(chunk.text or chunk.images) 135 | 136 | def test_chunk_by_section_first_line_and_custom_separator(self): 137 | # Default separator, with first line as a header 138 | text1 = "## Alpha\nContent A\n## Beta\nContent B" 139 | chunk1 = Chunk(text=text1) 140 | out1 = chunker.chunk_by_section([chunk1]) 141 | self.assertEqual(len(out1), 2) 142 | 143 | # cast .text to str so Pylance stops complaining 144 | t0 = cast(str, out1[0].text) 145 | self.assertIn("Alpha", t0) 146 | 147 | t1 = cast(str, out1[1].text) 148 | self.assertIn("Beta", t1) 149 | 150 | # Custom separator "### " 151 | text2 = "### One\nX\n### Two\nY" 152 | chunk2 = Chunk(text=text2) 153 | out2 = chunker.chunk_by_section([chunk2], section_separator="### ") 154 | self.assertEqual(len(out2), 2) 155 | 156 | o0 = cast(str, out2[0].text) 157 | self.assertIn("One", o0) 158 | 159 | o1 = cast(str, out2[1].text) 160 | self.assertIn("Two", o1) 161 | 162 | def test_chunk_by_document(self): 163 | chunks = self.read_markdown_file(self.example_markdown_path) 164 | chunked_documents = chunker.chunk_by_document(chunks) 165 | self.assertIsInstance(chunked_documents, list) 166 | self.assertEqual(len(chunked_documents), 1) 167 | # Verify the output contains chunks with text or images 168 | chunk = chunked_documents[0] 169 | self.assertIsInstance(chunk, Chunk) 170 | self.assertTrue(chunk.text or chunk.images) 171 | 172 | 173 | if __name__ == "__main__": 174 | unittest.main() 175 | -------------------------------------------------------------------------------- /tests/test_core.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import base64 3 | import shutil 4 | from typing import List, cast 5 | import unittest 6 | import os 7 | import sys 8 | 9 | sys.path.append("..") 10 | import thepipe.core as core 11 | import thepipe.scraper as scraper 12 | from PIL import Image 13 | from io import BytesIO 14 | 15 | 16 | class test_core(unittest.TestCase): 17 | def setUp(self): 18 | self.files_directory = os.path.join(os.path.dirname(__file__), "files") 19 | self.outputs_directory = "outputs" 20 | 21 | def tearDown(self): 22 | # clean up outputs 23 | if os.path.exists(self.outputs_directory): 24 | for file in os.listdir(self.outputs_directory): 25 | os.remove(os.path.join(self.outputs_directory, file)) 26 | os.rmdir(self.outputs_directory) 27 | 28 | def test_make_image_url_data_and_max_resolution(self): 29 | img = Image.new("RGB", (100, 50), color="purple") 30 | # max_resolution scales down before encoding 31 | url = core.make_image_url(img, host_images=False, max_resolution=25) 32 | self.assertTrue(url.startswith("data:image/jpeg;base64,")) 33 | b64 = url.split(",", 1)[1] 34 | img2 = Image.open(BytesIO(base64.b64decode(b64))) 35 | self.assertTrue(max(img2.size) <= 25) 36 | 37 | def test_make_image_url_host_images(self): 38 | 39 | # clear any old images 40 | if os.path.isdir("images"): 41 | shutil.rmtree("images") 42 | 43 | # override host for predictability 44 | import thepipe.core as core 45 | 46 | core.HOST_URL = "http://test-host" 47 | 48 | img = Image.new("RGB", (10, 10), color="orange") 49 | url = core.make_image_url(img, host_images=True) 50 | 51 | # URL should point to our HOST_URL 52 | self.assertTrue(url.startswith("http://test-host/images/")) 53 | 54 | # extract the image_id from the URL 55 | image_id = url.rsplit("/", 1)[-1] 56 | 57 | # confirm that exact file exists on disk 58 | self.assertTrue(os.path.exists(os.path.join("images", image_id))) 59 | 60 | def test_calculate_image_and_mixed_tokens(self): 61 | small = Image.new("RGB", (256, 256)) 62 | self.assertEqual(core.calculate_image_tokens(small, detail="auto"), 85) 63 | large = Image.new("RGB", (2048, 2048)) 64 | high = core.calculate_image_tokens(large, detail="high") 65 | self.assertGreater(high, 85) 66 | 67 | # Mixed text+image chunk 68 | txt = core.Chunk(text="abcd") # 4 chars → 1 token 69 | img = core.Chunk(images=[small]) # 85 tokens 70 | total = core.calculate_tokens([txt, img]) 71 | self.assertEqual(total, 1 + 85) 72 | 73 | def test_chunk_to_message_variants(self): 74 | img = Image.new("RGB", (5, 5)) 75 | chunk = core.Chunk(path="f.md", text="![alt](foo.png)\nHello", images=[img]) 76 | 77 | # text_only=True → no image_url entries 78 | msg1 = chunk.to_message(text_only=True) 79 | self.assertEqual(len(msg1["content"]), 1) 80 | self.assertEqual(msg1["content"][0]["type"], "text") 81 | 82 | # host_images & include_paths 83 | core.HOST_URL = "http://host" 84 | msg2 = chunk.to_message(host_images=True, include_paths=True) 85 | # First content block should include the wrapper 86 | self.assertIn('', msg2["content"][0]["text"]) 87 | # There must be at least one image_url entry 88 | self.assertTrue(any(item["type"] == "image_url" for item in msg2["content"])) 89 | 90 | def test_json_roundtrip(self): 91 | img = Image.new("RGB", (2, 2)) 92 | chunk = core.Chunk(path="p", text="T", images=[img]) 93 | data = chunk.to_json() 94 | chunk2 = core.Chunk.from_json(data) 95 | 96 | self.assertEqual(chunk2.path, "p") 97 | self.assertEqual(chunk2.text, "T") 98 | 99 | 
images = cast(List[Image.Image], chunk2.images) 100 | self.assertIsInstance(images, list) 101 | self.assertEqual(len(images), 1) 102 | 103 | @unittest.skipUnless(core.has_llama_index(), "llama-index extra is not installed") 104 | def test_chunk_to_llamaindex(self): 105 | chunk = core.Chunk( 106 | path="example.md", 107 | text="This is a coloured image", 108 | images=[Image.new("RGB", (32, 32), color="red")], 109 | ) 110 | llama_index_document = chunk.to_llamaindex() 111 | self.assertEqual(type(llama_index_document), list) 112 | self.assertEqual(len(llama_index_document), 1) 113 | self.assertEqual(type(llama_index_document[0]), core.ImageDocument) 114 | 115 | def test_chunks_to_messages(self): 116 | chunks = scraper.scrape_file(filepath=self.files_directory + "/example.md") 117 | messages = core.chunks_to_messages(chunks) 118 | self.assertEqual(type(messages), list) 119 | for message in messages: 120 | self.assertEqual(type(message), dict) 121 | self.assertIn("role", message) 122 | self.assertIn("content", message) 123 | # test chunks_to_messages with path included 124 | messages = core.chunks_to_messages(chunks, include_paths=True) 125 | for message in messages: 126 | self.assertIn("example.md", message["content"][0]["text"]) 127 | 128 | def test_save_outputs_text_only_and_with_images(self): 129 | # Text-only 130 | c = core.Chunk(path="x.txt", text="XYZ") 131 | core.save_outputs([c], text_only=True, output_folder="outputs") 132 | self.assertTrue(os.path.exists("outputs/prompt.txt")) 133 | files = os.listdir("outputs") 134 | self.assertEqual(files, ["prompt.txt"]) 135 | shutil.rmtree("outputs") 136 | 137 | # With image 138 | img = Image.new("RGB", (10, 10)) 139 | c2 = core.Chunk(path="y", text="TXT", images=[img]) 140 | core.save_outputs([c2], text_only=False, output_folder="outputs") 141 | files = os.listdir("outputs") 142 | self.assertIn("prompt.txt", files) 143 | self.assertTrue(any(f.endswith(".jpg") for f in files)) 144 | 145 | def test_chunk_json(self): 146 | example_image_path = os.path.join(self.files_directory, "example.jpg") 147 | image = Image.open(example_image_path) 148 | chunk = core.Chunk(path="example.md", text="Hello, World!") 149 | # convert to json 150 | chunk_json = chunk.to_json() 151 | # verify it is a dictionary with the expected items 152 | self.assertEqual(type(chunk_json), dict) 153 | self.assertIn("text", chunk_json) 154 | self.assertIn("path", chunk_json) 155 | # convert back 156 | chunk = core.Chunk.from_json(chunk_json) 157 | # verify it is the correct Chunk object 158 | self.assertEqual(type(chunk), core.Chunk) 159 | self.assertEqual(chunk.path, "example.md") 160 | self.assertEqual(chunk.text, "Hello, World!") 161 | 162 | def test_calculate_tokens(self): 163 | text = "Hello, World!" 
164 | tokens = core.calculate_tokens([core.Chunk(text=text)]) 165 | self.assertAlmostEqual(tokens, 3.25, places=0) 166 | 167 | def test_calculate_image_tokens(self): 168 | image = Image.open(os.path.join(self.files_directory, "example.jpg")) 169 | image.load() # needed to close the file 170 | tokens = core.calculate_image_tokens(image, detail="auto") 171 | self.assertAlmostEqual(tokens, 85, places=0) 172 | tokens = core.calculate_image_tokens(image, detail="low") 173 | self.assertAlmostEqual(tokens, 85, places=0) 174 | tokens = core.calculate_image_tokens(image, detail="high") 175 | self.assertAlmostEqual(tokens, 595, places=0) 176 | 177 | def test_make_image_url(self): 178 | image = Image.open(os.path.join(self.files_directory, "example.jpg")) 179 | image.load() # needed to close the file 180 | url = core.make_image_url(image, host_images=False) 181 | # verify it is in the correct format 182 | self.assertTrue(url.startswith("data:image/jpeg;base64,")) 183 | # verify it decodes correctly 184 | remove_prefix = url.replace("data:image/jpeg;base64,", "") 185 | image_data = base64.b64decode(remove_prefix) 186 | image = Image.open(BytesIO(image_data)) 187 | self.assertEqual(image.format, "JPEG") 188 | # verify it hosts the image correctly 189 | url = core.make_image_url(image, host_images=True) 190 | self.assertTrue(url.startswith(core.HOST_URL)) 191 | -------------------------------------------------------------------------------- /thepipe/extract.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | import json 3 | import re 4 | from typing import Iterable, List, Dict, Union, Optional, Tuple, Callable, cast 5 | from .core import ( 6 | Chunk, 7 | calculate_tokens, 8 | DEFAULT_AI_MODEL, 9 | ) 10 | from .scraper import scrape_url, scrape_file 11 | from .chunker import ( 12 | chunk_by_page, 13 | chunk_by_document, 14 | chunk_by_section, 15 | chunk_semantic, 16 | chunk_by_keywords, 17 | chunk_by_length, 18 | chunk_agentic, 19 | ) 20 | import requests 21 | import os 22 | from openai import OpenAI 23 | from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam 24 | 25 | DEFAULT_EXTRACTION_PROMPT = "Extract all the information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but always fill in every column as best you can. Always immediately return valid JSON. You must extract ALL the information available in the entire document." 
26 | 27 | 28 | def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]: 29 | def clean_response_text(llm_response: str) -> str: 30 | return llm_response.encode("utf-8", "ignore").decode("utf-8").strip() 31 | 32 | code_block_pattern = r"^```(?:json)?\s*([\s\S]*?)\s*```$" 33 | match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL) 34 | if match: 35 | llm_response = match.group(1) 36 | llm_response = clean_response_text(llm_response) 37 | 38 | try: 39 | parsed_json = json.loads(llm_response) 40 | return parsed_json 41 | except json.JSONDecodeError: 42 | json_pattern = r"($$[\s\S]*$$|\{[\s\S]*\})" 43 | match = re.search(json_pattern, llm_response) 44 | if match: 45 | try: 46 | parsed_json = json.loads(match.group(1)) 47 | return parsed_json 48 | except json.JSONDecodeError: 49 | pass 50 | 51 | objects = re.findall(r"\{[^{}]*\}", llm_response) 52 | if objects: 53 | valid_objects = [] 54 | for obj in objects: 55 | try: 56 | obj = obj.replace("\\", "").replace('\\"', '"') 57 | valid_objects.append(json.loads(obj)) 58 | except json.JSONDecodeError: 59 | continue 60 | if valid_objects: 61 | return valid_objects if len(valid_objects) > 1 else valid_objects[0] 62 | print(f"[thepipe] Failed to extract valid JSON from LLM response: {llm_response}") 63 | return None 64 | 65 | 66 | def extract_from_chunk( 67 | chunk: Chunk, 68 | chunk_index: int, 69 | schema: str, 70 | ai_model: str, 71 | source: str, 72 | multiple_extractions: bool, 73 | extraction_prompt: str, 74 | host_images: bool, 75 | openai_client: OpenAI, 76 | ) -> Tuple[Dict, int]: 77 | response_dict = {"chunk_index": chunk_index, "source": source} 78 | tokens_used = 0 79 | try: 80 | corrected_extraction_prompt = extraction_prompt.replace("{schema}", schema) 81 | if multiple_extractions: 82 | corrected_extraction_prompt += """\nIf there are multiple extractions, return each JSON dictionary in a list under the key "extraction". The list should contain each extraction dict (according to the schema) and the entire list should be set to the "extraction" key. 
Immediately return this extraction JSON object with the "extraction" key mapping to a list containing all the extracted data.""" 83 | else: 84 | corrected_extraction_prompt += ( 85 | """\nImmediately return the JSON dictionary.""" 86 | ) 87 | 88 | messages = [ 89 | chunk.to_message(host_images=host_images), 90 | { 91 | "role": "user", 92 | "content": corrected_extraction_prompt, 93 | }, 94 | ] 95 | 96 | response = openai_client.chat.completions.create( 97 | model=ai_model, 98 | messages=cast(Iterable[ChatCompletionMessageParam], messages), 99 | response_format={"type": "json_object"}, 100 | ) 101 | llm_response = response.choices[0].message.content 102 | if not llm_response: 103 | raise Exception( 104 | f"Failed to receive a message content from LLM Response: {response}" 105 | ) 106 | input_tokens = calculate_tokens([chunk]) 107 | output_tokens = calculate_tokens([Chunk(text=llm_response)]) 108 | tokens_used += input_tokens + output_tokens 109 | try: 110 | llm_response_dict = extract_json_from_response(llm_response) 111 | if llm_response_dict: 112 | if multiple_extractions: 113 | if ( 114 | isinstance(llm_response_dict, dict) 115 | and "extraction" in llm_response_dict 116 | ): 117 | response_dict["extraction"] = llm_response_dict["extraction"] 118 | elif isinstance(llm_response_dict, list): 119 | response_dict["extraction"] = llm_response_dict 120 | else: 121 | response_dict["extraction"] = [llm_response_dict] 122 | else: 123 | if isinstance(llm_response_dict, dict): 124 | response_dict.update(llm_response_dict) 125 | elif isinstance(llm_response_dict, list): 126 | response_dict["error"] = ( 127 | f"Expected a single JSON object but received a list: {llm_response_dict}. Try enabling multiple extractions." 128 | ) 129 | else: 130 | response_dict["error"] = ( 131 | f"Invalid JSON structure in LLM response: {llm_response_dict}" 132 | ) 133 | else: 134 | response_dict["error"] = ( 135 | f"Failed to extract valid JSON from LLM response: {llm_response}" 136 | ) 137 | except Exception as e: 138 | response_dict["error"] = f"Error processing LLM response: {e}" 139 | if not multiple_extractions: 140 | schema_keys = ( 141 | json.loads(schema).keys() if isinstance(schema, str) else schema.keys() 142 | ) 143 | for key in schema_keys: 144 | if key not in response_dict: 145 | response_dict[key] = None 146 | except Exception as e: 147 | response_dict = {"chunk_index": chunk_index, "source": source, "error": str(e)} 148 | return response_dict, tokens_used 149 | 150 | 151 | def extract( 152 | chunks: List[Chunk], 153 | schema: Union[str, Dict], 154 | model: str = DEFAULT_AI_MODEL, 155 | multiple_extractions: bool = False, 156 | extraction_prompt: str = DEFAULT_EXTRACTION_PROMPT, 157 | host_images: bool = False, 158 | openai_client: Optional[OpenAI] = None, 159 | ) -> Tuple[List[Dict], int]: 160 | print( 161 | f"[thepipe] Extract functions will be deprecated in future versions. See the README for more information" 162 | ) 163 | if isinstance(schema, dict): 164 | schema = json.dumps(schema) 165 | 166 | if openai_client is None: 167 | raise ValueError( 168 | "OpenAI client is required for structured extraction. Please provide a valid OpenAI client." 
169 | ) 170 | 171 | results = [] 172 | total_tokens_used = 0 173 | 174 | n_threads = (os.cpu_count() or 1) * 2 175 | with ThreadPoolExecutor(max_workers=n_threads) as executor: 176 | future_to_chunk = { 177 | executor.submit( 178 | extract_from_chunk, 179 | chunk=chunk, 180 | chunk_index=i, 181 | schema=schema, 182 | ai_model=model, 183 | source=chunk.path if chunk.path else "", 184 | multiple_extractions=multiple_extractions, 185 | extraction_prompt=extraction_prompt, 186 | host_images=host_images, 187 | openai_client=openai_client, 188 | ): i 189 | for i, chunk in enumerate(chunks) 190 | } 191 | 192 | for future in as_completed(future_to_chunk): 193 | try: 194 | result, tokens_used = future.result() 195 | results.append(result) 196 | total_tokens_used += tokens_used 197 | except Exception as e: 198 | chunk_index = future_to_chunk[future] 199 | results.append( 200 | { 201 | "chunk_index": chunk_index, 202 | "source": chunks[chunk_index].path, 203 | "error": str(e), 204 | } 205 | ) 206 | 207 | results.sort(key=lambda x: x["chunk_index"]) 208 | return results, total_tokens_used 209 | 210 | 211 | def extract_from_url( 212 | url: str, 213 | schema: Union[str, Dict], 214 | model: str = DEFAULT_AI_MODEL, 215 | multiple_extractions: bool = False, 216 | extraction_prompt: str = DEFAULT_EXTRACTION_PROMPT, 217 | host_images: bool = False, 218 | verbose: bool = False, 219 | chunking_method: Callable[[List[Chunk]], List[Chunk]] = chunk_by_page, 220 | openai_client: Optional[OpenAI] = None, 221 | ) -> Tuple[List[Dict], int]: 222 | print( 223 | f"[thepipe] Extract functions will be deprecated in future versions. See the README for more information" 224 | ) 225 | chunks = scrape_url( 226 | url, 227 | verbose=verbose, 228 | chunking_method=chunking_method, 229 | openai_client=openai_client, 230 | ) 231 | extracted_chunks, tokens_used = extract( 232 | chunks=chunks, 233 | schema=schema, 234 | model=model, 235 | multiple_extractions=multiple_extractions, 236 | extraction_prompt=extraction_prompt, 237 | host_images=host_images, 238 | openai_client=openai_client, 239 | ) 240 | return extracted_chunks, tokens_used 241 | 242 | 243 | def extract_from_file( 244 | file_path: str, 245 | schema: Union[str, Dict], 246 | model: str = DEFAULT_AI_MODEL, 247 | multiple_extractions: bool = False, 248 | extraction_prompt: str = DEFAULT_EXTRACTION_PROMPT, 249 | host_images: bool = False, 250 | verbose: bool = False, 251 | chunking_method: Callable[[List[Chunk]], List[Chunk]] = chunk_by_page, 252 | openai_client: Optional[OpenAI] = None, 253 | ) -> Tuple[List[Dict], int]: 254 | print( 255 | f"[thepipe] Extract functions will be deprecated in future versions. 
See the README for more information" 256 | ) 257 | chunks = scrape_file( 258 | file_path, 259 | verbose=verbose, 260 | chunking_method=chunking_method, 261 | openai_client=openai_client, 262 | ) 263 | extracted_chunks, tokens_used = extract( 264 | chunks=chunks, 265 | schema=schema, 266 | model=model, 267 | multiple_extractions=multiple_extractions, 268 | extraction_prompt=extraction_prompt, 269 | host_images=host_images, 270 | openai_client=openai_client, 271 | ) 272 | return extracted_chunks, tokens_used 273 | -------------------------------------------------------------------------------- /thepipe/chunker.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List, Optional, Tuple, Union 3 | from .core import ( 4 | Chunk, 5 | calculate_tokens, 6 | DEFAULT_AI_MODEL, 7 | DEFAULT_EMBEDDING_MODEL, 8 | ) 9 | import numpy as np 10 | from pydantic import BaseModel 11 | from openai import OpenAI 12 | 13 | 14 | class Section(BaseModel): 15 | title: str 16 | start_line: int 17 | end_line: int 18 | 19 | 20 | class SectionList(BaseModel): 21 | sections: List[Section] 22 | 23 | 24 | def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]: 25 | chunks_by_doc = {} 26 | new_chunks = [] 27 | for chunk in chunks: 28 | if not chunk.path: 29 | raise ValueError( 30 | "Document chunking requires the path attribute to determine the document boundaries" 31 | ) 32 | if chunk.path not in chunks_by_doc: 33 | chunks_by_doc[chunk.path] = [] 34 | chunks_by_doc[chunk.path].append(chunk) 35 | for doc_chunks in chunks_by_doc.values(): 36 | doc_texts = [] 37 | doc_images = [] 38 | for chunk in doc_chunks: 39 | doc_texts.append(chunk.text) 40 | doc_images.extend(chunk.images) 41 | text = "\n".join(doc_texts) if doc_texts else None 42 | new_chunks.append(Chunk(path=doc_chunks[0].path, text=text, images=doc_images)) 43 | return new_chunks 44 | 45 | 46 | def chunk_by_page(chunks: List[Chunk]) -> List[Chunk]: 47 | # by-page chunking is default behavior 48 | return chunks 49 | 50 | 51 | def chunk_by_section( 52 | chunks: List[Chunk], section_separator: str = "## " 53 | ) -> List[Chunk]: 54 | section_chunks: List[Chunk] = [] 55 | cur_text: Optional[str] = None 56 | cur_images: List = [] 57 | cur_path: Optional[str] = None 58 | 59 | for chunk in chunks: 60 | # Extract text (always a string or None) 61 | chunk_text = chunk.text or "" 62 | # Append images to current section once started 63 | if cur_text is not None and getattr(chunk, "images", None): 64 | if chunk.images: 65 | cur_images.extend(chunk.images) 66 | 67 | for line in chunk_text.split("\n"): 68 | if line.startswith(section_separator): 69 | # New section header found 70 | if cur_text is not None: 71 | # Flush previous section 72 | section_chunks.append( 73 | Chunk( 74 | path=cur_path, 75 | text=cur_text.rstrip("\n"), 76 | images=cur_images.copy(), 77 | ) 78 | ) 79 | # Start new section 80 | cur_text = line + "\n" 81 | cur_images = [] 82 | cur_path = chunk.path 83 | else: 84 | if cur_text is not None: 85 | cur_text += line + "\n" 86 | else: 87 | # Text before any section header: start first section 88 | if line.strip(): 89 | cur_text = line + "\n" 90 | cur_path = chunk.path 91 | cur_images = [] 92 | 93 | # Flush last section if present 94 | if cur_text is not None: 95 | section_chunks.append( 96 | Chunk(path=cur_path, text=cur_text.rstrip("\n"), images=cur_images.copy()) 97 | ) 98 | 99 | return section_chunks 100 | 101 | 102 | def chunk_semantic( 103 | chunks: List[Chunk], 104 | model: str = 
DEFAULT_EMBEDDING_MODEL, 105 | buffer_size: int = 3, 106 | similarity_threshold: float = 0.1, 107 | ) -> List[Chunk]: 108 | try: 109 | from sentence_transformers import SentenceTransformer 110 | except ImportError as exc: # pragma: no cover - exercised via runtime usage 111 | raise ImportError( 112 | "`chunk_semantic` requires the optional dependency `sentence-transformers`. " 113 | "Install it with `pip install thepipe-api[semantic]` or include the `gpu` extra." 114 | ) from exc 115 | 116 | embedding_model = SentenceTransformer(model_name_or_path=model) 117 | # Flatten the chunks into sentences 118 | sentences = [] 119 | sentence_chunk_map = [] 120 | sentence_path_map = [] 121 | for chunk in chunks: 122 | chunk_text = chunk.text 123 | if chunk_text: 124 | lines = re.split(r"(?<=[.?!])\s+", chunk_text) 125 | for line in lines: 126 | sentences.append(line) 127 | sentence_chunk_map.append(chunk) 128 | sentence_path_map.append(chunk.path) 129 | 130 | # Compute embeddings 131 | embeddings = np.array(embedding_model.encode(sentences, convert_to_numpy=True)) 132 | 133 | # Create groups based on sentence similarity 134 | grouped_sentences = [] 135 | current_group = [] 136 | for i, embedding in enumerate(embeddings): 137 | if not current_group: 138 | current_group.append(i) 139 | continue 140 | # Check similarity with the last sentence in the current group 141 | # If the similarity is above the threshold, add the sentence to the group 142 | # Otherwise, start a new group 143 | a = embedding 144 | b = embeddings[current_group[-1]] 145 | denom = float(np.linalg.norm(a) * np.linalg.norm(b)) 146 | similarity = float(np.dot(a, b) / denom) if denom else 0.0 147 | if similarity >= similarity_threshold: 148 | current_group.append(i) 149 | else: 150 | grouped_sentences.append(current_group) 151 | current_group = [i] 152 | 153 | if current_group: 154 | grouped_sentences.append(current_group) 155 | 156 | # Create new chunks based on grouped sentences 157 | new_chunks = [] 158 | for group in grouped_sentences: 159 | group_text = "\n".join(sentences[i] for i in group) 160 | group_images = [] 161 | group_path = sentence_path_map[group[0]] 162 | seen_images = [] 163 | for i in group: 164 | for image in sentence_chunk_map[i].images: 165 | if image not in seen_images: 166 | group_images.append(image) 167 | seen_images.append(image) 168 | new_chunks.append(Chunk(path=group_path, text=group_text, images=group_images)) 169 | 170 | return new_chunks 171 | 172 | 173 | # starts a new chunk any time a word is found 174 | def chunk_by_keywords( 175 | chunks: List[Chunk], keywords: List[str] = ["section"] 176 | ) -> List[Chunk]: 177 | new_chunks = [] 178 | current_chunk_text = "" 179 | current_chunk_images = [] 180 | current_chunk_path = chunks[0].path 181 | for chunk in chunks: 182 | if chunk.images: 183 | current_chunk_images.extend(chunk.images) 184 | lines = chunk.text.split("\n") if chunk.text else [] 185 | for line in lines: 186 | if any(keyword.lower() in line.lower() for keyword in keywords): 187 | if current_chunk_text: 188 | new_chunks.append( 189 | Chunk( 190 | path=chunk.path, 191 | text=current_chunk_text, 192 | images=current_chunk_images, 193 | ) 194 | ) 195 | current_chunk_text = "" 196 | current_chunk_images = [] 197 | current_chunk_path = chunk.path 198 | current_chunk_text += line + "\n" 199 | if current_chunk_text: 200 | new_chunks.append( 201 | Chunk( 202 | path=current_chunk_path, 203 | text=current_chunk_text, 204 | images=current_chunk_images, 205 | ) 206 | ) 207 | return new_chunks 208 | 209 | 210 
| def chunk_by_length(chunks: List[Chunk], max_tokens: int = 10000) -> List[Chunk]: 211 | new_chunks = [] 212 | for chunk in chunks: 213 | total_tokens = calculate_tokens([chunk]) 214 | if total_tokens < max_tokens: 215 | new_chunks.append(chunk) 216 | continue 217 | text_halfway_index = len(chunk.text) // 2 if chunk.text else 0 218 | images_halfway_index = len(chunk.images) // 2 if chunk.images else 0 219 | if text_halfway_index == 0 and images_halfway_index == 0: 220 | if chunk.images: 221 | # can't be split further: try to reduce the size of the images 222 | # by resizing each image to half its size 223 | new_images = [] 224 | for image in chunk.images: 225 | new_width = image.width // 2 226 | new_height = image.height // 2 227 | resized_image = image.resize((new_width, new_height)) 228 | new_images.append(resized_image) 229 | else: 230 | # throw error to prevent downstream errors with LLM inference 231 | raise ValueError( 232 | "Chunk cannot be split further. Please increase the max_tokens limit." 233 | ) 234 | 235 | return new_chunks 236 | split_chunks = [ 237 | Chunk( 238 | path=chunk.path, 239 | text=chunk.text[:text_halfway_index] if chunk.text else None, 240 | images=chunk.images[:images_halfway_index] if chunk.images else None, 241 | ), 242 | Chunk( 243 | path=chunk.path, 244 | text=chunk.text[text_halfway_index:] if chunk.text else None, 245 | images=chunk.images[images_halfway_index:] if chunk.images else None, 246 | ), 247 | ] 248 | # recursive call 249 | new_chunks = chunk_by_length(split_chunks, max_tokens) 250 | 251 | return new_chunks 252 | 253 | 254 | # LLM-based agentic semantic chunking (experimental, openai only) 255 | def chunk_agentic( 256 | chunks: List[Chunk], 257 | openai_client: OpenAI, 258 | model: str = DEFAULT_AI_MODEL, 259 | max_tokens: int = 50000, 260 | ) -> List[Chunk]: 261 | # 1) Enforce a hard token limit 262 | chunks = chunk_by_length(chunks, max_tokens=max_tokens) 263 | 264 | # 2) Group by document 265 | docs: Dict[str, List[Chunk]] = {} 266 | for c in chunks: 267 | docs.setdefault(c.path or "__no_path__", []).append(c) 268 | 269 | final_chunks: List[Chunk] = [] 270 | 271 | for path, doc_chunks in docs.items(): 272 | # Flatten into numbered lines 273 | lines: List[str] = [] 274 | line_to_chunk: List[Chunk] = [] 275 | for chunk in doc_chunks: 276 | texts = ( 277 | chunk.text 278 | if isinstance(chunk.text, list) 279 | else ([chunk.text] if chunk.text else []) 280 | ) 281 | for text in texts: 282 | for line in text.split("\n"): 283 | lines.append(line) 284 | line_to_chunk.append(chunk) 285 | if not lines: 286 | continue 287 | 288 | numbered = "\n".join(f"{i+1}: {lines[i]}" for i in range(len(lines))) 289 | 290 | # 3) Ask the LLM for structured JSON 291 | system_prompt = ( 292 | "Divide the following numbered document into semantically cohesive sections. " 293 | "Return only a single JSON object matching the Pydantic schema `SectionList`, " 294 | "e.g.:\n" 295 | "{\n" 296 | ' "sections": [\n' 297 | ' {"title": "Introduction", "start_line": 1, "end_line": 5},\n' 298 | " ...\n" 299 | " ]\n" 300 | "}\n" 301 | "Ensure `start_line` and `end_line` are integers, cover every line in order, " 302 | "and do not overlap or leave gaps." 
303 | ) 304 | user_prompt = numbered 305 | 306 | completion = openai_client.beta.chat.completions.parse( 307 | model=model, 308 | messages=[ 309 | {"role": "system", "content": system_prompt}, 310 | {"role": "user", "content": user_prompt}, 311 | ], 312 | response_format=SectionList, 313 | ) 314 | 315 | if not completion.choices[0].message.parsed: 316 | raise ValueError( 317 | "LLM did not return a valid response during agentic chunking." 318 | ) 319 | 320 | sections: List[Section] = completion.choices[0].message.parsed.sections 321 | 322 | # build chunks from those sections 323 | for sec in sections: 324 | start, end, title = sec.start_line, sec.end_line, sec.title 325 | # clamp 326 | start = max(1, min(start, len(lines))) 327 | end = max(start, min(end, len(lines))) 328 | 329 | sec_lines = lines[start - 1 : end] 330 | seen_imgs = set() 331 | sec_images = [] 332 | for idx in range(start - 1, end): 333 | for img in getattr(line_to_chunk[idx], "images", []): 334 | if img not in seen_imgs: 335 | seen_imgs.add(img) 336 | sec_images.append(img) 337 | 338 | # prepend header 339 | text_block = "\n".join(sec_lines) 340 | new_chunk = Chunk( 341 | path=path if path != "__no_path__" else None, 342 | text=text_block, 343 | images=sec_images, 344 | ) 345 | 346 | # break further by length if needed 347 | final_chunks.extend(chunk_by_length([new_chunk], max_tokens=max_tokens)) 348 | 349 | return final_chunks 350 | -------------------------------------------------------------------------------- /thepipe/core.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import base64 3 | from io import BytesIO 4 | import json 5 | import os 6 | import re 7 | import time 8 | from typing import Dict, Iterable, List, Optional, Tuple, Union 9 | import requests 10 | from PIL import Image 11 | 12 | try: # Optional LlamaIndex dependency 13 | from llama_index.core.schema import Document as _LlamaDocument 14 | from llama_index.core.schema import ImageDocument as _LlamaImageDocument 15 | except ImportError: # pragma: no cover - handled dynamically in helpers below 16 | _LlamaDocument = None # type: ignore[assignment] 17 | _LlamaImageDocument = None # type: ignore[assignment] 18 | 19 | # Re-export for backwards compatibility (may be ``None`` when not installed) 20 | Document = _LlamaDocument # type: ignore[assignment] 21 | ImageDocument = _LlamaImageDocument # type: ignore[assignment] 22 | 23 | # LLM provider info, defaults to openai 24 | DEFAULT_AI_MODEL = os.getenv("DEFAULT_AI_MODEL", "gpt-4o") 25 | DEFAULT_EMBEDDING_MODEL = os.getenv( 26 | "DEFAULT_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2" 27 | ) 28 | 29 | # for persistent images via filehosting 30 | HOST_IMAGES = os.getenv("HOST_IMAGES", "false").lower() == "true" 31 | HOST_URL = os.getenv("HOST_URL", "https://thepipe-api.up.railway.app") 32 | 33 | 34 | def prepare_image(image: Image.Image) -> Image.Image: 35 | """Return an in-memory copy of ``image`` with its underlying resources closed.""" 36 | 37 | try: 38 | image.load() 39 | except Exception: 40 | pass 41 | 42 | try: 43 | prepared_image = image.copy() 44 | except Exception: 45 | return image 46 | 47 | try: 48 | image.close() 49 | except Exception: 50 | pass 51 | 52 | return prepared_image 53 | 54 | 55 | def _ensure_llama_index() -> Tuple["Document", "ImageDocument"]: 56 | """Import LlamaIndex lazily and provide a helpful error message if missing.""" 57 | 58 | global _LlamaDocument, _LlamaImageDocument 59 | 60 | if _LlamaDocument is not None and 
_LlamaImageDocument is not None: 61 | return _LlamaDocument, _LlamaImageDocument # type: ignore[return-value] 62 | 63 | try: 64 | from llama_index.core.schema import Document as doc_cls 65 | from llama_index.core.schema import ImageDocument as image_doc_cls 66 | except ImportError as exc: # pragma: no cover - exercised via has_llama_index 67 | raise ImportError( 68 | "LlamaIndex support is optional. Install it with " 69 | "`pip install thepipe-api[llama-index]` to use `Chunk.to_llamaindex`." 70 | ) from exc 71 | 72 | _LlamaDocument, _LlamaImageDocument = doc_cls, image_doc_cls 73 | return doc_cls, image_doc_cls # type: ignore[return-value] 74 | 75 | 76 | def has_llama_index() -> bool: 77 | """Return ``True`` when the optional LlamaIndex dependency is available.""" 78 | 79 | try: 80 | _ensure_llama_index() 81 | except ImportError: 82 | return False 83 | return True 84 | 85 | 86 | class Chunk: 87 | def __init__( 88 | self, 89 | path: Optional[str] = None, 90 | text: Optional[str] = None, 91 | images: Optional[Iterable[Image.Image]] = None, 92 | audios: Optional[Iterable] = None, 93 | videos: Optional[Iterable] = None, 94 | ): 95 | self.path = path 96 | self.text = text or "" 97 | self.images = [prepare_image(image) for image in images] if images else [] 98 | self.audios = list(audios) if audios else [] 99 | self.videos = list(videos) if videos else [] 100 | 101 | def __repr__(self) -> str: 102 | parts = [] 103 | if self.path is not None: 104 | parts.append(f"path={self.path!r}") 105 | if self.text: 106 | # Show a concise preview of the text 107 | snippet = self.text.replace("\n", " ") 108 | if len(snippet) > 50: 109 | snippet = snippet[:47] + "..." 110 | parts.append(f"text_snippet={snippet!r}") 111 | if self.images: 112 | parts.append(f"images_count={len(self.images)}") 113 | if self.audios: 114 | parts.append(f"audios_count={len(self.audios)}") 115 | if self.videos: 116 | parts.append(f"videos_count={len(self.videos)}") 117 | content = ", ".join(parts) or "empty" 118 | return f"Chunk({content})" 119 | 120 | def __str__(self) -> str: 121 | return self.__repr__() 122 | 123 | def to_llamaindex(self) -> Union[List["Document"], List["ImageDocument"]]: 124 | DocumentCls, ImageDocumentCls = _ensure_llama_index() 125 | document_text = self.text if self.text else "" 126 | metadata = {"filepath": self.path} if self.path else {} 127 | 128 | # If we have PIL Image objects in self.images, convert them to base64 strings 129 | if self.images: 130 | image_docs: List[ImageDocument] = [] 131 | for img in self.images: 132 | # Encode the image to JPEG (or use its original format if available) 133 | buffer = BytesIO() 134 | fmt = img.format or "JPEG" 135 | img = img.convert("RGB") # ensure RGB 136 | img.save(buffer, format=fmt) 137 | img_bytes = buffer.getvalue() 138 | 139 | # Base64‑encode and build MIME type 140 | img_b64 = base64.b64encode(img_bytes).decode("utf-8") 141 | 142 | image_docs.append( 143 | ImageDocumentCls( 144 | text=document_text, 145 | image=img_b64, 146 | extra_info=metadata, 147 | ) 148 | ) 149 | return image_docs 150 | 151 | # Fallback to plain text Document 152 | return [DocumentCls(text=document_text, extra_info=metadata)] 153 | 154 | def to_message( 155 | self, 156 | text_only: bool = False, 157 | host_images: bool = False, 158 | max_resolution: Optional[int] = None, 159 | include_paths: Optional[bool] = False, 160 | ) -> Dict: 161 | message_text = "" 162 | message = {"role": "user", "content": []} 163 | image_urls = ( 164 | [ 165 | make_image_url(image, host_images, max_resolution) 
166 | for image in self.images 167 | ] 168 | if self.images and not text_only 169 | else [] 170 | ) 171 | img_index = 0 172 | text = self.text if self.text else "" 173 | if host_images: 174 | 175 | def replace_image(match): 176 | nonlocal img_index 177 | if img_index < len(image_urls): 178 | url = image_urls[img_index] 179 | img_index += 1 180 | return f"![image]({url})" 181 | return match.group( 182 | 0 183 | ) # If we run out of images, leave the original text 184 | 185 | # Replace markdown image references with hosted URLs 186 | text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", replace_image, text) 187 | message_text += text + "\n\n" 188 | # clean up, add to message 189 | message_text = re.sub(r"\n{3,}", "\n\n", message_text).strip() 190 | # Wrap the text in a path html block if it exists 191 | if include_paths and self.path: 192 | message_text = f'\n{message_text}\n' 193 | message["content"].append({"type": "text", "text": message_text}) 194 | 195 | # Add remaining images that weren't referenced in the text 196 | for image_url in image_urls: 197 | message["content"].append({"type": "image_url", "image_url": image_url}) 198 | 199 | return message 200 | 201 | def to_json(self, host_images: bool = False, text_only: bool = False) -> Dict: 202 | data = { 203 | "path": self.path, 204 | "text": self.text.strip() if self.text else "", 205 | "images": ( 206 | [ 207 | make_image_url(image=image, host_images=host_images) 208 | for image in self.images 209 | if not text_only 210 | ] 211 | if self.images 212 | else [] 213 | ), 214 | "audios": self.audios, 215 | "videos": self.videos, 216 | } 217 | return data 218 | 219 | @staticmethod 220 | def from_json(data: Dict, host_images: bool = False) -> "Chunk": 221 | images = [] 222 | if "images" in data: 223 | for image_str in data["images"]: 224 | if host_images: 225 | image_data = requests.get(image_str).content 226 | image = Image.open(BytesIO(image_data)) 227 | images.append(image) 228 | else: 229 | remove_prefix = image_str.replace("data:image/jpeg;base64,", "") 230 | image_data = base64.b64decode(remove_prefix) 231 | image = Image.open(BytesIO(image_data)) 232 | images.append(image) 233 | text = data["text"].strip() if "text" in data else None 234 | return Chunk( 235 | path=data["path"], 236 | text=text, 237 | images=images, 238 | # audios=data['audios'], 239 | # videos=data['videos'], 240 | ) 241 | 242 | 243 | def make_image_url( 244 | image: Image.Image, host_images: bool = False, max_resolution: Optional[int] = None 245 | ) -> str: 246 | if max_resolution: 247 | width, height = image.size 248 | if width > max_resolution or height > max_resolution: 249 | scale = max_resolution / max(width, height) 250 | new_width = int(width * scale) 251 | new_height = int(height * scale) 252 | image = image.resize((new_width, new_height)) 253 | if host_images: 254 | if not os.path.exists("images"): 255 | os.makedirs("images") 256 | image_id = f"{time.time_ns()}.jpg" 257 | image_path = os.path.join("images", image_id) 258 | if image.mode in ("P", "RGBA"): 259 | image = image.convert("RGB") 260 | image.save(image_path) 261 | return f"{HOST_URL}/images/{image_id}" 262 | else: 263 | buffered = BytesIO() 264 | if image.mode != "RGB": 265 | image = image.convert("RGB") 266 | image.save(buffered, format="JPEG") 267 | img_str = base64.b64encode(buffered.getvalue()).decode() 268 | return f"data:image/jpeg;base64,{img_str}" 269 | 270 | 271 | def calculate_image_tokens(image: Image.Image, detail: str = "auto") -> int: 272 | width, height = image.size 273 | if detail == "low": 274 
| return 85 275 | elif detail == "high": 276 | width, height = min(width, 2048), min(height, 2048) 277 | short_side = min(width, height) 278 | scale = 768 / short_side 279 | scaled_width = int(width * scale) 280 | scaled_height = int(height * scale) 281 | tiles = (scaled_width // 512) * (scaled_height // 512) 282 | return 170 * tiles + 85 283 | else: 284 | if width <= 512 and height <= 512: 285 | return 85 286 | else: 287 | return calculate_image_tokens(image, detail="high") 288 | 289 | 290 | def calculate_tokens(chunks: List[Chunk], text_only: bool = False) -> int: 291 | n_tokens = 0 292 | for chunk in chunks: 293 | if chunk.text: 294 | n_tokens += len(chunk.text) / 4 295 | if chunk.images and not text_only: 296 | for image in chunk.images: 297 | n_tokens += calculate_image_tokens(image) 298 | return int(n_tokens) 299 | 300 | 301 | def chunks_to_messages( 302 | chunks: List[Chunk], 303 | text_only: bool = False, 304 | host_images: bool = False, 305 | max_resolution: Optional[int] = None, 306 | include_paths: Optional[bool] = False, 307 | ) -> List[Dict]: 308 | return [ 309 | chunk.to_message( 310 | text_only=text_only, 311 | host_images=host_images, 312 | max_resolution=max_resolution, 313 | include_paths=include_paths, 314 | ) 315 | for chunk in chunks 316 | ] 317 | 318 | 319 | def save_outputs( 320 | chunks: List[Chunk], 321 | output_folder: str, 322 | verbose: bool = False, 323 | text_only: bool = False, 324 | ) -> None: 325 | if not os.path.exists(output_folder): 326 | os.makedirs(output_folder) 327 | text = "" 328 | # Save the text and images to the outputs directory 329 | for i, chunk in enumerate(chunks): 330 | if chunk is None: 331 | continue 332 | if chunk.path is not None: 333 | text += f"{chunk.path}:\n" 334 | if chunk.text: 335 | text += f"```\n{chunk.text}\n```\n" 336 | if not text_only and chunk.images: 337 | for j, image in enumerate(chunk.images): 338 | image.convert("RGB").save(f"{output_folder}/{i}_{j}.jpg") 339 | # Save the text 340 | with open(f"{output_folder}/prompt.txt", "w", encoding="utf-8") as file: 341 | file.write(text) 342 | if verbose: 343 | print(f"[thepipe] {calculate_tokens(chunks)} tokens saved to {output_folder}") 344 | 345 | 346 | def parse_arguments() -> argparse.Namespace: 347 | parser = argparse.ArgumentParser( 348 | description="Compress project files into a context prompt." 349 | ) 350 | parser.add_argument( 351 | "source", type=str, help="The source file or directory to compress." 
352 | ) 353 | parser.add_argument( 354 | "--inclusion_pattern", 355 | type=str, 356 | default=None, 357 | help="Regex pattern to match in a directory.", 358 | ) 359 | parser.add_argument( 360 | "--ai_extraction", 361 | action="store_true", 362 | help="Use ai_extraction to extract text from images.", 363 | ) 364 | parser.add_argument("--text_only", action="store_true", help="Only store text.") 365 | parser.add_argument("--verbose", action="store_true", help="Print status messages.") 366 | parser.add_argument("--local", action="store_true", help="Print status messages.") 367 | args = parser.parse_args() 368 | return args 369 | -------------------------------------------------------------------------------- /tests/test_scraper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from typing import cast 4 | import unittest 5 | import os 6 | import sys 7 | import zipfile 8 | from PIL import Image 9 | import pandas as pd 10 | 11 | try: 12 | from openai import OpenAI 13 | except ImportError: # pragma: no cover - optional in CI 14 | OpenAI = None # type: ignore[assignment] 15 | 16 | try: 17 | import whisper # noqa: F401 # pragma: no cover - optional dependency 18 | 19 | HAS_WHISPER = True 20 | except ImportError: # pragma: no cover - optional dependency 21 | HAS_WHISPER = False 22 | 23 | SKIP_NETWORK_TESTS = os.getenv("THEPIPE_SKIP_NETWORK_TESTS") == "1" 24 | 25 | sys.path.append("..") 26 | import thepipe.core as core 27 | import thepipe.scraper as scraper 28 | 29 | 30 | class test_scraper(unittest.TestCase): 31 | def setUp(self): 32 | self.files_directory = os.path.join(os.path.dirname(__file__), "files") 33 | self.outputs_directory = "outputs" 34 | # create a client we can re-use for ai_extraction scenarios 35 | self.client = OpenAI() if OpenAI is not None else None 36 | 37 | def tearDown(self): 38 | # clean up outputs 39 | if os.path.exists(self.outputs_directory): 40 | for file in os.listdir(self.outputs_directory): 41 | os.remove(os.path.join(self.outputs_directory, file)) 42 | os.rmdir(self.outputs_directory) 43 | 44 | def test_scrape_directory(self): 45 | # verify scraping entire example directory, bar the 'unknown' file 46 | chunks = scraper.scrape_directory( 47 | dir_path=self.files_directory, inclusion_pattern="^(?!.*unknown).*" 48 | ) 49 | self.assertIsInstance(chunks, list) 50 | self.assertGreater(len(chunks), 0) 51 | for chunk in chunks: 52 | self.assertIsInstance(chunk, core.Chunk) 53 | # ensure at least one of text/images is non-empty 54 | if not (chunk.text or chunk.images): 55 | self.fail("Empty chunk found: {}".format(chunk.path)) 56 | self.assertTrue(chunk.text or chunk.images) 57 | 58 | def test_scrape_directory_inclusion_exclusion(self): 59 | with tempfile.TemporaryDirectory() as tmp: 60 | # ignored folder 61 | os.makedirs(os.path.join(tmp, "node_modules")) 62 | with open(os.path.join(tmp, "node_modules", "a.txt"), "w") as f: 63 | f.write("x") 64 | # ignored extension 65 | with open(os.path.join(tmp, "bad.pyc"), "w") as f: 66 | f.write("x") 67 | # valid file 68 | good = os.path.join(tmp, "good.txt") 69 | with open(good, "w") as f: 70 | f.write("Y") 71 | 72 | chunks = scraper.scrape_directory(tmp, inclusion_pattern="good") 73 | 74 | self.assertEqual(len(chunks), 1) 75 | 76 | # cast .text to str so Pylance knows it's not None 77 | text = cast(str, chunks[0].text) 78 | self.assertIn("Y", text) 79 | 80 | def test_scrape_html(self): 81 | filepath = os.path.join(self.files_directory, "example.html") 82 | chunks 
= scraper.scrape_file(filepath, verbose=True) 83 | # verify it scraped the url into chunks 84 | self.assertIsInstance(chunks, list) 85 | self.assertGreater(len(chunks), 0) 86 | # verify it scraped markdown data 87 | self.assertTrue(any(chunk.text and len(chunk.text) > 0 for chunk in chunks)) 88 | # verify it scraped to markdown correctly 89 | self.assertTrue(any("# Heading 1" in (chunk.text or "") for chunk in chunks)) 90 | self.assertTrue(any("## Heading 2" in (chunk.text or "") for chunk in chunks)) 91 | self.assertTrue(any("### Heading 3" in (chunk.text or "") for chunk in chunks)) 92 | self.assertTrue( 93 | any("| Name | Age | Country |" in (chunk.text or "") for chunk in chunks) 94 | ) 95 | # verify bold and italic 96 | self.assertTrue(any("**bold text**" in (chunk.text or "") for chunk in chunks)) 97 | self.assertTrue(any("*italic text*" in (chunk.text or "") for chunk in chunks)) 98 | # ensure javascript was not scraped 99 | self.assertFalse( 100 | any("function highlightText()" in (chunk.text or "") for chunk in chunks) 101 | ) 102 | 103 | def test_scrape_zip(self): 104 | with tempfile.TemporaryDirectory() as tmp: 105 | txt = os.path.join(tmp, "a.txt") 106 | with open(txt, "w") as f: 107 | f.write("TXT") 108 | imgf = os.path.join(tmp, "i.jpg") 109 | Image.new("RGB", (10, 10)).save(imgf) 110 | zf = os.path.join(tmp, "test.zip") 111 | with zipfile.ZipFile(zf, "w") as z: 112 | z.write(txt, arcname="a.txt") 113 | z.write(imgf, arcname="i.jpg") 114 | chunks = scraper.scrape_file(zf) 115 | 116 | self.assertTrue(any("TXT" in cast(str, c.text) for c in chunks)) 117 | self.assertTrue(any(c.images for c in chunks)) 118 | 119 | def test_scrape_spreadsheet(self): 120 | with tempfile.TemporaryDirectory() as tmp: 121 | df = pd.DataFrame({"a": [1, 2]}) 122 | csvp = os.path.join(tmp, "t.csv") 123 | df.to_csv(csvp, index=False) 124 | chunks_csv = scraper.scrape_spreadsheet(csvp, "application/vnd.ms-excel") 125 | self.assertEqual(len(chunks_csv), 2) 126 | for i, c in enumerate(chunks_csv): 127 | self.assertIsNotNone(c.text) 128 | rec = json.loads(cast(str, c.text)) 129 | self.assertEqual(rec["a"], i + 1) 130 | self.assertEqual(rec["row index"], i) 131 | 132 | xlsx = os.path.join(tmp, "t.xlsx") 133 | df.to_excel(xlsx, index=False) 134 | chunks_xlsx = scraper.scrape_spreadsheet( 135 | xlsx, 136 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 137 | ) 138 | self.assertEqual(len(chunks_xlsx), 2) 139 | 140 | def test_scrape_ipynb(self): 141 | chunks = scraper.scrape_file( 142 | os.path.join(self.files_directory, "example.ipynb"), verbose=True 143 | ) 144 | # verify it scraped the ipynb file into chunks 145 | self.assertIsInstance(chunks, list) 146 | self.assertGreater(len(chunks), 0) 147 | self.assertIsInstance(chunks[0], core.Chunk) 148 | # verify it scraped text data 149 | self.assertTrue( 150 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 151 | ) 152 | # verify it scraped image data 153 | self.assertTrue( 154 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 155 | ) 156 | 157 | # requires LLM server to be set up 158 | @unittest.skipIf( 159 | OpenAI is None or not os.getenv("OPENAI_API_KEY"), "OpenAI API key required" 160 | ) 161 | def test_scrape_pdf_with_ai_extraction(self): 162 | chunks = scraper.scrape_file( 163 | os.path.join(self.files_directory, "example.pdf"), 164 | verbose=True, 165 | openai_client=self.client, 166 | ) 167 | # verify it scraped the pdf file into chunks 168 | self.assertIsInstance(chunks, list) 169 | 
self.assertGreater(len(chunks), 0) 170 | self.assertIsInstance(chunks[0], core.Chunk) 171 | # verify it scraped the data 172 | for chunk in chunks: 173 | self.assertTrue( 174 | (chunk.text and len(chunk.text or "") > 0) 175 | or (chunk.images and len(chunk.images or []) > 0) 176 | ) 177 | 178 | def test_scrape_docx(self): 179 | chunks = scraper.scrape_file( 180 | os.path.join(self.files_directory, "example.docx"), verbose=True 181 | ) 182 | # verify it scraped the docx file into chunks 183 | self.assertIsInstance(chunks, list) 184 | self.assertGreater(len(chunks), 0) 185 | self.assertIsInstance(chunks[0], core.Chunk) 186 | # verify it scraped data 187 | self.assertTrue( 188 | any(len(chunk.text or "") or len(chunk.images or []) for chunk in chunks) 189 | ) 190 | 191 | def test_extract_pdf_without_ai_extraction(self): 192 | chunks = scraper.scrape_file( 193 | os.path.join(self.files_directory, "example.pdf"), 194 | verbose=True, 195 | ) 196 | # verify it scraped the pdf file into chunks 197 | self.assertIsInstance(chunks, list) 198 | self.assertGreater(len(chunks), 0) 199 | self.assertIsInstance(chunks[0], core.Chunk) 200 | # verify it scraped text data 201 | self.assertTrue( 202 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 203 | ) 204 | # verify it scraped image data 205 | self.assertTrue( 206 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 207 | ) 208 | 209 | @unittest.skipUnless(HAS_WHISPER, "Whisper extra is not installed") 210 | def test_scrape_audio(self): 211 | chunks = scraper.scrape_file( 212 | os.path.join(self.files_directory, "example.mp3"), verbose=True 213 | ) 214 | # verify it scraped the audio file into chunks 215 | self.assertIsInstance(chunks, list) 216 | self.assertGreater(len(chunks), 0) 217 | self.assertIsInstance(chunks[0], core.Chunk) 218 | # verify it scraped audio data 219 | self.assertTrue( 220 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 221 | ) 222 | # verify it transcribed the audio correctly 223 | self.assertTrue( 224 | any(chunk.text and "citizens" in chunk.text.lower() for chunk in chunks) 225 | ) 226 | 227 | @unittest.skipUnless(HAS_WHISPER, "Whisper extra is not installed") 228 | def test_scrape_video(self): 229 | chunks = scraper.scrape_file( 230 | os.path.join(self.files_directory, "example.mp4"), verbose=True 231 | ) 232 | # verify it scraped the video file into chunks 233 | self.assertIsInstance(chunks, list) 234 | self.assertGreater(len(chunks), 0) 235 | self.assertIsInstance(chunks[0], core.Chunk) 236 | # verify it scraped visual data 237 | self.assertTrue( 238 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 239 | ) 240 | # verify it scraped audio data 241 | self.assertTrue( 242 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 243 | ) 244 | # verify it transcribed the audio correctly 245 | self.assertTrue( 246 | any(chunk.text and "citizens" in chunk.text.lower() for chunk in chunks) 247 | ) 248 | 249 | def test_scrape_pptx(self): 250 | chunks = scraper.scrape_file( 251 | os.path.join(self.files_directory, "example.pptx"), verbose=True 252 | ) 253 | # verify it scraped the pptx file into chunks 254 | self.assertIsInstance(chunks, list) 255 | self.assertGreater(len(chunks), 0) 256 | self.assertIsInstance(chunks[0], core.Chunk) 257 | # verify it scraped text data 258 | self.assertTrue( 259 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 260 | ) 261 | # verify it scraped image data 262 | self.assertTrue( 263 | 
any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 264 | ) 265 | 266 | @unittest.skipIf(SKIP_NETWORK_TESTS, "Network tests disabled") 267 | def test_scrape_tweet(self): 268 | tweet_url = "https://x.com/ylecun/status/1796734866156843480" 269 | chunks = scraper.scrape_url(tweet_url) 270 | # verify it returned chunks representing the tweet 271 | self.assertIsInstance(chunks, list) 272 | self.assertGreater(len(chunks), 0) 273 | self.assertIsInstance(chunks[0], core.Chunk) 274 | # verify it scraped the tweet contents 275 | self.assertTrue(chunks[0].text and len(chunks[0].text or "") > 0) 276 | self.assertTrue(chunks[0].images and len(chunks[0].images or []) > 0) 277 | 278 | @unittest.skipIf(SKIP_NETWORK_TESTS, "Network tests disabled") 279 | def test_scrape_url(self): 280 | # verify web page scrape result 281 | chunks = scraper.scrape_url("https://en.wikipedia.org/wiki/Piping") 282 | 283 | # Basic checks 284 | self.assertIsInstance(chunks, list) 285 | self.assertGreater(len(chunks), 0, "No chunks were extracted") 286 | 287 | for chunk in chunks: 288 | self.assertIsInstance(chunk, core.Chunk) 289 | self.assertEqual(chunk.path, "https://en.wikipedia.org/wiki/Piping") 290 | 291 | # Check that we got some content 292 | has_text = any(chunk.text and len(chunk.text.strip()) > 0 for chunk in chunks) 293 | has_images = any(chunk.images and len(chunk.images) > 0 for chunk in chunks) 294 | 295 | self.assertTrue(has_text or has_images, "No text or images were extracted") 296 | 297 | # More flexible search for piping-related content 298 | if has_text: 299 | all_text = " ".join(chunk.text or "" for chunk in chunks).lower() 300 | piping_keywords = [ 301 | "pipe", 302 | "piping", 303 | "pipeline", 304 | "plumbing", 305 | "tube", 306 | "conduit", 307 | ] 308 | found_keyword = any(keyword in all_text for keyword in piping_keywords) 309 | self.assertTrue( 310 | found_keyword, 311 | f"No piping-related keywords found in text. 
Text preview: {all_text[:200]}...", 312 | ) 313 | 314 | # verify if at least one image was scraped 315 | self.assertTrue( 316 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 317 | ) 318 | 319 | @unittest.skipIf(SKIP_NETWORK_TESTS, "Network tests disabled") 320 | @unittest.skipIf( 321 | OpenAI is None or not os.getenv("OPENAI_API_KEY"), "OpenAI API key required" 322 | ) 323 | def test_scrape_url_with_ai_extraction(self): 324 | # verify web page scrape result with ai extraction 325 | chunks = scraper.scrape_url( 326 | "https://en.wikipedia.org/wiki/Piping", openai_client=self.client 327 | ) 328 | for chunk in chunks: 329 | self.assertIsInstance(chunk, core.Chunk) 330 | self.assertEqual(chunk.path, "https://en.wikipedia.org/wiki/Piping") 331 | # assert if any of the texts contains 'pipe' 332 | print("test_scrape_url_with_ai_extraction chunks:", chunks) 333 | self.assertTrue( 334 | any(chunk.text and "pipe" in chunk.text.lower() for chunk in chunks) 335 | ) 336 | # verify if at least one image was scraped 337 | self.assertTrue( 338 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 339 | ) 340 | 341 | @unittest.skipUnless(os.environ.get("GITHUB_TOKEN"), "requires GITHUB_TOKEN") 342 | def test_scrape_github(self): 343 | chunks = scraper.scrape_url("https://github.com/emcf/thepipe") 344 | self.assertIsInstance(chunks, list) 345 | self.assertGreater(len(chunks), 0) # should have some repo contents 346 | 347 | 348 | if __name__ == "__main__": 349 | unittest.main() 350 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 | Pipeline Illustration (logo)
4 | # thepi.pe
5 |
6 | Badges: python-gh-action, codecov, MIT license, PyPI
19 | 20 | ## Extract clean data from tricky documents ⚡ 21 | 22 | thepi.pe is a package that can scrape clean markdown, multimodal media, and structured data from complex documents. It uses vision-language models (VLMs) under the hood for superior output quality, and works out-of-the-box with any LLM, VLM, or vector database. It can extract well-formatted data from a wide range of sources, including PDFs, URLs, Word docs, Powerpoints, Python notebooks, videos, audio, and more. 23 | 24 | ## Features 🌟 25 | 26 | - Scrape clean markdown, tables, and images from any document 27 | - Scrape text, images, video, and audio from any file or URL 28 | - Works out-of-the-box with vision-language models, vector databases, and RAG frameworks 29 | - AI-native file-type detection, layout analysis, and structured data extraction 30 | - Accepts a wide range of sources, including PDFs, URLs, Word docs, Powerpoints, Python notebooks, GitHub repos, videos, audio, and more 31 | 32 | ## Get started in 5 minutes 🚀 33 | 34 | Thepipe can be installed via the command line: 35 | 36 | ```bash 37 | pip install thepipe-api 38 | ``` 39 | 40 | The default install only pulls in CPU-friendly dependencies so it is suitable for constrained environments and CI systems. GPU-enabled libraries such as PyTorch and Triton are left as optional extras. 41 | 42 | ### Optional extras 43 | 44 | The package exposes a set of extras so you can opt-in to heavier dependencies on demand: 45 | 46 | | Extra | Installs | When to use it | 47 | | -------------------------- | ----------------------------------------- | ----------------------------------------------------- | 48 | | `thepipe-api[audio]` | `openai-whisper` | Local audio/video transcription via Whisper. | 49 | | `thepipe-api[semantic]` | `sentence-transformers` | Semantic chunking with transformer embeddings. | 50 | | `thepipe-api[llama-index]` | `llama-index` | `Chunk.to_llamaindex()` conversions. | 51 | | `thepipe-api[gpu]` | PyTorch + Whisper + Sentence Transformers | Full GPU acceleration with VLM fine-tuning workloads. | 52 | 53 | If you are targeting CPU-only machines but still need the extras that depend on PyTorch, install the CPU wheels directly from the PyTorch index first and then add the extra. For example: 54 | 55 | ```bash 56 | pip install torch==2.5.1+cpu torchvision==0.20.1+cpu torchaudio==2.5.1+cpu \ 57 | --index-url https://download.pytorch.org/whl/cpu 58 | pip install thepipe-api[semantic] 59 | ``` 60 | 61 | If you need full functionality with media-rich sources such as webpages, video, and audio, you can choose to install the following system dependencies: 62 | 63 | ```bash 64 | apt-get update && apt-get install -y git ffmpeg 65 | python -m playwright install --with-deps chromium 66 | ``` 67 | 68 | and use the global installation with pip: 69 | 70 | ```bash 71 | pip install thepipe-api[all] 72 | ``` 73 | 74 | ### Default setup (OpenAI) 75 | 76 | By default, thepipe uses the [OpenAI API](https://platform.openai.com/docs/overview), so VLM features will work out-of-the-box provided you pass in an OpenAI client. 77 | 78 | ### Custom VLM server setup (OpenRouter, OpenLLM, etc.) 79 | 80 | If you wish to use a local vision-language model or a different cloud provider, you can provide a custom OpenAI client, for example, by setting the base url to `https://openrouter.ai/api/v1` for [OpenRouter](https://openrouter.ai/), or `http://localhost:3000/v1` for a local server such as [OpenLLM](https://github.com/bentoml/OpenLLM). 
Note that you must also pass your non-OpenAI provider's API key into the OpenAI client. The model name can be changed with the `model` parameter. By default, the model will be `gpt-4o`.
81 |
82 | ### Scraping
83 |
84 | ```python
85 | from thepipe.scraper import scrape_file
86 |
87 | # scrape text and page images from a PDF
88 | chunks = scrape_file(filepath="paper.pdf")
89 | ```
90 |
91 | For enhanced scraping with a vision-language model, you can pass in an OpenAI-compatible client and a model name.
92 |
93 | ```python
94 | from openai import OpenAI
95 | from thepipe.scraper import scrape_file
96 |
97 | # create an OpenAI-compatible client
98 | client = OpenAI()
99 |
100 | # scrape clean markdown and page images from a PDF
101 | chunks = scrape_file(
102 |     filepath="paper.pdf",
103 |     openai_client=client,
104 |     model="gpt-4o"
105 | )
106 | ```
107 |
108 | ### Chunking
109 |
110 | To satisfy token-limit constraints, the following chunking methods are available to split the content into smaller chunks.
111 |
112 | - `chunk_by_document`: Returns one chunk with the entire content of the file.
113 | - `chunk_by_page`: Returns one chunk for each page (for example: each webpage, PDF page, or PowerPoint slide).
114 | - `chunk_by_length`: Recursively splits chunks that exceed a maximum token length.
115 | - `chunk_by_section`: Splits chunks by markdown section.
116 | - `chunk_by_keywords`: Starts a new chunk at each line containing one of the given keywords.
117 | - `chunk_semantic` (experimental, requires [sentence-transformers](https://pypi.org/project/sentence-transformers/)): Returns chunks split by spikes in semantic changes, with a configurable threshold.
118 | - `chunk_agentic` (experimental, requires [OpenAI](https://pypi.org/project/openai/)): Returns chunks split by an LLM agent that attempts to find semantically meaningful sections.
119 |
120 | For example:
121 |
122 | ```python
123 | from thepipe.scraper import scrape_file
124 | from thepipe.chunker import chunk_by_document, chunk_by_page
125 |
126 | # optionally, pass in chunking_method
127 | # chunk_by_document returns one chunk for the entire document
128 | chunks = scrape_file(
129 |     filepath="paper.pdf",
130 |     chunking_method=chunk_by_document
131 | )
132 |
133 | # you can also re-chunk later.
134 | # chunk_by_page returns one chunk for each page (for example: each webpage, PDF page, or PowerPoint slide).
135 | chunks = chunk_by_page(chunks)
136 | ```
137 |
138 | ### OpenAI Chat Integration 🤖
139 |
140 | ```python
141 | from openai import OpenAI
142 | from thepipe.core import chunks_to_messages
143 |
144 | # Initialize OpenAI client
145 | client = OpenAI()
146 |
147 | # Use OpenAI-formatted chat messages
148 | messages = [{
149 |     "role": "user",
150 |     "content": [{
151 |         "type": "text",
152 |         "text": "What is the paper about?"
153 |     }]
154 | }]
155 |
156 | # Simply add the scraped chunks to the messages
157 | messages += chunks_to_messages(chunks)
158 |
159 | # Call LLM
160 | response = client.chat.completions.create(
161 |     model="gpt-4o",
162 |     messages=messages,
163 | )
164 | ```
165 |
166 | `chunks_to_messages` takes an optional `text_only` parameter to output only text from the source document. This is useful for downstream use with LLMs that lack multimodal capabilities.
167 |
168 | > ⚠️ **It is important to be mindful of your model's token limit.**
169 | > Be sure your prompt is within the token limit of your model. You can use chunking to split your messages into smaller chunks, as shown in the sketch below.
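For example, here is a minimal sketch of staying within a budget. It reuses `chunks` and `messages` from the examples above; the token budget shown is only an illustrative assumption and depends on your model.

```python
from thepipe.core import calculate_tokens, chunks_to_messages
from thepipe.chunker import chunk_by_length

# rough token estimate for the scraped chunks (counts text and images)
n_tokens = calculate_tokens(chunks)

# if the estimate exceeds the context budget, split oversized chunks
# (100k tokens is an arbitrary example budget, not a thepipe default)
if n_tokens > 100_000:
    chunks = chunk_by_length(chunks, max_tokens=10_000)

# rebuild the chat messages from the re-chunked content
messages += chunks_to_messages(chunks)
```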
170 |
171 | ### LlamaIndex Integration 🦙
172 |
173 | Install the optional extra and then call `.to_llamaindex`:
174 |
175 | ```bash
176 | pip install thepipe-api[llama-index]
177 | ```
178 |
179 | After installation, a chunk can be converted to LlamaIndex `Document`/`ImageDocument` objects with `.to_llamaindex`. Without the extra, a helpful error is raised instead of failing at import time.
180 |
181 | ### Structured extraction 🗂️
182 |
183 | Note that structured extraction is being deprecated and will be removed in future releases. The current implementation is a simple wrapper around OpenAI's chat API, which is not ideal for structured data extraction. We recommend OpenAI's [structured outputs](https://platform.openai.com/docs/guides/structured-outputs?api-mode=chat) for structured data extraction, or using [Trellis AI](https://runtrellis.com/) for automated workflows with structured data.
184 |
185 | ```python
186 | from thepipe.extract import extract
187 | from openai import OpenAI
188 |
189 | client = OpenAI()
190 |
191 | schema = {
192 |     "description": "string",
193 |     "amount_usd": "float"
194 | }
195 |
196 | results, tokens_used = extract(
197 |     chunks=chunks,
198 |     schema=schema,
199 |     multiple_extractions=True, # extract multiple rows of data per chunk
200 |     openai_client=client
201 | )
202 | ```
203 |
204 | ## Running the test suite 🧪
205 |
206 | Install the base requirements plus any extras you rely on, then execute:
207 |
208 | ```bash
209 | pip install -r requirements.txt
210 | python -m unittest discover
211 | ```
212 |
213 | Tests that depend on optional extras (Whisper, Sentence Transformers, LlamaIndex) or an OpenAI API key are skipped automatically when the corresponding dependency is unavailable.
214 |
215 | ## Sponsors
216 |
217 | Please consider supporting thepipe by [becoming a sponsor](mailto:emmett@thepi.pe).
218 | Your support helps me maintain and improve the project while helping the open-source community discover your work.
219 |
220 | Visit [Cal.com](https://cal.com/) for an open-source scheduling tool that helps you book meetings with ease. It's the perfect solution for busy professionals who want to streamline their scheduling process.
221 |
222 | Book us with Cal.com
223 |
224 | Looking for enterprise-ready document processing and intelligent automation? Discover how [Trellis AI](https://runtrellis.com/) can streamline your workflows and enhance productivity.
225 |
226 | ## How it works 🛠️
227 |
228 | thepipe uses a combination of computer-vision models and heuristics to scrape clean content from the source and process it for downstream use with [large language models](https://en.wikipedia.org/wiki/Large_language_model) or [vision-language models](https://en.wikipedia.org/wiki/Vision_transformer). You can feed these messages directly into the model, or chunk them for downstream storage in a vector database such as ChromaDB, or in a RAG framework such as LlamaIndex (see the sketch below).
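For example, here is a minimal sketch of the storage path. It assumes the third-party `chromadb` package (not a thepipe dependency) and lets Chroma embed each chunk's text with its default embedding model; the collection name and IDs are arbitrary.

```python
import chromadb  # third-party vector database, shown for illustration only

from thepipe.scraper import scrape_file

chunks = scrape_file(filepath="paper.pdf")

# store each chunk's text and source path in a local Chroma collection
client = chromadb.Client()
collection = client.get_or_create_collection("papers")
collection.add(
    ids=[f"paper-{i}" for i in range(len(chunks))],
    documents=[chunk.text or "" for chunk in chunks],
    metadatas=[{"path": chunk.path or ""} for chunk in chunks],
)
```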
229 |
230 | ## Supported File Types 📚
231 |
232 | | Source | Input types | Multimodal | Notes |
233 | | --- | --- | --- | --- |
234 | | Webpage | URLs starting with `http`, `https`, `ftp` | ✔️ | Scrapes markdown, images, and tables from web pages. AI extraction available by passing an OpenAI client for screenshot analysis |
235 | | PDF | `.pdf` | ✔️ | Extracts page markdown and page images. AI extraction available when an OpenAI client is supplied for complex or scanned documents |
236 | | Word Document | `.docx` | ✔️ | Extracts text, tables, and images |
237 | | PowerPoint | `.pptx` | ✔️ | Extracts text and images from slides |
238 | | Video | `.mp4`, `.mov`, `.wmv` | ✔️ | Uses Whisper for transcription and extracts frames |
239 | | Audio | `.mp3`, `.wav` | ✔️ | Uses Whisper for transcription |
240 | | Jupyter Notebook | `.ipynb` | ✔️ | Extracts markdown, code, outputs, and images |
241 | | Spreadsheet | `.csv`, `.xls`, `.xlsx` | ❌ | Converts each row to JSON format, including row index for each |
242 | | Plaintext | `.txt`, `.md`, `.rtf`, etc. | ❌ | Simple text extraction |
243 | | Image | `.jpg`, `.jpeg`, `.png` | ✔️ | Uses VLM for OCR in text-only mode |
244 | | ZIP File | `.zip` | ✔️ | Extracts and processes contained files |
245 | | Directory | any `path/to/folder` | ✔️ | Recursively processes all files in directory. Optionally use `inclusion_pattern` to pass regex strings for file inclusion rules. |
246 | | YouTube Video (known issues) | YouTube video URLs starting with `https://youtube.com` or `https://www.youtube.com` | ✔️ | Uses pytube for video download and Whisper for transcription. For consistent extraction, you may need to modify your `pytube` installation to send a valid user-agent header (see [this issue](https://github.com/pytube/pytube/issues/399)). |
247 | | Tweet | URLs starting with `https://twitter.com` or `https://x.com` | ✔️ | Uses unofficial API, may break unexpectedly |
248 | | GitHub Repository | GitHub repo URLs starting with `https://github.com` or `https://www.github.com` | ✔️ | Requires `GITHUB_TOKEN` environment variable |
249 |
250 | ## Configuration & Environment
251 |
252 | Set these environment variables to control API keys, hosting, and model defaults:
253 |
254 | ```bash
255 | # If you want longer-term image storage and hosting (saves to ./images and serves via HOST_URL)
256 | export HOST_IMAGES=true
257 |
258 | # GitHub token, used as the credential for scraping public/private repos via `scrape_url`
259 | export GITHUB_TOKEN=ghp_...
260 |
261 | # Control scraping defaults
262 | export DEFAULT_AI_MODEL=gpt-4o
263 | export DEFAULT_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
264 |
265 | # Filesize limit for scraped webpages, in MB (applies to URL scraping only)
266 | export FILESIZE_LIMIT_MB=50
267 |
268 | # Max duration (in seconds) for audio transcription
269 | export MAX_WHISPER_DURATION=600
274 | ``` 275 | 276 | ## CLI Usage 277 | 278 | `thepipe [options]` 279 | 280 | ### AI scraping options 281 | 282 | `--openai-api-key=KEY` To enable VLM scraping, pass in your OpenAI API key 283 | 284 | `--openai-model=MODEL` Model to use for scraping (default is `DEFAULT_AI_MODEL`, currently `gpt-4o`) 285 | 286 | `--openai-base-url=URL` Custom LLM endpoint, for local LLMs or hosted APIs like OpenRouter (default: https://api.openai.com/v1) 287 | 288 | `--ai_extraction` ⚠️ DEPRECATED; will get API key from `OPENAI_API_KEY` environment variable 289 | 290 | ### General scraping options 291 | 292 | `--text_only` Output text only (suppress images) 293 | 294 | `--inclusion_pattern=REGEX` Include only files whose \_full path\* matches REGEX (for dirs/zips) 295 | 296 | `--verbose` Print detailed progress messages 297 | 298 | ## Contributing 299 | 300 | This package is quite opinionated in its design and implementation. Some modules are tightly coupled to the overall architecture, while others are designed to be hacked. 301 | 302 | Before contributing, please create an issue on GitHub to discuss your ideas and how to best implement them. Pull requests that do not follow this process will be closed. 303 | -------------------------------------------------------------------------------- /thepipe/scraper.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast 2 | import base64 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from collections import OrderedDict 5 | from io import BytesIO, StringIO 6 | import math 7 | import re 8 | import fnmatch 9 | import os 10 | import tempfile 11 | from urllib.parse import urlparse 12 | import zipfile 13 | from PIL import Image 14 | import requests 15 | import json 16 | from .core import ( 17 | HOST_IMAGES, 18 | Chunk, 19 | make_image_url, 20 | DEFAULT_AI_MODEL, 21 | ) 22 | from .chunker import ( 23 | chunk_by_page, 24 | chunk_by_document, 25 | chunk_by_section, 26 | chunk_semantic, 27 | chunk_by_keywords, 28 | chunk_by_length, 29 | chunk_agentic, 30 | ) 31 | import tempfile 32 | import mimetypes 33 | import dotenv 34 | from magika import Magika 35 | import markdownify 36 | import fitz 37 | from openai import OpenAI 38 | from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam 39 | 40 | dotenv.load_dotenv() 41 | 42 | FOLDERS_TO_IGNORE = { 43 | "*node_modules*", 44 | "*.git*", 45 | "*venv*", 46 | "*.vscode*", 47 | "*pycache*", 48 | "*.ipynb_checkpoints", 49 | } 50 | FILES_TO_IGNORE = { 51 | ".gitignore", 52 | "*.bin", 53 | # Python compiled files 54 | "*.pyc", 55 | "*.pyo", 56 | "*.pyd", 57 | # Shared libraries and binaries 58 | "*.so", 59 | "*.dll", 60 | "*.exe", 61 | # Archives and packages 62 | "*.tar", 63 | "*.tar.gz", 64 | "*.egg-info", 65 | "package-lock.json", 66 | "package.json", 67 | # Lock, log, and metadata files 68 | "*.lock", 69 | "*.log", 70 | "Pipfile.lock", 71 | "requirements.lock", 72 | "*.exe", 73 | "*.dll", 74 | ".DS_Store", 75 | "Thumbs.db", 76 | } 77 | GITHUB_TOKEN: Optional[str] = os.getenv("GITHUB_TOKEN", None) 78 | USER_AGENT_STRING: str = os.getenv( 79 | "USER_AGENT_STRING", 80 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3", 81 | ) 82 | MAX_WHISPER_DURATION = int(os.getenv("MAX_WHISPER_DURATION", 600)) # 10 minutes 83 | 84 | TWITTER_DOMAINS = { 85 | "https://twitter.com", 86 | "https://www.twitter.com", 87 
| "https://x.com", 88 | "https://www.x.com", 89 | } 90 | YOUTUBE_DOMAINS = {"https://www.youtube.com", "https://youtube.com"} 91 | GITHUB_DOMAINS = {"https://github.com", "https://www.github.com"} 92 | SCRAPING_PROMPT = os.getenv( 93 | "SCRAPING_PROMPT", 94 | """A document is given. Please output the entire extracted contents from the document in detailed markdown format. 95 | Your accuracy is very important. Please be careful to not miss any content from the document. 96 | Be sure to correctly output a comprehensive format markdown for all the document contents (including, but not limited to, headers, paragraphs, lists, tables, menus, equations, full text contents, titles, subtitles, appendices, page breaks, columns, footers, page numbers, watermarks, footnotes, captions, annotations, images, figures, charts, shapes, form fields, content controls, signatures, etc.) 97 | Always reply immediately with only markdown. 98 | Do not give the markdown in a code block. Simply output the raw markdown immediately. 99 | Do not output anything else.""", 100 | ) 101 | FILESIZE_LIMIT_MB = int(os.getenv("FILESIZE_LIMIT_MB", 50)) # for url scraping only 102 | 103 | 104 | def _load_whisper(): 105 | try: 106 | import whisper 107 | except ImportError as exc: # pragma: no cover - optional dependency 108 | raise ImportError( 109 | "Audio and video transcription requires the optional dependency `openai-whisper`. " 110 | "Install it with `pip install thepipe-api[audio]` or include the `gpu` extra." 111 | ) from exc 112 | 113 | return whisper 114 | 115 | 116 | def detect_source_mimetype(source: str) -> str: 117 | # try to detect the file type by its extension 118 | _, extension = os.path.splitext(source) 119 | if extension: 120 | if extension == ".ipynb": 121 | # special case for notebooks, mimetypes is not familiar 122 | return "application/x-ipynb+json" 123 | guessed_mimetype, _ = mimetypes.guess_type(source) 124 | if guessed_mimetype: 125 | return guessed_mimetype 126 | # if that fails, try AI detection with Magika 127 | magika = Magika() 128 | with open(source, "rb") as file: 129 | result = magika.identify_bytes(file.read()) 130 | mimetype = result.output.mime_type 131 | return mimetype 132 | 133 | 134 | def scrape_file( 135 | filepath: str, 136 | verbose: bool = False, 137 | chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page, 138 | openai_client: Optional[OpenAI] = None, 139 | model: str = DEFAULT_AI_MODEL, 140 | include_input_images: bool = True, 141 | include_output_images: bool = True, 142 | ) -> List[Chunk]: 143 | """ 144 | Scrapes a file and returns a list of Chunk objects containing the text and images extracted from the file. 145 | 146 | Parameters 147 | ---------- 148 | filepath : str 149 | The path to the file to scrape. 150 | verbose : bool, optional 151 | If ``True``, prints verbose output. 152 | chunking_method : Callable, optional 153 | A function to chunk the scraped content. Defaults to chunk_by_page. 154 | openai_client : OpenAI, optional 155 | An OpenAI client instance for LLM processing. If provided, uses VLM to scrape PDFs. 156 | model : str, optional 157 | The LLM model name to use for processing. Defaults to DEFAULT_AI_MODEL. 158 | include_input_images : bool, optional 159 | If ``True``, includes input images in the messages sent to the LLM. 160 | include_output_images : bool, optional 161 | If ``True``, includes output images in the returned chunks. 162 | Returns 163 | ------- 164 | List[Chunk] 165 | A list of Chunk objects containing the scraped content. 
166 | """ 167 | # returns chunks of scraped content from any source (file, URL, etc.) 168 | scraped_chunks = [] 169 | source_mimetype = detect_source_mimetype(filepath) 170 | if source_mimetype is None: 171 | if verbose: 172 | print(f"[thepipe] Unsupported source type: {filepath}") 173 | return scraped_chunks 174 | if verbose: 175 | print(f"[thepipe] Scraping {source_mimetype}: {filepath}...") 176 | if source_mimetype == "application/pdf": 177 | scraped_chunks = scrape_pdf( 178 | file_path=filepath, 179 | verbose=verbose, 180 | model=model, 181 | openai_client=openai_client, 182 | include_input_images=include_input_images, 183 | include_output_images=include_output_images, 184 | ) 185 | elif ( 186 | source_mimetype 187 | == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 188 | ): 189 | scraped_chunks = scrape_docx( 190 | file_path=filepath, 191 | verbose=verbose, 192 | include_output_images=include_output_images, 193 | ) 194 | elif ( 195 | source_mimetype 196 | == "application/vnd.openxmlformats-officedocument.presentationml.presentation" 197 | ): 198 | scraped_chunks = scrape_pptx( 199 | file_path=filepath, 200 | verbose=verbose, 201 | include_output_images=include_output_images, 202 | ) 203 | elif source_mimetype.startswith("image/"): 204 | scraped_chunks = scrape_image(file_path=filepath) 205 | elif ( 206 | source_mimetype.startswith("application/vnd.ms-excel") 207 | or source_mimetype 208 | == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" 209 | ): 210 | scraped_chunks = scrape_spreadsheet( 211 | file_path=filepath, source_type=source_mimetype 212 | ) 213 | elif source_mimetype == "application/x-ipynb+json": 214 | scraped_chunks = scrape_ipynb( 215 | file_path=filepath, 216 | verbose=verbose, 217 | include_output_images=include_output_images, 218 | ) 219 | elif ( 220 | source_mimetype == "application/zip" 221 | or source_mimetype == "application/x-zip-compressed" 222 | ): 223 | scraped_chunks = scrape_zip( 224 | file_path=filepath, 225 | verbose=verbose, 226 | openai_client=openai_client, 227 | include_input_images=include_input_images, 228 | include_output_images=include_output_images, 229 | ) 230 | elif source_mimetype.startswith("video/"): 231 | scraped_chunks = scrape_video( 232 | file_path=filepath, 233 | verbose=verbose, 234 | include_output_images=include_output_images, 235 | ) 236 | elif source_mimetype.startswith("audio/"): 237 | scraped_chunks = scrape_audio(file_path=filepath, verbose=verbose) 238 | elif source_mimetype.startswith("text/html"): 239 | scraped_chunks = scrape_html( 240 | file_path=filepath, 241 | verbose=verbose, 242 | include_output_images=include_output_images, 243 | ) 244 | elif source_mimetype.startswith("text/"): 245 | scraped_chunks = scrape_plaintext(file_path=filepath) 246 | else: 247 | try: 248 | scraped_chunks = scrape_plaintext(file_path=filepath) 249 | except Exception as e: 250 | if verbose: 251 | print(f"[thepipe] Error extracting from {filepath}: {e}") 252 | if verbose: 253 | if scraped_chunks: 254 | print(f"[thepipe] Extracted from {filepath}") 255 | else: 256 | print(f"[thepipe] No content extracted from {filepath}") 257 | if chunking_method: 258 | scraped_chunks = chunking_method(scraped_chunks) 259 | return scraped_chunks 260 | 261 | 262 | def scrape_html( 263 | file_path: str, 264 | verbose: bool = False, 265 | include_output_images: bool = True, 266 | ) -> List[Chunk]: 267 | with open(file_path, "r", encoding="utf-8", errors="ignore") as file: 268 | html_content = file.read() 269 | 
markdown_content = markdownify.markdownify(html_content, heading_style="ATX") 270 | images = get_images_from_markdown(html_content) if include_output_images else [] 271 | return [Chunk(path=file_path, text=markdown_content, images=images)] 272 | 273 | 274 | def scrape_plaintext(file_path: str) -> List[Chunk]: 275 | with open(file_path, "r", encoding="utf-8", errors="ignore") as file: 276 | text = file.read() 277 | return [Chunk(path=file_path, text=text)] 278 | 279 | 280 | def scrape_directory( 281 | dir_path: str, 282 | inclusion_pattern: Optional[str] = None, 283 | verbose: bool = False, 284 | openai_client: Optional[OpenAI] = None, 285 | model: str = DEFAULT_AI_MODEL, 286 | include_input_images: bool = True, 287 | include_output_images: bool = True, 288 | ) -> List[Chunk]: 289 | """ 290 | inclusion_pattern: Optional regex string; only files whose path matches this pattern will be scraped. 291 | By default, ignores all files in baked-in constants FOLDERS_TO_IGNORE and FILES_TO_IGNORE. 292 | """ 293 | # compile the include pattern once 294 | pattern = re.compile(inclusion_pattern) if inclusion_pattern else None 295 | extraction: List[Chunk] = [] 296 | 297 | try: 298 | for entry in os.scandir(dir_path): 299 | path = entry.path 300 | 301 | # skip ignored directories 302 | if entry.is_dir() and any( 303 | fnmatch.fnmatch(entry.name, pat) for pat in FOLDERS_TO_IGNORE 304 | ): 305 | if verbose: 306 | print(f"[thepipe] Skipping ignored directory: {path}") 307 | continue 308 | 309 | # skip ignored files 310 | if entry.is_file() and any( 311 | fnmatch.fnmatch(entry.name, pat) for pat in FILES_TO_IGNORE 312 | ): 313 | if verbose: 314 | print(f"[thepipe] Skipping ignored file: {path}") 315 | continue 316 | 317 | if entry.is_file(): 318 | # if include_pattern is set, skip files that don't match 319 | if pattern and not pattern.search(path): 320 | if verbose: 321 | print(f"[thepipe] Skipping non-matching file: {path}") 322 | continue 323 | 324 | if verbose: 325 | print(f"[thepipe] Scraping file: {path}") 326 | extraction += scrape_file( 327 | filepath=path, 328 | verbose=verbose, 329 | openai_client=openai_client, 330 | model=model, 331 | include_input_images=include_input_images, 332 | include_output_images=include_output_images, 333 | ) 334 | 335 | elif entry.is_dir(): 336 | # recurse into subdirectory 337 | if verbose: 338 | print(f"[thepipe] Entering directory: {path}") 339 | extraction += scrape_directory( 340 | dir_path=path, 341 | inclusion_pattern=inclusion_pattern, 342 | verbose=verbose, 343 | openai_client=openai_client, 344 | model=model, 345 | include_input_images=include_input_images, 346 | include_output_images=include_output_images, 347 | ) 348 | except PermissionError as e: 349 | if verbose: 350 | print(f"[thepipe] Skipping {dir_path} (permission denied): {e}") 351 | 352 | return extraction 353 | 354 | 355 | def scrape_zip( 356 | file_path: str, 357 | inclusion_pattern: Optional[str] = None, 358 | verbose: bool = False, 359 | openai_client: Optional[OpenAI] = None, 360 | include_input_images: bool = True, 361 | include_output_images: bool = True, 362 | ) -> List[Chunk]: 363 | chunks = [] 364 | with tempfile.TemporaryDirectory() as temp_dir: 365 | with zipfile.ZipFile(file_path, "r") as zip_ref: 366 | zip_ref.extractall(temp_dir) 367 | chunks = scrape_directory( 368 | dir_path=temp_dir, 369 | inclusion_pattern=inclusion_pattern, 370 | verbose=verbose, 371 | openai_client=openai_client, 372 | include_input_images=include_input_images, 373 | include_output_images=include_output_images, 
374 | ) 375 | return chunks 376 | 377 | 378 | def scrape_pdf( 379 | file_path: str, 380 | openai_client: Optional[OpenAI] = None, 381 | model: str = DEFAULT_AI_MODEL, 382 | verbose: Optional[bool] = False, 383 | include_input_images: bool = True, 384 | include_output_images: bool = True, 385 | image_scale: float = 1.0, 386 | ) -> List[Chunk]: 387 | chunks: List[Chunk] = [] 388 | 389 | # Branch 1 – VLM path (OpenAI client supplied) 390 | if openai_client is not None: 391 | with open(file_path, "rb") as fp: 392 | pdf_bytes = fp.read() 393 | doc = fitz.open(stream=pdf_bytes, filetype="pdf") 394 | num_pages = len(doc) 395 | 396 | if verbose: 397 | print( 398 | f"[thepipe] Scraping PDF: {file_path} " 399 | f"({num_pages} pages) with model {model}" 400 | ) 401 | 402 | # Inner worker – processes one page 403 | def _process_page(page_num: int) -> Tuple[int, str, Optional[Image.Image]]: 404 | page = doc[page_num] 405 | text = page.get_text() # type: ignore[attr-defined] 406 | 407 | # Build message for the LLM 408 | msg_content: List[Dict[str, Union[Dict[str, str], str]]] = [ 409 | { 410 | "type": "text", 411 | "text": f"```\n{text}\n```\n{SCRAPING_PROMPT}", 412 | } 413 | ] 414 | 415 | image: Optional[Image.Image] = None 416 | if include_input_images or include_output_images: 417 | mat = fitz.Matrix(image_scale, image_scale) 418 | pix = page.get_pixmap(matrix=mat, alpha=False) # type: ignore[attr-defined] # noqa: E501 419 | image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) 420 | 421 | if include_input_images: 422 | encoded = make_image_url(image, host_images=HOST_IMAGES) 423 | msg_content.append( 424 | { 425 | "type": "image_url", 426 | "image_url": {"url": encoded, "detail": "high"}, 427 | } 428 | ) 429 | 430 | messages = cast( 431 | Iterable[ChatCompletionMessageParam], 432 | [{"role": "user", "content": msg_content}], 433 | ) 434 | 435 | response = openai_client.chat.completions.create( 436 | model=model, messages=messages 437 | ) 438 | 439 | llm_response = response.choices[0].message.content 440 | if not llm_response: 441 | raise RuntimeError("Empty LLM response.") 442 | 443 | llm_response = llm_response.strip() 444 | if llm_response.startswith("```markdown"): 445 | llm_response = llm_response[len("```markdown") :] 446 | elif llm_response.startswith("```"): 447 | llm_response = llm_response[len("```") :] 448 | if llm_response.endswith("```"): 449 | llm_response = llm_response[: -len("```")] 450 | 451 | return ( 452 | page_num, 453 | llm_response, 454 | image if include_output_images else None, 455 | ) 456 | 457 | # Parallel extraction 458 | max_workers = (os.cpu_count() or 1) * 2 459 | if verbose: 460 | print(f"[thepipe] Using {max_workers} threads for PDF extraction") 461 | 462 | page_results: OrderedDict[int, Tuple[str, Optional[Image.Image]]] = ( 463 | OrderedDict() 464 | ) 465 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 466 | futures = [executor.submit(_process_page, p) for p in range(num_pages)] 467 | for fut in as_completed(futures): 468 | pg, txt, img = fut.result() 469 | page_results[pg] = (txt, img) 470 | 471 | for pg in sorted(page_results): 472 | txt, img = page_results[pg] 473 | chunks.append(Chunk(path=file_path, text=txt, images=[img] if img else [])) 474 | 475 | return chunks 476 | 477 | # Branch 2 – no OpenAI client – text-only offline mode 478 | from pymupdf4llm.helpers.pymupdf_rag import to_markdown # local import 479 | 480 | doc = fitz.open(file_path) 481 | md_pages = cast(List[Dict[str, Any]], to_markdown(file_path, page_chunks=True)) 
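    # Note (illustrative comment, not in the original source): with
    # page_chunks=True, pymupdf4llm's to_markdown returns one dict per page;
    # each dict's "text" entry holds that page's markdown, which is consumed
    # as md_pages[i]["text"] in the loop below and optionally paired with a
    # rendered page image.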
482 | 483 | for i in range(doc.page_count): 484 | text = re.sub(r"\n{3,}", "\n\n", md_pages[i]["text"]).strip() 485 | 486 | images: List[Image.Image] = [] 487 | if include_output_images: 488 | mat = fitz.Matrix(image_scale, image_scale) 489 | pix = doc[i].get_pixmap(matrix=mat, alpha=False) # type: ignore[attr-defined] # noqa: E501 490 | images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples)) 491 | 492 | chunks.append(Chunk(path=file_path, text=text, images=images)) 493 | 494 | doc.close() 495 | return chunks 496 | 497 | 498 | def get_images_from_markdown(text: str) -> List[Image.Image]: 499 | image_urls = re.findall(r"!\[.*?\]\((.*?)\)", text) 500 | images = [] 501 | for url in image_urls: 502 | extension = os.path.splitext(urlparse(url).path)[1] 503 | if extension not in {".jpg", ".jpeg", ".png"}: 504 | # ignore incompatible image extractions 505 | continue 506 | 507 | try: 508 | response = requests.get( 509 | url, 510 | timeout=10, 511 | headers={"User-Agent": USER_AGENT_STRING}, 512 | ) 513 | response.raise_for_status() 514 | except Exception: 515 | continue 516 | 517 | img = Image.open(BytesIO(response.content)) 518 | images.append(img) 519 | return images 520 | 521 | 522 | def scrape_image(file_path: str) -> List[Chunk]: 523 | img = Image.open(file_path) 524 | img.load() # needed to close the file 525 | chunk = Chunk(path=file_path, images=[img]) 526 | return [chunk] 527 | 528 | 529 | def scrape_spreadsheet(file_path: str, source_type: str) -> List[Chunk]: 530 | import pandas as pd 531 | 532 | if source_type == "application/vnd.ms-excel": 533 | df = pd.read_csv(file_path) 534 | elif ( 535 | source_type 536 | == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" 537 | ): 538 | df = pd.read_excel(file_path) 539 | else: 540 | raise ValueError("Unsupported file format") 541 | dicts = df.to_dict(orient="records") 542 | chunks = [] 543 | for i, item in enumerate(dicts): 544 | # format each row as json along with the row index 545 | item["row index"] = i 546 | item_json = json.dumps(item, indent=4) 547 | chunks.append(Chunk(path=file_path, text=item_json)) 548 | return chunks 549 | 550 | 551 | def parse_webpage_with_vlm( 552 | url: str, 553 | model: str = DEFAULT_AI_MODEL, 554 | verbose: Optional[bool] = False, 555 | openai_client: Optional[OpenAI] = None, 556 | include_output_images: bool = True, 557 | ) -> Chunk: 558 | if openai_client is None: 559 | raise ValueError("parse_webpage_with_vlm requires an openai_client argument.") 560 | from playwright.sync_api import sync_playwright 561 | 562 | with sync_playwright() as p: 563 | browser = p.chromium.launch() 564 | context = browser.new_context(user_agent=USER_AGENT_STRING) 565 | page = context.new_page() 566 | page.goto(url, wait_until="domcontentloaded") 567 | if not page.viewport_size: 568 | page.set_viewport_size({"width": 800, "height": 600}) 569 | if not page.viewport_size: 570 | raise ValueError( 571 | "Failed to set viewport size after finding no viewport size" 572 | ) 573 | viewport_height = page.viewport_size.get("height", 800) 574 | total_height = page.evaluate("document.body.scrollHeight") 575 | current_scroll_position = 0 576 | scrolldowns, max_scrolldowns = 0, 3 577 | images: List[Image.Image] = [] 578 | 579 | while current_scroll_position < total_height and scrolldowns < max_scrolldowns: 580 | page.wait_for_timeout(200) # wait for content to load 581 | screenshot = page.screenshot(full_page=False) 582 | img = Image.open(BytesIO(screenshot)) 583 | images.append(img) 584 | 585 | 
current_scroll_position += viewport_height 586 | page.evaluate(f"window.scrollTo(0, {current_scroll_position})") 587 | scrolldowns += 1 588 | total_height = page.evaluate("document.body.scrollHeight") 589 | if verbose: 590 | print( 591 | f"[thepipe] Scrolled to {current_scroll_position} of {total_height}. Waiting for content to load..." 592 | ) 593 | 594 | browser.close() 595 | 596 | if images: 597 | # Vertically stack the images 598 | total_height = sum(img.height for img in images) 599 | max_width = max(img.width for img in images) 600 | stacked_image = Image.new("RGB", (max_width, total_height)) 601 | y_offset = 0 602 | for img in images: 603 | stacked_image.paste(img, (0, y_offset)) 604 | y_offset += img.height 605 | 606 | # Process the stacked image with VLM 607 | messages = [ 608 | { 609 | "role": "user", 610 | "content": [ 611 | { 612 | "type": "image_url", 613 | "image_url": { 614 | "url": make_image_url( 615 | stacked_image, host_images=HOST_IMAGES 616 | ), 617 | "detail": "high", 618 | }, 619 | }, 620 | {"type": "text", "text": SCRAPING_PROMPT}, 621 | ], 622 | }, 623 | ] 624 | response = openai_client.chat.completions.create( 625 | model=model, 626 | messages=cast(Iterable[ChatCompletionMessageParam], messages), 627 | ) 628 | llm_response = response.choices[0].message.content 629 | if not llm_response: 630 | raise Exception( 631 | f"Failed to receive a message content from LLM Response: {response}" 632 | ) 633 | if verbose: 634 | print(f"[thepipe] LLM response: {llm_response}") 635 | chunk = Chunk( 636 | path=url, 637 | text=llm_response, 638 | images=[stacked_image] if include_output_images else [], 639 | ) 640 | else: 641 | raise ValueError("Model received 0 images from webpage") 642 | 643 | return chunk 644 | 645 | 646 | def extract_page_content( 647 | url: str, verbose: bool = False, include_output_images: bool = True 648 | ) -> Chunk: 649 | from bs4 import BeautifulSoup 650 | from playwright.sync_api import sync_playwright 651 | import base64 652 | import requests 653 | 654 | texts: List[str] = [] 655 | images: List[Image.Image] = [] 656 | 657 | with sync_playwright() as p: 658 | browser = p.chromium.launch() 659 | context = browser.new_context(user_agent=USER_AGENT_STRING) 660 | page = context.new_page() 661 | 662 | try: 663 | page.goto(url, wait_until="domcontentloaded", timeout=10000) 664 | 665 | # Wait for content to load 666 | page.wait_for_timeout(1000) 667 | 668 | # Scroll to load dynamic content 669 | if not page.viewport_size: 670 | page.set_viewport_size({"width": 1200, "height": 800}) 671 | 672 | viewport_height = page.viewport_size["height"] 673 | total_height = page.evaluate("document.body.scrollHeight") 674 | current_scroll_position = 0 675 | scrolldowns, max_scrolldowns = 0, 5 676 | 677 | while ( 678 | current_scroll_position < total_height and scrolldowns < max_scrolldowns 679 | ): 680 | page.wait_for_timeout(500) 681 | current_scroll_position += viewport_height 682 | page.evaluate(f"window.scrollTo(0, {current_scroll_position})") 683 | scrolldowns += 1 684 | new_height = page.evaluate("document.body.scrollHeight") 685 | if new_height == total_height: 686 | break 687 | total_height = new_height 688 | 689 | # Extract HTML content 690 | html_content = page.content() 691 | 692 | # Parse with BeautifulSoup and clean up 693 | soup = BeautifulSoup(html_content, "html.parser") 694 | 695 | # Remove script and style elements 696 | for script in soup(["script", "style", "nav", "footer", "header"]): 697 | script.decompose() 698 | 699 | # Convert to markdown 700 | 
markdown_content = markdownify.markdownify(str(soup), heading_style="ATX") 701 | 702 | # Clean up markdown 703 | markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content) 704 | markdown_content = markdown_content.strip() 705 | 706 | if verbose: 707 | print( 708 | f"[thepipe] Extracted {len(markdown_content)} characters from {url}" 709 | ) 710 | 711 | texts.append(markdown_content) 712 | 713 | # Extract images from the page using heuristics 714 | if include_output_images: 715 | for img in page.query_selector_all("img"): 716 | img_path = img.get_attribute("src") 717 | if not img_path: 718 | continue 719 | if img_path.startswith("data:image"): 720 | # Save base64 image to PIL Image 721 | try: 722 | decoded_data = base64.b64decode(img_path.split(",")[1]) 723 | image = Image.open(BytesIO(decoded_data)) 724 | images.append(image) 725 | except Exception as e: 726 | if verbose: 727 | print( 728 | f"[thepipe] Ignoring error loading base64 image: {e}" 729 | ) 730 | continue 731 | else: 732 | try: 733 | # Try direct URL first 734 | response = requests.get( 735 | img_path, 736 | timeout=10, 737 | headers={"User-Agent": USER_AGENT_STRING}, 738 | ) 739 | response.raise_for_status() 740 | image = Image.open(BytesIO(response.content)) 741 | images.append(image) 742 | except Exception as e: 743 | if verbose: 744 | print(f"[thepipe] Error loading image {img_path}: {e}") 745 | print("[thepipe] Attempting to load path with schema.") 746 | 747 | # Try with schema if path is relative 748 | if not img_path.startswith(("http://", "https://")): 749 | try: 750 | # Remove leading slashes 751 | while img_path.startswith("/"): 752 | img_path = img_path[1:] 753 | 754 | # Try with just the scheme 755 | parsed_url = urlparse(url) 756 | path_with_schema = ( 757 | f"{parsed_url.scheme}://{img_path}" 758 | ) 759 | response = requests.get( 760 | path_with_schema, 761 | timeout=10, 762 | headers={"User-Agent": USER_AGENT_STRING}, 763 | ) 764 | response.raise_for_status() 765 | image = Image.open(BytesIO(response.content)) 766 | images.append(image) 767 | except Exception as e: 768 | if verbose: 769 | print( 770 | f"[thepipe] Error loading image {img_path} with schema: {e}" 771 | ) 772 | print( 773 | "[thepipe] Attempting to load with schema and netloc." 
774 | ) 775 | 776 | try: 777 | # Try with scheme and netloc 778 | path_with_schema_and_netloc = f"{parsed_url.scheme}://{parsed_url.netloc}/{img_path}" 779 | response = requests.get( 780 | path_with_schema_and_netloc, 781 | timeout=10, 782 | headers={"User-Agent": USER_AGENT_STRING}, 783 | ) 784 | response.raise_for_status() 785 | image = Image.open(BytesIO(response.content)) 786 | images.append(image) 787 | except Exception as e: 788 | if verbose: 789 | print( 790 | f"[thepipe] Final attempt failed for image {img_path}: {e}" 791 | ) 792 | continue 793 | else: 794 | if verbose: 795 | print( 796 | f"[thepipe] Skipping image {img_path} - all attempts failed" 797 | ) 798 | continue 799 | 800 | except Exception as e: 801 | if verbose: 802 | print(f"[thepipe] Error scraping {url}: {e}") 803 | # Fallback to simple requests 804 | try: 805 | response = requests.get( 806 | url, headers={"User-Agent": USER_AGENT_STRING}, timeout=30 807 | ) 808 | response.raise_for_status() 809 | soup = BeautifulSoup(response.content, "html.parser") 810 | 811 | # Remove unwanted elements 812 | for script in soup(["script", "style", "nav", "footer", "header"]): 813 | script.decompose() 814 | 815 | markdown_content = markdownify.markdownify( 816 | str(soup), heading_style="ATX" 817 | ) 818 | markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content).strip() 819 | texts.append(markdown_content) 820 | 821 | if verbose: 822 | print( 823 | f"[thepipe] Fallback extraction got {len(markdown_content)} characters" 824 | ) 825 | 826 | except Exception as fallback_e: 827 | if verbose: 828 | print(f"[thepipe] Fallback also failed: {fallback_e}") 829 | texts.append("") 830 | 831 | finally: 832 | browser.close() 833 | 834 | text = "\n".join(texts).strip() 835 | return Chunk(path=url, text=text, images=images) 836 | 837 | 838 | def scrape_url( 839 | url: str, 840 | verbose: bool = False, 841 | chunking_method: Callable[[List[Chunk]], List[Chunk]] = chunk_by_page, 842 | openai_client: Optional[OpenAI] = None, 843 | model: str = DEFAULT_AI_MODEL, 844 | include_input_images: bool = True, 845 | include_output_images: bool = True, 846 | ) -> List[Chunk]: 847 | if any(url.startswith(domain) for domain in TWITTER_DOMAINS): 848 | extraction = scrape_tweet(url=url, include_output_images=include_output_images) 849 | return extraction 850 | elif any(url.startswith(domain) for domain in YOUTUBE_DOMAINS): 851 | extraction = scrape_youtube( 852 | youtube_url=url, 853 | verbose=verbose, 854 | include_output_images=include_output_images, 855 | ) 856 | return extraction 857 | elif any(url.startswith(domain) for domain in GITHUB_DOMAINS): 858 | extraction = scrape_github( 859 | github_url=url, 860 | verbose=verbose, 861 | openai_client=openai_client, 862 | model=model, 863 | include_input_images=include_input_images, 864 | include_output_images=include_output_images, 865 | ) 866 | return extraction 867 | _, extension = os.path.splitext(urlparse(url).path) 868 | if extension and extension not in {".html", ".htm", ".php", ".asp", ".aspx"}: 869 | # if url leads to a file, attempt to download it and scrape it 870 | with tempfile.TemporaryDirectory() as temp_dir: 871 | file_path = os.path.join(temp_dir, os.path.basename(url)) 872 | response = requests.get(url) 873 | # verify the ingress/egress with be within limits, if there are any set 874 | response_length = int(response.headers.get("Content-Length", 0)) 875 | if FILESIZE_LIMIT_MB and response_length > FILESIZE_LIMIT_MB * 1024 * 1024: 876 | raise ValueError(f"File size exceeds {FILESIZE_LIMIT_MB} MB 
limit.") 877 | with open(file_path, "wb") as file: 878 | file.write(response.content) 879 | chunks = scrape_file( 880 | filepath=file_path, 881 | verbose=verbose, 882 | chunking_method=chunking_method, 883 | openai_client=openai_client, 884 | model=model, 885 | include_input_images=include_input_images, 886 | include_output_images=include_output_images, 887 | ) 888 | return chunks 889 | else: 890 | # if url leads to web content, scrape it directly 891 | if openai_client and include_input_images: 892 | chunk = parse_webpage_with_vlm( 893 | url=url, 894 | verbose=verbose, 895 | model=model, 896 | openai_client=openai_client, 897 | include_output_images=include_output_images, 898 | ) 899 | else: 900 | chunk = extract_page_content( 901 | url=url, verbose=verbose, include_output_images=include_output_images 902 | ) 903 | chunks = chunking_method([chunk]) 904 | # if no text or images were extracted, return error 905 | if not any(chunk.text for chunk in chunks) and not any( 906 | chunk.images for chunk in chunks 907 | ): 908 | raise ValueError("No content extracted from URL.") 909 | return chunks 910 | 911 | 912 | def format_timestamp(seconds, chunk_index, chunk_duration): 913 | # helper function to format the timestamp. 914 | total_seconds = chunk_index * chunk_duration + seconds 915 | hours = int(total_seconds // 3600) 916 | minutes = int((total_seconds % 3600) // 60) 917 | seconds = total_seconds % 60 918 | milliseconds = int((seconds - int(seconds)) * 1000) 919 | return f"{hours:02}:{minutes:02}:{int(seconds):02}.{milliseconds:03}" 920 | 921 | 922 | def scrape_video( 923 | file_path: str, 924 | verbose: bool = False, 925 | include_output_images: bool = True, 926 | ) -> List[Chunk]: 927 | whisper = _load_whisper() 928 | from moviepy.editor import VideoFileClip 929 | 930 | # Splits the video into chunks of length MAX_WHISPER_DURATION, extracts 931 | # one representative frame from the start of each chunk, and then transcribes 932 | # that chunk. 
933 | model = whisper.load_model("base") 934 | video = VideoFileClip(file_path) 935 | num_chunks = math.ceil(video.duration / MAX_WHISPER_DURATION) 936 | chunks = [] 937 | 938 | try: 939 | for i in range(num_chunks): 940 | # Calculate the start and end time of the chunk 941 | start_time = i * MAX_WHISPER_DURATION 942 | end_time = start_time + MAX_WHISPER_DURATION 943 | if end_time > video.duration: 944 | end_time = video.duration 945 | 946 | # Extract a frame from the start of the chunk 947 | image = None 948 | if include_output_images: 949 | frame = video.get_frame(start_time) 950 | image = Image.fromarray(frame) 951 | 952 | # Save the audio to a temporary .wav file 953 | with tempfile.NamedTemporaryFile( 954 | suffix=".wav", delete=False 955 | ) as temp_audio_file: 956 | audio_path = temp_audio_file.name 957 | 958 | audio = video.subclip(start_time, end_time).audio # type: ignore[attr-defined] 959 | transcription = None 960 | 961 | if audio is not None: 962 | audio.write_audiofile(audio_path, codec="pcm_s16le") 963 | result = model.transcribe(audio=audio_path, verbose=verbose) 964 | 965 | # Format transcription with timestamps 966 | formatted_transcription = [] 967 | for segment in cast(List[Dict[str, Any]], result["segments"]): 968 | seg_start = format_timestamp( 969 | segment["start"], i, MAX_WHISPER_DURATION 970 | ) 971 | seg_end = format_timestamp(segment["end"], i, MAX_WHISPER_DURATION) 972 | formatted_transcription.append( 973 | f"[{seg_start} --> {seg_end}] {segment['text']}" 974 | ) 975 | 976 | transcription = "\n".join(formatted_transcription) 977 | os.remove(audio_path) 978 | 979 | # Only add chunks if there is either text or images 980 | if transcription or image: 981 | chunks.append( 982 | Chunk( 983 | path=file_path, 984 | text=transcription if transcription else None, 985 | images=[image] if image else [], 986 | ) 987 | ) 988 | finally: 989 | video.close() 990 | 991 | return chunks 992 | 993 | 994 | def scrape_youtube( 995 | youtube_url: str, 996 | verbose: bool = False, 997 | include_output_images: bool = True, 998 | ) -> List[Chunk]: 999 | from pytube import YouTube 1000 | 1001 | with tempfile.TemporaryDirectory() as temp_dir: 1002 | filename = "temp_video.mp4" 1003 | yt = YouTube(youtube_url) 1004 | stream = yt.streams.filter(progressive=True, file_extension="mp4").first() 1005 | if stream is None: 1006 | raise ValueError("No progressive stream for video found.") 1007 | stream.download(temp_dir, filename=filename) 1008 | video_path = os.path.join(temp_dir, filename) 1009 | chunks = scrape_video( 1010 | file_path=video_path, 1011 | verbose=verbose, 1012 | include_output_images=include_output_images, 1013 | ) 1014 | return chunks 1015 | 1016 | 1017 | def scrape_audio(file_path: str, verbose: bool = False) -> List[Chunk]: 1018 | whisper = _load_whisper() 1019 | 1020 | model = whisper.load_model("base") 1021 | result = model.transcribe(audio=file_path, verbose=verbose) 1022 | segments = cast(List[Dict[str, Any]], result.get("segments", [])) 1023 | 1024 | transcript: List[str] = [] 1025 | for segment in segments: 1026 | start = format_timestamp(segment["start"], 0, 0) 1027 | end = format_timestamp(segment["end"], 0, 0) 1028 | if segment["text"].strip(): 1029 | transcript.append(f"[{start} --> {end}] {segment['text']}") 1030 | # join the formatted transcription into a single string 1031 | transcript_text = "\n".join(transcript) 1032 | return [Chunk(path=file_path, text=transcript_text)] 1033 | 1034 | 1035 | def scrape_github( 1036 | github_url: str, 1037 | inclusion_pattern: 
Optional[str] = None, 1038 | branch: str = "main", 1039 | verbose: bool = False, 1040 | openai_client: Optional[OpenAI] = None, 1041 | model: str = DEFAULT_AI_MODEL, 1042 | include_input_images: bool = True, 1043 | include_output_images: bool = True, 1044 | ) -> List[Chunk]: 1045 | files_contents: List[Chunk] = [] 1046 | if not GITHUB_TOKEN: 1047 | raise ValueError("GITHUB_TOKEN environment variable is not set.") 1048 | # make new tempdir for cloned repo 1049 | with tempfile.TemporaryDirectory() as temp_dir: 1050 | # requires git 1051 | exit_code = os.system( 1052 | f'git clone --branch "{branch}" --single-branch {github_url} "{temp_dir}" --quiet' 1053 | ) 1054 | if exit_code != 0: 1055 | raise RuntimeError( 1056 | f"git clone failed for {github_url} at branch '{branch}'. " 1057 | "Verify the repository URL and branch name." 1058 | ) 1059 | files_contents = scrape_directory( 1060 | dir_path=temp_dir, 1061 | inclusion_pattern=inclusion_pattern, 1062 | verbose=verbose, 1063 | openai_client=openai_client, 1064 | model=model, 1065 | include_input_images=include_input_images, 1066 | include_output_images=include_output_images, 1067 | ) 1068 | return files_contents 1069 | 1070 | 1071 | def scrape_docx( 1072 | file_path: str, 1073 | verbose: bool = False, 1074 | include_output_images: bool = True, 1075 | ) -> List[Chunk]: 1076 | from docx import Document 1077 | from docx.oxml.table import CT_Tbl 1078 | from docx.oxml.text.paragraph import CT_P 1079 | from docx.table import Table, _Cell 1080 | from docx.text.paragraph import Paragraph 1081 | import csv 1082 | import io 1083 | 1084 | # helper function to iterate through blocks in the document 1085 | def iter_block_items(parent): 1086 | if parent.__class__.__name__ == "Document": 1087 | parent_elm = parent.element.body 1088 | elif parent.__class__.__name__ == "_Cell": 1089 | parent_elm = parent._tc 1090 | else: 1091 | raise ValueError("Unsupported parent type") 1092 | # iterate through each child element in the parent element 1093 | for child in parent_elm.iterchildren(): 1094 | child_elem_class_name = child.__class__.__name__ 1095 | if verbose: 1096 | print(f"[thepipe] Found element in docx: {child_elem_class_name}") 1097 | if child_elem_class_name == "CT_P": 1098 | yield Paragraph(child, parent) 1099 | elif child_elem_class_name == "CT_Tbl": 1100 | yield Table(child, parent) 1101 | 1102 | # helper function to read tables in the document 1103 | def read_docx_tables(tab): 1104 | vf = StringIO() 1105 | writer = csv.writer(vf) 1106 | for row in tab.rows: 1107 | writer.writerow(cell.text for cell in row.cells) 1108 | vf.seek(0) 1109 | return vf.getvalue() 1110 | 1111 | # read the document 1112 | document = Document(file_path) 1113 | chunks = [] 1114 | image_counter = 0 1115 | 1116 | # Define namespaces 1117 | nsmap = { 1118 | "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", 1119 | "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture", 1120 | "a": "http://schemas.openxmlformats.org/drawingml/2006/main", 1121 | } 1122 | 1123 | try: 1124 | # scrape each block in the document to create chunks 1125 | # A block can be a paragraph, table, or image 1126 | for block in iter_block_items(document): 1127 | block_texts = [] 1128 | block_images = [] 1129 | if isinstance(block, Paragraph): 1130 | block_texts.append(block.text) 1131 | # "runs" are the smallest units in a paragraph 1132 | for run in block.runs: 1133 | if "pic:pic" in run.element.xml and include_output_images: 1134 | # extract images from the paragraph 1135 | for pic in 
run.element.findall(".//pic:pic", nsmap): 1136 | cNvPr = pic.find(".//pic:cNvPr", nsmap) 1137 | name_attr = ( 1138 | cNvPr.get("name") 1139 | if cNvPr is not None 1140 | else f"image_{image_counter}" 1141 | ) 1142 | blip = pic.find(".//a:blip", nsmap) 1143 | if blip is not None: 1144 | embed_attr = blip.get( 1145 | "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" 1146 | ) 1147 | if embed_attr: 1148 | image_part = document.part.related_parts[embed_attr] 1149 | image_data = BytesIO(image_part._blob) 1150 | image = Image.open(image_data) 1151 | image.load() 1152 | block_images.append(image) 1153 | image_counter += 1 1154 | elif isinstance(block, Table): 1155 | table_text = read_docx_tables(block) 1156 | block_texts.append(table_text) 1157 | if block_texts or block_images: 1158 | block_text = "\n".join(block_texts).strip() 1159 | if block_text or block_images: 1160 | chunks.append( 1161 | Chunk(path=file_path, text=block_text, images=block_images) 1162 | ) 1163 | except Exception as e: 1164 | raise ValueError(f"Error processing DOCX file {file_path}: {e}") 1165 | return chunks 1166 | 1167 | 1168 | def scrape_pptx( 1169 | file_path: str, 1170 | verbose: bool = False, 1171 | include_output_images: bool = True, 1172 | ) -> List[Chunk]: 1173 | from pptx import Presentation 1174 | from pptx.enum.shapes import MSO_SHAPE_TYPE 1175 | from pptx.shapes.picture import Picture 1176 | from pptx.shapes.autoshape import Shape as AutoShape 1177 | 1178 | prs = Presentation(file_path) 1179 | chunks = [] 1180 | # iterate through each slide in the presentation 1181 | for slide in prs.slides: 1182 | slide_texts = [] 1183 | slide_images = [] 1184 | # iterate through each shape in the slide 1185 | for shape in slide.shapes: 1186 | if shape.has_text_frame: 1187 | auto_shape = cast(AutoShape, shape) 1188 | for paragraph in auto_shape.text_frame.paragraphs: 1189 | text = paragraph.text 1190 | if len(slide_texts) == 0: 1191 | text = "# " + text # header for first text of a slide 1192 | slide_texts.append(text) 1193 | # extract images from shapes 1194 | if include_output_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE: 1195 | pic = cast(Picture, shape) 1196 | image_data = pic.image.blob 1197 | image = Image.open(BytesIO(image_data)) 1198 | slide_images.append(image) 1199 | # add slide to chunks if it has text or images 1200 | if slide_texts or slide_images: 1201 | text = "\n".join(slide_texts).strip() 1202 | if not include_output_images: 1203 | slide_images = [] 1204 | chunks.append(Chunk(path=file_path, text=text, images=slide_images)) 1205 | # return all chunks 1206 | return chunks 1207 | 1208 | 1209 | def scrape_ipynb( 1210 | file_path: str, 1211 | verbose: bool = False, 1212 | include_output_images: bool = True, 1213 | ) -> List[Chunk]: 1214 | with open(file_path, "r", encoding="utf-8") as file: 1215 | notebook = json.load(file) 1216 | chunks = [] 1217 | # parse cells in the notebook 1218 | for cell in notebook["cells"]: 1219 | texts = [] 1220 | images: List[Image.Image] = [] 1221 | cell_type = cell["cell_type"] 1222 | # parse cell content based on type 1223 | if verbose: 1224 | print(f"[thepipe] Scraping cell {cell_type} from {file_path}") 1225 | if cell_type == "markdown": 1226 | text = "".join(cell["source"]) 1227 | if include_output_images: 1228 | images = get_images_from_markdown(text) 1229 | texts.append(text) 1230 | elif cell_type == "code": 1231 | source = "".join(cell["source"]) 1232 | texts.append(source) 1233 | output_texts = [] 1234 | # code cells can have outputs 1235 | 
if "outputs" in cell: 1236 | for output in cell["outputs"]: 1237 | if ( 1238 | include_output_images 1239 | and "data" in output 1240 | and "image/png" in output["data"] 1241 | ): 1242 | image_data = output["data"]["image/png"] 1243 | image = Image.open(BytesIO(base64.b64decode(image_data))) 1244 | images.append(image) 1245 | elif "data" in output and "text/plain" in output["data"]: 1246 | output_text = "".join(output["data"]["text/plain"]) 1247 | output_texts.append(output_text) 1248 | if output_texts: 1249 | texts.extend(output_texts) 1250 | elif cell_type == "raw": 1251 | text = "".join(cell["source"]) 1252 | texts.append(text) 1253 | if texts or images: 1254 | text = "\n".join(texts).strip() 1255 | chunks.append(Chunk(path=file_path, text=text, images=images)) 1256 | return chunks 1257 | 1258 | 1259 | def scrape_tweet(url: str, include_output_images: bool = True) -> List[Chunk]: 1260 | """ 1261 | Magic function from https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts 1262 | unofficial, could break at any time 1263 | """ 1264 | 1265 | def get_token(id: str) -> str: 1266 | result = (float(id) / 1e15) * math.pi 1267 | base_36_result = "" 1268 | characters = "0123456789abcdefghijklmnopqrstuvwxyz" 1269 | while result > 0: 1270 | remainder = int(result % (6**2)) 1271 | base_36_result = characters[remainder] + base_36_result 1272 | result = (result - remainder) // (6**2) 1273 | base_36_result = re.sub(r"(0+|\.)", "", base_36_result) 1274 | return base_36_result 1275 | 1276 | tweet_id = url.split("status/")[-1].split("?")[0] 1277 | token = get_token(tweet_id) 1278 | tweet_api_url = "https://cdn.syndication.twimg.com/tweet-result" 1279 | params = {"id": tweet_id, "language": "en", "token": token} 1280 | response = requests.get(tweet_api_url, params=params) 1281 | if response.status_code != 200: 1282 | raise ValueError(f"Failed to fetch tweet. Status code: {response.status_code}") 1283 | tweet_data = response.json() 1284 | # Extract tweet text 1285 | tweet_text = tweet_data.get("text", "") 1286 | # Extract images from tweet 1287 | images: List[Image.Image] = [] 1288 | if include_output_images and "mediaDetails" in tweet_data: 1289 | for media in tweet_data["mediaDetails"]: 1290 | image_url = media.get("media_url_https") 1291 | if image_url: 1292 | image_response = requests.get(image_url) 1293 | img = Image.open(BytesIO(image_response.content)) 1294 | images.append(img) 1295 | # Create chunks for text and images 1296 | chunk = Chunk(path=url, text=tweet_text, images=images) 1297 | return [chunk] 1298 | --------------------------------------------------------------------------------