├── tests ├── __init__.py ├── files │ ├── example.ini │ ├── example.json │ ├── example.txt │ ├── example.csv │ ├── example.docx │ ├── example.h │ ├── example.jpg │ ├── example.mp3 │ ├── example.mp4 │ ├── example.pdf │ ├── example.png │ ├── example.pptx │ ├── example.xlsx │ ├── example.zip │ ├── example_pdf_with_no_extension │ ├── example.py │ ├── example.cpp │ ├── example.css │ ├── example.tsx │ ├── example.ipynb │ ├── example.html │ └── example.md ├── test_extractor.py ├── test_chunker.py ├── test_core.py └── test_scraper.py ├── .gitignore ├── requirements.txt ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── python-ci.yml ├── setup.py ├── thepipe ├── __init__.py ├── extract.py ├── chunker.py ├── core.py └── scraper.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/files/example.ini: -------------------------------------------------------------------------------- 1 | [Greetings] 2 | say = Hello, World! -------------------------------------------------------------------------------- /tests/files/example.json: -------------------------------------------------------------------------------- 1 | { 2 | "greeting": "Hello, World!" 3 | } -------------------------------------------------------------------------------- /tests/files/example.txt: -------------------------------------------------------------------------------- 1 | Just a plain text, 2 | saying Hello, World! -------------------------------------------------------------------------------- /tests/files/example.csv: -------------------------------------------------------------------------------- 1 | num,numtext 2 | 1,one 3 | 2,two 4 | 3,three 5 | 4,four 6 | 5,five -------------------------------------------------------------------------------- /tests/files/example.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.docx -------------------------------------------------------------------------------- /tests/files/example.h: -------------------------------------------------------------------------------- 1 | #ifndef GREET_H 2 | #define GREET_H 3 | 4 | void greet(); 5 | 6 | #endif -------------------------------------------------------------------------------- /tests/files/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.jpg -------------------------------------------------------------------------------- /tests/files/example.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.mp3 -------------------------------------------------------------------------------- /tests/files/example.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.mp4 -------------------------------------------------------------------------------- /tests/files/example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.pdf -------------------------------------------------------------------------------- /tests/files/example.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.png -------------------------------------------------------------------------------- /tests/files/example.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.pptx -------------------------------------------------------------------------------- /tests/files/example.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.xlsx -------------------------------------------------------------------------------- /tests/files/example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example.zip -------------------------------------------------------------------------------- /tests/files/example_pdf_with_no_extension: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emcf/thepipe/HEAD/tests/files/example_pdf_with_no_extension -------------------------------------------------------------------------------- /tests/files/example.py: -------------------------------------------------------------------------------- 1 | class ExampleClass: 2 | def greet(): 3 | print("Hello, World!") 4 | 5 | 6 | ExampleClass.greet() 7 | -------------------------------------------------------------------------------- /tests/files/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | int main() { 5 | cout << "Hello, World!"; 6 | return 0; 7 | } -------------------------------------------------------------------------------- /tests/files/example.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #f0f0f2; 3 | margin: 0; 4 | padding: 0; 5 | font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; 6 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | tests/__pycache__/ 3 | thepipeapi/__pycache__/ 4 | .pytest_cache/ 5 | .vscode/ 6 | outputs/ 7 | logs/ 8 | benchmarks/ 9 | node/node_modules/ 10 | thepipeapi/ctags.exe 11 | thepipe.bat 12 | *.egg-info/ 13 | build/ 14 | dist/ 15 | youtube_temp/ 16 | images/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.12 2 | markdownify==0.12.1 3 | magika>=0.5.0 4 | moviepy==1.0.3 5 | numpy>=1.23 6 | openai>=1.51.0 7 | openpyxl>=3.1 8 | pandas>=2.0 9 | pillow>=9.4 10 | playwright>=1.47.0 11 | pydantic>=2.7 12 | pymupdf4llm==0.0.21 13 | PyMuPDF==1.25.5 14 | python-docx>=1.1 15 | python-dotenv>=1.0 16 | python-pptx>=0.6 17 | pytube>=15.0 18 | requests>=2.31 19 | -------------------------------------------------------------------------------- /tests/files/example.tsx: -------------------------------------------------------------------------------- 1 | // pages/index.tsx 2 | import type { NextPage } from 'next'; 3 | import Head from 'next/head'; 4 | import Image from 'next/image'; 5 | import "../app/globals.css"; 6 | 7 | 
const Home: NextPage = () => { 8 | return ( 9 | <> 10 | The Pipe 11 | 12 | 13 | 14 | ); 15 | }; 16 | 17 | export default Home; -------------------------------------------------------------------------------- /tests/files/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ff774e22", 6 | "metadata": {}, 7 | "source": [ 8 | "**Hello**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "3e261faf", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAIAAAD8GO2jAAAAKElEQVR4nO3NMQEAAAjDMMC/ZzDBvlRA01vZJvwHAAAAAAAAAAAAbx2jxAE/i2AjOgAAAABJRU5ErkJggg==", 20 | "text/plain": [ 21 | "out" 22 | ] 23 | }, 24 | "metadata": {}, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "print('X')" 30 | ] 31 | } 32 | ], 33 | "metadata": {}, 34 | "nbformat": 4, 35 | "nbformat_minor": 5 36 | } 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Emmett McFarlane 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # GitHub recommends pinning actions to a commit SHA. 7 | # To get a newer version, you will need to update the SHA. 8 | # You can also reference a tag or branch, but the action may change without warning. 
9 | 10 | name: Upload Python Package 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | jobs: 17 | deploy: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.x' 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install build 29 | - name: Build package 30 | run: python -m build 31 | - name: Publish package 32 | uses: pypa/gh-action-pypi-publish@release/v1 33 | with: 34 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | def read_requirements(file): 5 | with open(file, encoding="utf-8") as f: 6 | return [line.strip() for line in f if line.strip() and not line.startswith("#")] 7 | 8 | 9 | EXTRAS = { 10 | "audio": ["openai-whisper>=20231117"], 11 | "semantic": ["sentence-transformers>=2.2.2"], 12 | "llama-index": ["llama-index>=0.10.50,<0.11"], 13 | "gpu": [ 14 | "torch>=2.5,<2.6", 15 | "torchvision>=0.20,<0.21", 16 | "torchaudio>=2.5,<2.6", 17 | "sentence-transformers>=2.2.2", 18 | "openai-whisper>=20231117", 19 | ], 20 | } 21 | EXTRAS["all"] = sorted({pkg for deps in EXTRAS.values() for pkg in deps}) 22 | 23 | 24 | setup( 25 | name="thepipe_api", 26 | version="1.7.1", 27 | author="Emmett McFarlane", 28 | author_email="emmett@thepi.pe", 29 | description="Get clean data from tricky documents, powered by VLMs.", 30 | long_description=open("README.md", encoding="utf-8").read(), 31 | long_description_content_type="text/markdown", 32 | url="https://github.com/emcf/thepipe", 33 | packages=find_packages(), 34 | classifiers=[ 35 | "Programming Language :: Python :: 3", 36 | "License :: OSI Approved :: MIT License", 37 | "Operating System :: OS Independent", 38 | ], 39 | python_requires=">=3.9", 40 | install_requires=read_requirements("requirements.txt"), 41 | extras_require=EXTRAS, 42 | include_package_data=True, 43 | entry_points={ 44 | "console_scripts": [ 45 | "thepipe=thepipe.__init__:main", 46 | ], 47 | }, 48 | ) 49 | -------------------------------------------------------------------------------- /.github/workflows/python-ci.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | timeout-minutes: 30 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-node@v4 20 | with: 21 | node-version: 18 22 | - name: Install playwright 23 | run: | 24 | npm install playwright 25 | npm install playwright-core 26 | - name: Install Playwright 27 | run: npx playwright install --with-deps 28 | - name: Set up Python 3.10 29 | uses: actions/setup-python@v3 30 | with: 31 | python-version: "3.10" 32 | - name: Install FFmpeg 33 | run: sudo apt-get update && sudo apt-get install -y ffmpeg 34 | - name: Install pip dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | pip install flake8 coverage 38 | pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu 39 | pip install -e .[all] 40 | - name: Install playwright browsers 41 | run: python -m playwright install 42 | - name: Lint with flake8 43 | run: | 44 
| # stop the build if there are Python syntax errors or undefined names 45 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 46 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 47 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 48 | - name: Test with unittest and generate coverage report 49 | env: 50 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 51 | run: | 52 | coverage run -m unittest discover 53 | coverage xml -i 54 | - name: Upload coverage reports to Codecov 55 | uses: codecov/codecov-action@v4.0.1 56 | with: 57 | token: ${{ secrets.CODECOV_TOKEN }} 58 | slug: emcf/thepipe 59 | -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | # test_extractor.py 2 | 3 | import unittest 4 | import sys 5 | import os 6 | import json 7 | from openai import OpenAI 8 | 9 | sys.path.append("..") 10 | from thepipe.extract import extract, extract_json_from_response 11 | from thepipe.core import Chunk 12 | 13 | 14 | class TestExtractor(unittest.TestCase): 15 | def setUp(self): 16 | self.example_receipt = """# Receipt 17 | Store Name: Grocery Mart 18 | ## Total 19 | Subtotal: $13.49 USD 20 | Tax (8%): $1.08 USD 21 | Total: $14.57 USD 22 | """ 23 | 24 | self.schema = { 25 | "store_name": "string", 26 | "subtotal_usd": "float", 27 | "tax_usd": "float", 28 | "total_usd": "float", 29 | } 30 | 31 | self.chunks = [Chunk(path="receipt.md", text=self.example_receipt)] 32 | 33 | def test_extract_json_from_response(self): 34 | # List of test cases with expected results 35 | test_cases = [ 36 | # Case 1: JSON enclosed in triple backticks 37 | { 38 | "input": '```json\n{"key1": "value1", "key2": 2}\n```', 39 | "expected": {"key1": "value1", "key2": 2}, 40 | }, 41 | # Case 2: JSON directly in the response 42 | { 43 | "input": '{"key1": "value1", "key2": 2}', 44 | "expected": {"key1": "value1", "key2": 2}, 45 | }, 46 | # Case 3: Response contains multiple JSON objects 47 | { 48 | "input": 'Random text {"key1": "value1"} and another {"key2": 2}', 49 | "expected": [{"key1": "value1"}, {"key2": 2}], 50 | }, 51 | # Case 4: Response contains incomplete JSON 52 | { 53 | "input": 'Random text {"key1": "value1"} and another {"key2": 2', 54 | "expected": {"key1": "value1"}, 55 | }, 56 | ] 57 | 58 | for i, case in enumerate(test_cases): 59 | with self.subTest(i=i): 60 | result = extract_json_from_response(case["input"]) 61 | self.assertEqual(result, case["expected"]) 62 | 63 | def test_extract(self): 64 | # provide an explicit client so we cover the new parameter 65 | client = OpenAI() 66 | 67 | results, total_tokens_used = extract( 68 | chunks=self.chunks, # receipt 69 | schema=self.schema, 70 | openai_client=client, 71 | ) 72 | 73 | # Check if we got a result 74 | self.assertEqual(len(results), 1) 75 | result = results[0] 76 | 77 | print("test_extract result:", json.dumps(result, indent=2)) 78 | 79 | # Check if all expected fields are present 80 | expected_fields = ["store_name", "subtotal_usd", "tax_usd", "total_usd"] 81 | for field in expected_fields: 82 | self.assertIn(field, result) 83 | 84 | # Check some specific values 85 | self.assertEqual(result["store_name"], "Grocery Mart") 86 | self.assertEqual(result["subtotal_usd"], 13.49) 87 | self.assertEqual(result["tax_usd"], 1.08) 88 | self.assertEqual(result["total_usd"], 14.57) 89 | 90 | # Check if tokens were used 91 | 
self.assertGreater(total_tokens_used, 0) 92 | 93 | 94 | if __name__ == "__main__": 95 | unittest.main() 96 | -------------------------------------------------------------------------------- /tests/files/example.html: -------------------------------------------------------------------------------- HTML with Tables and JavaScript

Heading 1

Heading 2

Heading 3

This is a paragraph with some bold text and some italic text. You can also use strong text for emphasis.

Here is a simple unordered list:

And an ordered list:

1. First step
2. Second step
3. Third step

Sample Table

Name            Age   Country
John Doe        28    USA
Jane Smith      34    Canada
Alice Johnson   29    UK

Another Table Example

Product    Price   Quantity
Apples     $2      10
Bananas    $1      15
Cherries   $5      7
125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /tests/files/example.md: -------------------------------------------------------------------------------- 1 | ## 4.1. Driven Hydrodynamic Turbulence 2 | 3 | Figure 1 shows the mean-Mach relation found from timeaveraged PDFs over the full domain for driven hydro and driven MHD turbulence. We find that a value of in the func- a p 0.5 tion gives the best linear relations for the hydro case. y(M) 4 | For the volume fraction we find 5 | 6 | $$\mu_{V}=\,-0.36\,\mathrm{ln}\,[1\,+0.5{\mathcal M}^{2}]\,+0.10,$$ 7 | 8 | while for the mass fraction we find 9 | 10 | $$\mu_{M}=0.32\ln\,[1+0.5{\mathcal{M}}^{2}]-0.10.$$ 11 | 12 | The rms residuals for these fits are and 3 8.9 \# 10 6.5 \# 13 | , respectively. Because the density fluctuations in subsonic 3 10 turbulence are not produced by shocks, we have no reason to expect these relations to approach zero with Mach number. The mean-Mach pairs from the time-averaged PDFs fall very close to these relations. Over the full range of Mach numbers tested, however, the time-averaged means are smaller than those found by P97 ( ). To determine the 2 m p 0.5 ln [1 - 0.25M ] V,M 14 | magnitude of the effect that the driving may have had on the relations, we also compare values determined from hydrodynamic turbulence with (not shown), finding that k L pk /2p p 4 these points fall very close to the relations as k L pk /2p p 2 well. 15 | 16 | Also shown in the figure are the instantaneous mean-Mach pairs found from each of the eight subboxes. The scatter in these points, illustrated by 1 j error bars on the plot, is quite significant. The ensemble average of the 56 points for each run falls at a slightly lower Mach number than the value found from the time average over the full domain since the Mach number typically decreases on smaller scales. These averages differ slightly from those computed from time-averaged fulldomain PDFs, although they still fall close to the relations found. The 1 j scatter in the Mach number is 4%–8%, while in the sub-PDF means it is 16%–17% for the lowest Mach number run and 10%–14% for the remaining runs. This scatter puts some of the instantaneous sub-PDF values in the vicinity of the P97 relation. 17 | 18 | The scatter in and can be converted to scatter in the m m V M 19 | median density within the medium. For the ratio of the median to mean density within a cloud, , we can define and r rr ˜ ˜˜ 20 | for clouds where m is 1 j above or below the mean. The ratios r˜˜ ˜ ˜ /r p exp (2j ) and then gen- r /r p exp (2j ) V,- V, V,S M,- M, M,S 21 | erally increase with Mach number, ranging from 1.0 to 1.3. 22 | 23 | $4.2\cdot A$ . 24 | 25 | ## 4.2. Driven Mhd Turbulence 26 | 27 | Figure 1 also includes the mean-Mach relation for driven strong-field MHD turbulence, where we continue to use a p 0.5. The mean-Mach pairs from the time-averaged PDFs again fall very close to these relations, 28 | 29 | $$\mu_{v}=\,-0.29\,\ln\,[1\,+\,0.5\mathcal{M}^{2}]-0.06$$ 30 | for the volume fraction and 31 | $$\mu_{M}=0.28\ln\,[1+0.5{\mathcal{M}}^{2}]+0.07$$ 32 | $$(S)$$ 33 | 34 | ![0_image_0.png](0_image_0.png) 35 | 36 | $$(6)$$ 37 | 38 | found by P97. The rms residuals for these fits are 1.1 \# 39 | and , respectively. 2 2 10 1.6 \# 10 The instantaneous mean-Mach pairs found from subboxes have more scatter with a strong magnetic field than they did in the purely hydrodynamic case. 
The 1 j scatter in the Mach number is 5%–8%, while for the sub-PDF means it is 22%– 40 | 24% for the lowest Mach number run and 13%–18% for the remaining runs. Converting this to scatter in the median density, we find that the ratios and range from 1.1 r˜˜ ˜ ˜ V,- /r r V, M,- /rM, 41 | to 1.3. This again puts some of the instantaneous sub-PDF values in the vicinity of the P97 mean-Mach relation. 42 | 43 | The time-averaged mean-Mach relations found for hydro and MHD differ, as one should expect due to differences in the shock-jump conditions. However, the sub-PDF values overlap substantially, making them difficult to distinguish observationally. The time-averaged relations fall a bit less than 1 j (computed from the sub-PDF values) apart. 44 | 45 | ## 4.3. Decaying Mhd Turbulence 46 | 47 | $$\left(7\right)$$ 48 | $$({\mathfrak{s}})$$ 49 | 50 | As it seems likely that molecular clouds are transient entities, it may be more appropriate to study the PDF of decaying turbulence. In Figure 2, we show the evolution of the PDF in a decaying strong-field MHD turbulence run, initialized from a snapshot of fully developed turbulence from our highest Mach number driven run. Although this snapshot has a full-domain PDF mean roughly 1 j more extreme than the time-averaged driven relations, this should not affect the results. At first the small change in mean as the Mach number decreases causes a shallower slope than that of the driven relation. Once the mean begins to change appreciably, however, the slope becomes much steeper, crossing the driven relation at roughly M p 4.5. Although the slope shallows as low Mach numbers are reached, it does so only as the means become very small. 51 | -------------------------------------------------------------------------------- /thepipe/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import os 5 | import warnings 6 | from typing import Optional 7 | 8 | from openai import OpenAI 9 | 10 | from .scraper import scrape_directory, scrape_file, scrape_url 11 | from .core import DEFAULT_AI_MODEL, save_outputs 12 | 13 | 14 | # Argument parsing 15 | def parse_arguments() -> argparse.Namespace: # noqa: D401 – imperative is fine here 16 | """ 17 | Parse CLI flags. 18 | 19 | Returns 20 | ------- 21 | argparse.Namespace 22 | Parsed arguments. 23 | """ 24 | parser = argparse.ArgumentParser( 25 | prog="thepipe", 26 | description="Universal document/Web scraper with optional OpenAI extraction.", 27 | ) 28 | 29 | # Required source (file, directory, or URL) 30 | parser.add_argument( 31 | "source", 32 | help="File path, directory, or URL to scrape.", 33 | ) 34 | 35 | # Optional flags 36 | parser.add_argument( 37 | "-i", 38 | "--inclusion-pattern", 39 | dest="inclusion_pattern", 40 | default=None, 41 | help="Regex pattern – only files whose *full path* matches are scraped " 42 | "(applies to directory/zip scraping).", 43 | ) 44 | parser.add_argument( 45 | "-v", 46 | "--verbose", 47 | action="store_true", 48 | help="Enable verbose logging.", 49 | ) 50 | parser.add_argument( 51 | "--text-only", 52 | dest="text_only", 53 | action="store_true", 54 | help="Suppress images – output only extracted text.", 55 | ) 56 | 57 | # OpenAI-related flags 58 | parser.add_argument( 59 | "--openai-api-key", 60 | dest="openai_api_key", 61 | default=os.getenv("OPENAI_API_KEY"), 62 | help="OpenAI API key. 
If omitted, env variable OPENAI_API_KEY is used.", 63 | ) 64 | parser.add_argument( 65 | "--openai-base-url", 66 | dest="openai_base_url", 67 | default="https://api.openai.com/v1", 68 | help="Base URL for the OpenAI API (default: https://api.openai.com/v1).", 69 | ) 70 | parser.add_argument( 71 | "--openai-model", 72 | dest="openai_model", 73 | default=DEFAULT_AI_MODEL, 74 | help=f"Chat/VLM model to use (default: {DEFAULT_AI_MODEL}).", 75 | ) 76 | 77 | # Legacy flag (will be removed in future versions) 78 | parser.add_argument( 79 | "--ai-extraction", 80 | action="store_true", 81 | help=argparse.SUPPRESS, # hidden but still accepted 82 | ) 83 | 84 | return parser.parse_args() 85 | 86 | 87 | # OpenAI client factory 88 | def create_openai_client( 89 | *, 90 | api_key: Optional[str], 91 | base_url: str, 92 | enable_vlm: bool, 93 | ) -> Optional[OpenAI]: 94 | if api_key: 95 | # Normal path – user gave an explicit key 96 | return OpenAI(api_key=api_key, base_url=base_url) 97 | 98 | if enable_vlm: 99 | # Old flag: fall back to env vars 100 | warnings.warn( 101 | "--ai-extraction is deprecated; " 102 | "please use --openai-api-key and --openai-model " 103 | "(and optionally --openai-base-url) instead.", 104 | DeprecationWarning, 105 | stacklevel=2, 106 | ) 107 | return OpenAI(base_url=base_url, api_key=os.getenv("OPENAI_API_KEY")) 108 | 109 | # AI extraction disabled 110 | return None 111 | 112 | 113 | def main() -> None: 114 | """CLI entry point""" 115 | args = parse_arguments() 116 | 117 | # Instantiate the OpenAI client if requested 118 | openai_client = create_openai_client( 119 | api_key=args.openai_api_key, 120 | base_url=args.openai_base_url, 121 | enable_vlm=args.ai_extraction, 122 | ) 123 | 124 | # Delegate scraping based on source type 125 | if args.source.startswith(("http://", "https://")): 126 | chunks = scrape_url( 127 | args.source, 128 | verbose=args.verbose, 129 | openai_client=openai_client, 130 | model=args.openai_model, 131 | ) 132 | elif os.path.isdir(args.source): 133 | chunks = scrape_directory( 134 | dir_path=args.source, 135 | inclusion_pattern=args.inclusion_pattern, 136 | verbose=args.verbose, 137 | openai_client=openai_client, 138 | ) 139 | elif os.path.isfile(args.source): 140 | chunks = scrape_file( 141 | filepath=args.source, 142 | verbose=args.verbose, 143 | openai_client=openai_client, 144 | model=args.openai_model, 145 | ) 146 | else: 147 | raise ValueError(f"Invalid source: {args.source}") 148 | 149 | # Persist results 150 | save_outputs( 151 | chunks=chunks, 152 | verbose=args.verbose, 153 | text_only=args.text_only, 154 | output_folder="thepipe_output", 155 | ) 156 | 157 | if args.verbose: 158 | print(f"Scraping complete. 
Outputs saved to 'thepipe_output/'.") 159 | 160 | 161 | # Entry-point shim 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /tests/test_chunker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import sys 4 | from typing import List, cast 5 | 6 | try: 7 | from openai import OpenAI 8 | except ImportError: # pragma: no cover - optional in tests 9 | OpenAI = None # type: ignore[assignment] 10 | 11 | try: 12 | import sentence_transformers # noqa: F401 # pragma: no cover - optional import 13 | 14 | HAS_SENTENCE_TRANSFORMERS = True 15 | except ImportError: # pragma: no cover - optional import 16 | HAS_SENTENCE_TRANSFORMERS = False 17 | 18 | sys.path.append("..") 19 | import thepipe.chunker as chunker 20 | from thepipe.core import Chunk, calculate_tokens 21 | 22 | 23 | class test_chunker(unittest.TestCase): 24 | def setUp(self): 25 | self.files_directory = os.path.join(os.path.dirname(__file__), "files") 26 | self.example_markdown_path = os.path.join(self.files_directory, "example.md") 27 | self.max_tokens_per_chunk = ( 28 | 10 # Define an arbitrary max tokens per chunk for testing 29 | ) 30 | 31 | def read_markdown_file(self, file_path: str) -> List[Chunk]: 32 | with open(file_path, "r") as f: 33 | text = f.read() 34 | return [Chunk(path=file_path, text=text)] 35 | 36 | def test_chunk_by_keywords(self): 37 | text = "Intro line\nfoo starts here\nmiddle\nbar next\nend" 38 | chunk = Chunk(path="doc.md", text=text) 39 | 40 | result = chunker.chunk_by_keywords([chunk], keywords=["foo", "bar"]) 41 | # There are 3 segments: before foo, between foo and bar, after bar 42 | self.assertEqual(len(result), 3) 43 | 44 | # 1st chunk: only the intro 45 | t0 = cast(str, result[0].text) 46 | self.assertIn("Intro line", t0) 47 | self.assertNotIn("foo", t0.lower()) 48 | 49 | # 2nd chunk: starts with foo, includes 'middle' 50 | t1 = cast(str, result[1].text).lower() 51 | self.assertIn("foo starts here", t1) 52 | self.assertIn("middle", t1) 53 | 54 | # 3rd chunk: starts with bar, includes 'end' 55 | t2 = cast(str, result[2].text).lower() 56 | self.assertIn("bar next", t2) 57 | self.assertIn("end", t2) 58 | 59 | @unittest.skipIf(OpenAI is None or not os.getenv("OPENAI_API_KEY"), "OpenAI API key required") 60 | def test_chunk_agentic(self): 61 | openai_client = OpenAI() 62 | chunks = self.read_markdown_file(self.example_markdown_path) 63 | chunked_agentic = chunker.chunk_agentic(chunks, openai_client=openai_client) 64 | # Verify the output 65 | self.assertIsInstance(chunked_agentic, list) 66 | self.assertGreater(len(chunked_agentic), 0) 67 | # verify there are 3 chunks corresponding to the sections in the example markdown 68 | self.assertEqual(len(chunked_agentic), 3) 69 | # Verify the output contains chunks with text or images 70 | for chunk in chunked_agentic: 71 | self.assertIsInstance(chunk, Chunk) 72 | self.assertTrue(chunk.text or chunk.images) 73 | 74 | def test_chunk_by_length(self): 75 | chunks = self.read_markdown_file(self.example_markdown_path) 76 | chunked_length = chunker.chunk_by_length( 77 | chunks, max_tokens=self.max_tokens_per_chunk 78 | ) 79 | # Verify the output 80 | self.assertIsInstance(chunked_length, list) 81 | self.assertGreater(len(chunked_length), 0) 82 | for chunk in chunked_length: 83 | self.assertIsInstance(chunk, Chunk) 84 | # Verify that the chunk text or images are not none 85 | self.assertTrue(chunk.text or chunk.images) 86 | # 
assert length of text is less than max_tokens 87 | if chunk.text or chunk.images: 88 | self.assertLessEqual( 89 | calculate_tokens([chunk]), self.max_tokens_per_chunk 90 | ) 91 | 92 | @unittest.skipUnless(HAS_SENTENCE_TRANSFORMERS, "sentence-transformers extra is not installed") 93 | def test_chunk_semantic(self): 94 | test_sentence = "Computational astrophysics. Numerical astronomy. Bananas." 95 | chunks = [Chunk(text=test_sentence)] 96 | chunked_semantic = chunker.chunk_semantic( 97 | chunks, 98 | model="sentence-transformers/all-MiniLM-L6-v2", 99 | buffer_size=2, 100 | similarity_threshold=0.5, 101 | ) 102 | # verify the output 103 | self.assertIsInstance(chunked_semantic, list) 104 | self.assertGreater(len(chunked_semantic), 0) 105 | # verify it split into ['Computational astrophysics.\nNumerical astronomy.', 'Bananas.'] 106 | self.assertEqual(len(chunked_semantic), 2) 107 | self.assertEqual( 108 | chunked_semantic[0].text, 109 | "Computational astrophysics.\nNumerical astronomy.", 110 | ) 111 | self.assertEqual( 112 | chunked_semantic[1].text, 113 | "Bananas.", 114 | ) 115 | 116 | def test_chunk_by_page(self): 117 | chunks = self.read_markdown_file(self.example_markdown_path) 118 | chunked_pages = chunker.chunk_by_page(chunks) 119 | # Verify the output 120 | self.assertIsInstance(chunked_pages, list) 121 | self.assertGreater(len(chunked_pages), 0) 122 | for chunk in chunked_pages: 123 | self.assertIsInstance(chunk, Chunk) 124 | self.assertTrue(chunk.text or chunk.images) 125 | 126 | def test_chunk_by_section(self): 127 | chunks = self.read_markdown_file(self.example_markdown_path) 128 | chunked_sections = chunker.chunk_by_section(chunks) 129 | self.assertIsInstance(chunked_sections, list) 130 | self.assertEqual(len(chunked_sections), 3) 131 | # Verify the output contains chunks with text or images 132 | for chunk in chunked_sections: 133 | self.assertIsInstance(chunk, Chunk) 134 | self.assertTrue(chunk.text or chunk.images) 135 | 136 | def test_chunk_by_section_first_line_and_custom_separator(self): 137 | # Default separator, with first line as a header 138 | text1 = "## Alpha\nContent A\n## Beta\nContent B" 139 | chunk1 = Chunk(text=text1) 140 | out1 = chunker.chunk_by_section([chunk1]) 141 | self.assertEqual(len(out1), 2) 142 | 143 | # cast .text to str so Pylance stops complaining 144 | t0 = cast(str, out1[0].text) 145 | self.assertIn("Alpha", t0) 146 | 147 | t1 = cast(str, out1[1].text) 148 | self.assertIn("Beta", t1) 149 | 150 | # Custom separator "### " 151 | text2 = "### One\nX\n### Two\nY" 152 | chunk2 = Chunk(text=text2) 153 | out2 = chunker.chunk_by_section([chunk2], section_separator="### ") 154 | self.assertEqual(len(out2), 2) 155 | 156 | o0 = cast(str, out2[0].text) 157 | self.assertIn("One", o0) 158 | 159 | o1 = cast(str, out2[1].text) 160 | self.assertIn("Two", o1) 161 | 162 | def test_chunk_by_document(self): 163 | chunks = self.read_markdown_file(self.example_markdown_path) 164 | chunked_documents = chunker.chunk_by_document(chunks) 165 | self.assertIsInstance(chunked_documents, list) 166 | self.assertEqual(len(chunked_documents), 1) 167 | # Verify the output contains chunks with text or images 168 | chunk = chunked_documents[0] 169 | self.assertIsInstance(chunk, Chunk) 170 | self.assertTrue(chunk.text or chunk.images) 171 | 172 | 173 | if __name__ == "__main__": 174 | unittest.main() 175 | -------------------------------------------------------------------------------- /tests/test_core.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import base64 3 | import shutil 4 | from typing import List, cast 5 | import unittest 6 | import os 7 | import sys 8 | 9 | sys.path.append("..") 10 | import thepipe.core as core 11 | import thepipe.scraper as scraper 12 | from PIL import Image 13 | from io import BytesIO 14 | 15 | 16 | class test_core(unittest.TestCase): 17 | def setUp(self): 18 | self.files_directory = os.path.join(os.path.dirname(__file__), "files") 19 | self.outputs_directory = "outputs" 20 | 21 | def tearDown(self): 22 | # clean up outputs 23 | if os.path.exists(self.outputs_directory): 24 | for file in os.listdir(self.outputs_directory): 25 | os.remove(os.path.join(self.outputs_directory, file)) 26 | os.rmdir(self.outputs_directory) 27 | 28 | def test_make_image_url_data_and_max_resolution(self): 29 | img = Image.new("RGB", (100, 50), color="purple") 30 | # max_resolution scales down before encoding 31 | url = core.make_image_url(img, host_images=False, max_resolution=25) 32 | self.assertTrue(url.startswith("data:image/jpeg;base64,")) 33 | b64 = url.split(",", 1)[1] 34 | img2 = Image.open(BytesIO(base64.b64decode(b64))) 35 | self.assertTrue(max(img2.size) <= 25) 36 | 37 | def test_make_image_url_host_images(self): 38 | 39 | # clear any old images 40 | if os.path.isdir("images"): 41 | shutil.rmtree("images") 42 | 43 | # override host for predictability 44 | import thepipe.core as core 45 | 46 | core.HOST_URL = "http://test-host" 47 | 48 | img = Image.new("RGB", (10, 10), color="orange") 49 | url = core.make_image_url(img, host_images=True) 50 | 51 | # URL should point to our HOST_URL 52 | self.assertTrue(url.startswith("http://test-host/images/")) 53 | 54 | # extract the image_id from the URL 55 | image_id = url.rsplit("/", 1)[-1] 56 | 57 | # confirm that exact file exists on disk 58 | self.assertTrue(os.path.exists(os.path.join("images", image_id))) 59 | 60 | def test_calculate_image_and_mixed_tokens(self): 61 | small = Image.new("RGB", (256, 256)) 62 | self.assertEqual(core.calculate_image_tokens(small, detail="auto"), 85) 63 | large = Image.new("RGB", (2048, 2048)) 64 | high = core.calculate_image_tokens(large, detail="high") 65 | self.assertGreater(high, 85) 66 | 67 | # Mixed text+image chunk 68 | txt = core.Chunk(text="abcd") # 4 chars → 1 token 69 | img = core.Chunk(images=[small]) # 85 tokens 70 | total = core.calculate_tokens([txt, img]) 71 | self.assertEqual(total, 1 + 85) 72 | 73 | def test_chunk_to_message_variants(self): 74 | img = Image.new("RGB", (5, 5)) 75 | chunk = core.Chunk(path="f.md", text="![alt](foo.png)\nHello", images=[img]) 76 | 77 | # text_only=True → no image_url entries 78 | msg1 = chunk.to_message(text_only=True) 79 | self.assertEqual(len(msg1["content"]), 1) 80 | self.assertEqual(msg1["content"][0]["type"], "text") 81 | 82 | # host_images & include_paths 83 | core.HOST_URL = "http://host" 84 | msg2 = chunk.to_message(host_images=True, include_paths=True) 85 | # First content block should include the wrapper 86 | self.assertIn('', msg2["content"][0]["text"]) 87 | # There must be at least one image_url entry 88 | self.assertTrue(any(item["type"] == "image_url" for item in msg2["content"])) 89 | 90 | def test_json_roundtrip(self): 91 | img = Image.new("RGB", (2, 2)) 92 | chunk = core.Chunk(path="p", text="T", images=[img]) 93 | data = chunk.to_json() 94 | chunk2 = core.Chunk.from_json(data) 95 | 96 | self.assertEqual(chunk2.path, "p") 97 | self.assertEqual(chunk2.text, "T") 98 | 99 | 
images = cast(List[Image.Image], chunk2.images) 100 | self.assertIsInstance(images, list) 101 | self.assertEqual(len(images), 1) 102 | 103 | @unittest.skipUnless(core.has_llama_index(), "llama-index extra is not installed") 104 | def test_chunk_to_llamaindex(self): 105 | chunk = core.Chunk( 106 | path="example.md", 107 | text="This is a coloured image", 108 | images=[Image.new("RGB", (32, 32), color="red")], 109 | ) 110 | llama_index_document = chunk.to_llamaindex() 111 | self.assertEqual(type(llama_index_document), list) 112 | self.assertEqual(len(llama_index_document), 1) 113 | self.assertEqual(type(llama_index_document[0]), core.ImageDocument) 114 | 115 | def test_chunks_to_messages(self): 116 | chunks = scraper.scrape_file(filepath=self.files_directory + "/example.md") 117 | messages = core.chunks_to_messages(chunks) 118 | self.assertEqual(type(messages), list) 119 | for message in messages: 120 | self.assertEqual(type(message), dict) 121 | self.assertIn("role", message) 122 | self.assertIn("content", message) 123 | # test chunks_to_messages with path included 124 | messages = core.chunks_to_messages(chunks, include_paths=True) 125 | for message in messages: 126 | self.assertIn("example.md", message["content"][0]["text"]) 127 | 128 | def test_save_outputs_text_only_and_with_images(self): 129 | # Text-only 130 | c = core.Chunk(path="x.txt", text="XYZ") 131 | core.save_outputs([c], text_only=True, output_folder="outputs") 132 | self.assertTrue(os.path.exists("outputs/prompt.txt")) 133 | files = os.listdir("outputs") 134 | self.assertEqual(files, ["prompt.txt"]) 135 | shutil.rmtree("outputs") 136 | 137 | # With image 138 | img = Image.new("RGB", (10, 10)) 139 | c2 = core.Chunk(path="y", text="TXT", images=[img]) 140 | core.save_outputs([c2], text_only=False, output_folder="outputs") 141 | files = os.listdir("outputs") 142 | self.assertIn("prompt.txt", files) 143 | self.assertTrue(any(f.endswith(".jpg") for f in files)) 144 | 145 | def test_chunk_json(self): 146 | example_image_path = os.path.join(self.files_directory, "example.jpg") 147 | image = Image.open(example_image_path) 148 | chunk = core.Chunk(path="example.md", text="Hello, World!") 149 | # convert to json 150 | chunk_json = chunk.to_json() 151 | # verify it is a dictionary with the expected items 152 | self.assertEqual(type(chunk_json), dict) 153 | self.assertIn("text", chunk_json) 154 | self.assertIn("path", chunk_json) 155 | # convert back 156 | chunk = core.Chunk.from_json(chunk_json) 157 | # verify it is the correct Chunk object 158 | self.assertEqual(type(chunk), core.Chunk) 159 | self.assertEqual(chunk.path, "example.md") 160 | self.assertEqual(chunk.text, "Hello, World!") 161 | 162 | def test_calculate_tokens(self): 163 | text = "Hello, World!" 
164 | tokens = core.calculate_tokens([core.Chunk(text=text)]) 165 | self.assertAlmostEqual(tokens, 3.25, places=0) 166 | 167 | def test_calculate_image_tokens(self): 168 | image = Image.open(os.path.join(self.files_directory, "example.jpg")) 169 | image.load() # needed to close the file 170 | tokens = core.calculate_image_tokens(image, detail="auto") 171 | self.assertAlmostEqual(tokens, 85, places=0) 172 | tokens = core.calculate_image_tokens(image, detail="low") 173 | self.assertAlmostEqual(tokens, 85, places=0) 174 | tokens = core.calculate_image_tokens(image, detail="high") 175 | self.assertAlmostEqual(tokens, 595, places=0) 176 | 177 | def test_make_image_url(self): 178 | image = Image.open(os.path.join(self.files_directory, "example.jpg")) 179 | image.load() # needed to close the file 180 | url = core.make_image_url(image, host_images=False) 181 | # verify it is in the correct format 182 | self.assertTrue(url.startswith("data:image/jpeg;base64,")) 183 | # verify it decodes correctly 184 | remove_prefix = url.replace("data:image/jpeg;base64,", "") 185 | image_data = base64.b64decode(remove_prefix) 186 | image = Image.open(BytesIO(image_data)) 187 | self.assertEqual(image.format, "JPEG") 188 | # verify it hosts the image correctly 189 | url = core.make_image_url(image, host_images=True) 190 | self.assertTrue(url.startswith(core.HOST_URL)) 191 | -------------------------------------------------------------------------------- /thepipe/extract.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | import json 3 | import re 4 | from typing import Iterable, List, Dict, Union, Optional, Tuple, Callable, cast 5 | from .core import ( 6 | Chunk, 7 | calculate_tokens, 8 | DEFAULT_AI_MODEL, 9 | ) 10 | from .scraper import scrape_url, scrape_file 11 | from .chunker import ( 12 | chunk_by_page, 13 | chunk_by_document, 14 | chunk_by_section, 15 | chunk_semantic, 16 | chunk_by_keywords, 17 | chunk_by_length, 18 | chunk_agentic, 19 | ) 20 | import requests 21 | import os 22 | from openai import OpenAI 23 | from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam 24 | 25 | DEFAULT_EXTRACTION_PROMPT = "Extract all the information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but always fill in every column as best you can. Always immediately return valid JSON. You must extract ALL the information available in the entire document." 
26 | 27 | 28 | def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]: 29 | def clean_response_text(llm_response: str) -> str: 30 | return llm_response.encode("utf-8", "ignore").decode("utf-8").strip() 31 | 32 | code_block_pattern = r"^```(?:json)?\s*([\s\S]*?)\s*```$" 33 | match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL) 34 | if match: 35 | llm_response = match.group(1) 36 | llm_response = clean_response_text(llm_response) 37 | 38 | try: 39 | parsed_json = json.loads(llm_response) 40 | return parsed_json 41 | except json.JSONDecodeError: 42 | json_pattern = r"($$[\s\S]*$$|\{[\s\S]*\})" 43 | match = re.search(json_pattern, llm_response) 44 | if match: 45 | try: 46 | parsed_json = json.loads(match.group(1)) 47 | return parsed_json 48 | except json.JSONDecodeError: 49 | pass 50 | 51 | objects = re.findall(r"\{[^{}]*\}", llm_response) 52 | if objects: 53 | valid_objects = [] 54 | for obj in objects: 55 | try: 56 | obj = obj.replace("\\", "").replace('\\"', '"') 57 | valid_objects.append(json.loads(obj)) 58 | except json.JSONDecodeError: 59 | continue 60 | if valid_objects: 61 | return valid_objects if len(valid_objects) > 1 else valid_objects[0] 62 | print(f"[thepipe] Failed to extract valid JSON from LLM response: {llm_response}") 63 | return None 64 | 65 | 66 | def extract_from_chunk( 67 | chunk: Chunk, 68 | chunk_index: int, 69 | schema: str, 70 | ai_model: str, 71 | source: str, 72 | multiple_extractions: bool, 73 | extraction_prompt: str, 74 | host_images: bool, 75 | openai_client: OpenAI, 76 | ) -> Tuple[Dict, int]: 77 | response_dict = {"chunk_index": chunk_index, "source": source} 78 | tokens_used = 0 79 | try: 80 | corrected_extraction_prompt = extraction_prompt.replace("{schema}", schema) 81 | if multiple_extractions: 82 | corrected_extraction_prompt += """\nIf there are multiple extractions, return each JSON dictionary in a list under the key "extraction". The list should contain each extraction dict (according to the schema) and the entire list should be set to the "extraction" key. 
Immediately return this extraction JSON object with the "extraction" key mapping to a list containing all the extracted data.""" 83 | else: 84 | corrected_extraction_prompt += ( 85 | """\nImmediately return the JSON dictionary.""" 86 | ) 87 | 88 | messages = [ 89 | chunk.to_message(host_images=host_images), 90 | { 91 | "role": "user", 92 | "content": corrected_extraction_prompt, 93 | }, 94 | ] 95 | 96 | response = openai_client.chat.completions.create( 97 | model=ai_model, 98 | messages=cast(Iterable[ChatCompletionMessageParam], messages), 99 | response_format={"type": "json_object"}, 100 | ) 101 | llm_response = response.choices[0].message.content 102 | if not llm_response: 103 | raise Exception( 104 | f"Failed to receive a message content from LLM Response: {response}" 105 | ) 106 | input_tokens = calculate_tokens([chunk]) 107 | output_tokens = calculate_tokens([Chunk(text=llm_response)]) 108 | tokens_used += input_tokens + output_tokens 109 | try: 110 | llm_response_dict = extract_json_from_response(llm_response) 111 | if llm_response_dict: 112 | if multiple_extractions: 113 | if ( 114 | isinstance(llm_response_dict, dict) 115 | and "extraction" in llm_response_dict 116 | ): 117 | response_dict["extraction"] = llm_response_dict["extraction"] 118 | elif isinstance(llm_response_dict, list): 119 | response_dict["extraction"] = llm_response_dict 120 | else: 121 | response_dict["extraction"] = [llm_response_dict] 122 | else: 123 | if isinstance(llm_response_dict, dict): 124 | response_dict.update(llm_response_dict) 125 | elif isinstance(llm_response_dict, list): 126 | response_dict["error"] = ( 127 | f"Expected a single JSON object but received a list: {llm_response_dict}. Try enabling multiple extractions." 128 | ) 129 | else: 130 | response_dict["error"] = ( 131 | f"Invalid JSON structure in LLM response: {llm_response_dict}" 132 | ) 133 | else: 134 | response_dict["error"] = ( 135 | f"Failed to extract valid JSON from LLM response: {llm_response}" 136 | ) 137 | except Exception as e: 138 | response_dict["error"] = f"Error processing LLM response: {e}" 139 | if not multiple_extractions: 140 | schema_keys = ( 141 | json.loads(schema).keys() if isinstance(schema, str) else schema.keys() 142 | ) 143 | for key in schema_keys: 144 | if key not in response_dict: 145 | response_dict[key] = None 146 | except Exception as e: 147 | response_dict = {"chunk_index": chunk_index, "source": source, "error": str(e)} 148 | return response_dict, tokens_used 149 | 150 | 151 | def extract( 152 | chunks: List[Chunk], 153 | schema: Union[str, Dict], 154 | model: str = DEFAULT_AI_MODEL, 155 | multiple_extractions: bool = False, 156 | extraction_prompt: str = DEFAULT_EXTRACTION_PROMPT, 157 | host_images: bool = False, 158 | openai_client: Optional[OpenAI] = None, 159 | ) -> Tuple[List[Dict], int]: 160 | print( 161 | f"[thepipe] Extract functions will be deprecated in future versions. See the README for more information" 162 | ) 163 | if isinstance(schema, dict): 164 | schema = json.dumps(schema) 165 | 166 | if openai_client is None: 167 | raise ValueError( 168 | "OpenAI client is required for structured extraction. Please provide a valid OpenAI client." 
169 | ) 170 | 171 | results = [] 172 | total_tokens_used = 0 173 | 174 | n_threads = (os.cpu_count() or 1) * 2 175 | with ThreadPoolExecutor(max_workers=n_threads) as executor: 176 | future_to_chunk = { 177 | executor.submit( 178 | extract_from_chunk, 179 | chunk=chunk, 180 | chunk_index=i, 181 | schema=schema, 182 | ai_model=model, 183 | source=chunk.path if chunk.path else "", 184 | multiple_extractions=multiple_extractions, 185 | extraction_prompt=extraction_prompt, 186 | host_images=host_images, 187 | openai_client=openai_client, 188 | ): i 189 | for i, chunk in enumerate(chunks) 190 | } 191 | 192 | for future in as_completed(future_to_chunk): 193 | try: 194 | result, tokens_used = future.result() 195 | results.append(result) 196 | total_tokens_used += tokens_used 197 | except Exception as e: 198 | chunk_index = future_to_chunk[future] 199 | results.append( 200 | { 201 | "chunk_index": chunk_index, 202 | "source": chunks[chunk_index].path, 203 | "error": str(e), 204 | } 205 | ) 206 | 207 | results.sort(key=lambda x: x["chunk_index"]) 208 | return results, total_tokens_used 209 | 210 | 211 | def extract_from_url( 212 | url: str, 213 | schema: Union[str, Dict], 214 | model: str = DEFAULT_AI_MODEL, 215 | multiple_extractions: bool = False, 216 | extraction_prompt: str = DEFAULT_EXTRACTION_PROMPT, 217 | host_images: bool = False, 218 | verbose: bool = False, 219 | chunking_method: Callable[[List[Chunk]], List[Chunk]] = chunk_by_page, 220 | openai_client: Optional[OpenAI] = None, 221 | ) -> Tuple[List[Dict], int]: 222 | print( 223 | f"[thepipe] Extract functions will be deprecated in future versions. See the README for more information" 224 | ) 225 | chunks = scrape_url( 226 | url, 227 | verbose=verbose, 228 | chunking_method=chunking_method, 229 | openai_client=openai_client, 230 | ) 231 | extracted_chunks, tokens_used = extract( 232 | chunks=chunks, 233 | schema=schema, 234 | model=model, 235 | multiple_extractions=multiple_extractions, 236 | extraction_prompt=extraction_prompt, 237 | host_images=host_images, 238 | openai_client=openai_client, 239 | ) 240 | return extracted_chunks, tokens_used 241 | 242 | 243 | def extract_from_file( 244 | file_path: str, 245 | schema: Union[str, Dict], 246 | model: str = DEFAULT_AI_MODEL, 247 | multiple_extractions: bool = False, 248 | extraction_prompt: str = DEFAULT_EXTRACTION_PROMPT, 249 | host_images: bool = False, 250 | verbose: bool = False, 251 | chunking_method: Callable[[List[Chunk]], List[Chunk]] = chunk_by_page, 252 | openai_client: Optional[OpenAI] = None, 253 | ) -> Tuple[List[Dict], int]: 254 | print( 255 | f"[thepipe] Extract functions will be deprecated in future versions. 
See the README for more information" 256 | ) 257 | chunks = scrape_file( 258 | file_path, 259 | verbose=verbose, 260 | chunking_method=chunking_method, 261 | openai_client=openai_client, 262 | ) 263 | extracted_chunks, tokens_used = extract( 264 | chunks=chunks, 265 | schema=schema, 266 | model=model, 267 | multiple_extractions=multiple_extractions, 268 | extraction_prompt=extraction_prompt, 269 | host_images=host_images, 270 | openai_client=openai_client, 271 | ) 272 | return extracted_chunks, tokens_used 273 | -------------------------------------------------------------------------------- /thepipe/chunker.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List, Optional, Tuple, Union 3 | from .core import ( 4 | Chunk, 5 | calculate_tokens, 6 | DEFAULT_AI_MODEL, 7 | DEFAULT_EMBEDDING_MODEL, 8 | ) 9 | import numpy as np 10 | from pydantic import BaseModel 11 | from openai import OpenAI 12 | 13 | 14 | class Section(BaseModel): 15 | title: str 16 | start_line: int 17 | end_line: int 18 | 19 | 20 | class SectionList(BaseModel): 21 | sections: List[Section] 22 | 23 | 24 | def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]: 25 | chunks_by_doc = {} 26 | new_chunks = [] 27 | for chunk in chunks: 28 | if not chunk.path: 29 | raise ValueError( 30 | "Document chunking requires the path attribute to determine the document boundaries" 31 | ) 32 | if chunk.path not in chunks_by_doc: 33 | chunks_by_doc[chunk.path] = [] 34 | chunks_by_doc[chunk.path].append(chunk) 35 | for doc_chunks in chunks_by_doc.values(): 36 | doc_texts = [] 37 | doc_images = [] 38 | for chunk in doc_chunks: 39 | doc_texts.append(chunk.text) 40 | doc_images.extend(chunk.images) 41 | text = "\n".join(doc_texts) if doc_texts else None 42 | new_chunks.append(Chunk(path=doc_chunks[0].path, text=text, images=doc_images)) 43 | return new_chunks 44 | 45 | 46 | def chunk_by_page(chunks: List[Chunk]) -> List[Chunk]: 47 | # by-page chunking is default behavior 48 | return chunks 49 | 50 | 51 | def chunk_by_section( 52 | chunks: List[Chunk], section_separator: str = "## " 53 | ) -> List[Chunk]: 54 | section_chunks: List[Chunk] = [] 55 | cur_text: Optional[str] = None 56 | cur_images: List = [] 57 | cur_path: Optional[str] = None 58 | 59 | for chunk in chunks: 60 | # Extract text (always a string or None) 61 | chunk_text = chunk.text or "" 62 | # Append images to current section once started 63 | if cur_text is not None and getattr(chunk, "images", None): 64 | if chunk.images: 65 | cur_images.extend(chunk.images) 66 | 67 | for line in chunk_text.split("\n"): 68 | if line.startswith(section_separator): 69 | # New section header found 70 | if cur_text is not None: 71 | # Flush previous section 72 | section_chunks.append( 73 | Chunk( 74 | path=cur_path, 75 | text=cur_text.rstrip("\n"), 76 | images=cur_images.copy(), 77 | ) 78 | ) 79 | # Start new section 80 | cur_text = line + "\n" 81 | cur_images = [] 82 | cur_path = chunk.path 83 | else: 84 | if cur_text is not None: 85 | cur_text += line + "\n" 86 | else: 87 | # Text before any section header: start first section 88 | if line.strip(): 89 | cur_text = line + "\n" 90 | cur_path = chunk.path 91 | cur_images = [] 92 | 93 | # Flush last section if present 94 | if cur_text is not None: 95 | section_chunks.append( 96 | Chunk(path=cur_path, text=cur_text.rstrip("\n"), images=cur_images.copy()) 97 | ) 98 | 99 | return section_chunks 100 | 101 | 102 | def chunk_semantic( 103 | chunks: List[Chunk], 104 | model: str = 
DEFAULT_EMBEDDING_MODEL, 105 | buffer_size: int = 3, 106 | similarity_threshold: float = 0.1, 107 | ) -> List[Chunk]: 108 | try: 109 | from sentence_transformers import SentenceTransformer 110 | except ImportError as exc: # pragma: no cover - exercised via runtime usage 111 | raise ImportError( 112 | "`chunk_semantic` requires the optional dependency `sentence-transformers`. " 113 | "Install it with `pip install thepipe-api[semantic]` or include the `gpu` extra." 114 | ) from exc 115 | 116 | embedding_model = SentenceTransformer(model_name_or_path=model) 117 | # Flatten the chunks into sentences 118 | sentences = [] 119 | sentence_chunk_map = [] 120 | sentence_path_map = [] 121 | for chunk in chunks: 122 | chunk_text = chunk.text 123 | if chunk_text: 124 | lines = re.split(r"(?<=[.?!])\s+", chunk_text) 125 | for line in lines: 126 | sentences.append(line) 127 | sentence_chunk_map.append(chunk) 128 | sentence_path_map.append(chunk.path) 129 | 130 | # Compute embeddings 131 | embeddings = np.array(embedding_model.encode(sentences, convert_to_numpy=True)) 132 | 133 | # Create groups based on sentence similarity 134 | grouped_sentences = [] 135 | current_group = [] 136 | for i, embedding in enumerate(embeddings): 137 | if not current_group: 138 | current_group.append(i) 139 | continue 140 | # Check similarity with the last sentence in the current group 141 | # If the similarity is above the threshold, add the sentence to the group 142 | # Otherwise, start a new group 143 | a = embedding 144 | b = embeddings[current_group[-1]] 145 | denom = float(np.linalg.norm(a) * np.linalg.norm(b)) 146 | similarity = float(np.dot(a, b) / denom) if denom else 0.0 147 | if similarity >= similarity_threshold: 148 | current_group.append(i) 149 | else: 150 | grouped_sentences.append(current_group) 151 | current_group = [i] 152 | 153 | if current_group: 154 | grouped_sentences.append(current_group) 155 | 156 | # Create new chunks based on grouped sentences 157 | new_chunks = [] 158 | for group in grouped_sentences: 159 | group_text = "\n".join(sentences[i] for i in group) 160 | group_images = [] 161 | group_path = sentence_path_map[group[0]] 162 | seen_images = [] 163 | for i in group: 164 | for image in sentence_chunk_map[i].images: 165 | if image not in seen_images: 166 | group_images.append(image) 167 | seen_images.append(image) 168 | new_chunks.append(Chunk(path=group_path, text=group_text, images=group_images)) 169 | 170 | return new_chunks 171 | 172 | 173 | # starts a new chunk any time a word is found 174 | def chunk_by_keywords( 175 | chunks: List[Chunk], keywords: List[str] = ["section"] 176 | ) -> List[Chunk]: 177 | new_chunks = [] 178 | current_chunk_text = "" 179 | current_chunk_images = [] 180 | current_chunk_path = chunks[0].path 181 | for chunk in chunks: 182 | if chunk.images: 183 | current_chunk_images.extend(chunk.images) 184 | lines = chunk.text.split("\n") if chunk.text else [] 185 | for line in lines: 186 | if any(keyword.lower() in line.lower() for keyword in keywords): 187 | if current_chunk_text: 188 | new_chunks.append( 189 | Chunk( 190 | path=chunk.path, 191 | text=current_chunk_text, 192 | images=current_chunk_images, 193 | ) 194 | ) 195 | current_chunk_text = "" 196 | current_chunk_images = [] 197 | current_chunk_path = chunk.path 198 | current_chunk_text += line + "\n" 199 | if current_chunk_text: 200 | new_chunks.append( 201 | Chunk( 202 | path=current_chunk_path, 203 | text=current_chunk_text, 204 | images=current_chunk_images, 205 | ) 206 | ) 207 | return new_chunks 208 | 209 | 210 
| def chunk_by_length(chunks: List[Chunk], max_tokens: int = 10000) -> List[Chunk]: 211 | new_chunks = [] 212 | for chunk in chunks: 213 | total_tokens = calculate_tokens([chunk]) 214 | if total_tokens < max_tokens: 215 | new_chunks.append(chunk) 216 | continue 217 | text_halfway_index = len(chunk.text) // 2 if chunk.text else 0 218 | images_halfway_index = len(chunk.images) // 2 if chunk.images else 0 219 | if text_halfway_index == 0 and images_halfway_index == 0: 220 | if chunk.images: 221 | # can't be split further: try to reduce the size of the images 222 | # by resizing each image to half its size 223 | new_images = [] 224 | for image in chunk.images: 225 | new_width = image.width // 2 226 | new_height = image.height // 2 227 | resized_image = image.resize((new_width, new_height)) 228 | new_images.append(resized_image) 229 | else: 230 | # throw error to prevent downstream errors with LLM inference 231 | raise ValueError( 232 | "Chunk cannot be split further. Please increase the max_tokens limit." 233 | ) 234 | 235 | return new_chunks 236 | split_chunks = [ 237 | Chunk( 238 | path=chunk.path, 239 | text=chunk.text[:text_halfway_index] if chunk.text else None, 240 | images=chunk.images[:images_halfway_index] if chunk.images else None, 241 | ), 242 | Chunk( 243 | path=chunk.path, 244 | text=chunk.text[text_halfway_index:] if chunk.text else None, 245 | images=chunk.images[images_halfway_index:] if chunk.images else None, 246 | ), 247 | ] 248 | # recursive call 249 | new_chunks = chunk_by_length(split_chunks, max_tokens) 250 | 251 | return new_chunks 252 | 253 | 254 | # LLM-based agentic semantic chunking (experimental, openai only) 255 | def chunk_agentic( 256 | chunks: List[Chunk], 257 | openai_client: OpenAI, 258 | model: str = DEFAULT_AI_MODEL, 259 | max_tokens: int = 50000, 260 | ) -> List[Chunk]: 261 | # 1) Enforce a hard token limit 262 | chunks = chunk_by_length(chunks, max_tokens=max_tokens) 263 | 264 | # 2) Group by document 265 | docs: Dict[str, List[Chunk]] = {} 266 | for c in chunks: 267 | docs.setdefault(c.path or "__no_path__", []).append(c) 268 | 269 | final_chunks: List[Chunk] = [] 270 | 271 | for path, doc_chunks in docs.items(): 272 | # Flatten into numbered lines 273 | lines: List[str] = [] 274 | line_to_chunk: List[Chunk] = [] 275 | for chunk in doc_chunks: 276 | texts = ( 277 | chunk.text 278 | if isinstance(chunk.text, list) 279 | else ([chunk.text] if chunk.text else []) 280 | ) 281 | for text in texts: 282 | for line in text.split("\n"): 283 | lines.append(line) 284 | line_to_chunk.append(chunk) 285 | if not lines: 286 | continue 287 | 288 | numbered = "\n".join(f"{i+1}: {lines[i]}" for i in range(len(lines))) 289 | 290 | # 3) Ask the LLM for structured JSON 291 | system_prompt = ( 292 | "Divide the following numbered document into semantically cohesive sections. " 293 | "Return only a single JSON object matching the Pydantic schema `SectionList`, " 294 | "e.g.:\n" 295 | "{\n" 296 | ' "sections": [\n' 297 | ' {"title": "Introduction", "start_line": 1, "end_line": 5},\n' 298 | " ...\n" 299 | " ]\n" 300 | "}\n" 301 | "Ensure `start_line` and `end_line` are integers, cover every line in order, " 302 | "and do not overlap or leave gaps." 
303 | ) 304 | user_prompt = numbered 305 | 306 | completion = openai_client.beta.chat.completions.parse( 307 | model=model, 308 | messages=[ 309 | {"role": "system", "content": system_prompt}, 310 | {"role": "user", "content": user_prompt}, 311 | ], 312 | response_format=SectionList, 313 | ) 314 | 315 | if not completion.choices[0].message.parsed: 316 | raise ValueError( 317 | "LLM did not return a valid response during agentic chunking." 318 | ) 319 | 320 | sections: List[Section] = completion.choices[0].message.parsed.sections 321 | 322 | # build chunks from those sections 323 | for sec in sections: 324 | start, end, title = sec.start_line, sec.end_line, sec.title 325 | # clamp 326 | start = max(1, min(start, len(lines))) 327 | end = max(start, min(end, len(lines))) 328 | 329 | sec_lines = lines[start - 1 : end] 330 | seen_imgs = set() 331 | sec_images = [] 332 | for idx in range(start - 1, end): 333 | for img in getattr(line_to_chunk[idx], "images", []): 334 | if img not in seen_imgs: 335 | seen_imgs.add(img) 336 | sec_images.append(img) 337 | 338 | # prepend header 339 | text_block = "\n".join(sec_lines) 340 | new_chunk = Chunk( 341 | path=path if path != "__no_path__" else None, 342 | text=text_block, 343 | images=sec_images, 344 | ) 345 | 346 | # break further by length if needed 347 | final_chunks.extend(chunk_by_length([new_chunk], max_tokens=max_tokens)) 348 | 349 | return final_chunks 350 | -------------------------------------------------------------------------------- /thepipe/core.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import base64 3 | from io import BytesIO 4 | import json 5 | import os 6 | import re 7 | import time 8 | from typing import Dict, Iterable, List, Optional, Tuple, Union 9 | import requests 10 | from PIL import Image 11 | 12 | try: # Optional LlamaIndex dependency 13 | from llama_index.core.schema import Document as _LlamaDocument 14 | from llama_index.core.schema import ImageDocument as _LlamaImageDocument 15 | except ImportError: # pragma: no cover - handled dynamically in helpers below 16 | _LlamaDocument = None # type: ignore[assignment] 17 | _LlamaImageDocument = None # type: ignore[assignment] 18 | 19 | # Re-export for backwards compatibility (may be ``None`` when not installed) 20 | Document = _LlamaDocument # type: ignore[assignment] 21 | ImageDocument = _LlamaImageDocument # type: ignore[assignment] 22 | 23 | # LLM provider info, defaults to openai 24 | DEFAULT_AI_MODEL = os.getenv("DEFAULT_AI_MODEL", "gpt-4o") 25 | DEFAULT_EMBEDDING_MODEL = os.getenv( 26 | "DEFAULT_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2" 27 | ) 28 | 29 | # for persistent images via filehosting 30 | HOST_IMAGES = os.getenv("HOST_IMAGES", "false").lower() == "true" 31 | HOST_URL = os.getenv("HOST_URL", "https://thepipe-api.up.railway.app") 32 | 33 | 34 | def prepare_image(image: Image.Image) -> Image.Image: 35 | """Return an in-memory copy of ``image`` with its underlying resources closed.""" 36 | 37 | try: 38 | image.load() 39 | except Exception: 40 | pass 41 | 42 | try: 43 | prepared_image = image.copy() 44 | except Exception: 45 | return image 46 | 47 | try: 48 | image.close() 49 | except Exception: 50 | pass 51 | 52 | return prepared_image 53 | 54 | 55 | def _ensure_llama_index() -> Tuple["Document", "ImageDocument"]: 56 | """Import LlamaIndex lazily and provide a helpful error message if missing.""" 57 | 58 | global _LlamaDocument, _LlamaImageDocument 59 | 60 | if _LlamaDocument is not None and 
_LlamaImageDocument is not None: 61 | return _LlamaDocument, _LlamaImageDocument # type: ignore[return-value] 62 | 63 | try: 64 | from llama_index.core.schema import Document as doc_cls 65 | from llama_index.core.schema import ImageDocument as image_doc_cls 66 | except ImportError as exc: # pragma: no cover - exercised via has_llama_index 67 | raise ImportError( 68 | "LlamaIndex support is optional. Install it with " 69 | "`pip install thepipe-api[llama-index]` to use `Chunk.to_llamaindex`." 70 | ) from exc 71 | 72 | _LlamaDocument, _LlamaImageDocument = doc_cls, image_doc_cls 73 | return doc_cls, image_doc_cls # type: ignore[return-value] 74 | 75 | 76 | def has_llama_index() -> bool: 77 | """Return ``True`` when the optional LlamaIndex dependency is available.""" 78 | 79 | try: 80 | _ensure_llama_index() 81 | except ImportError: 82 | return False 83 | return True 84 | 85 | 86 | class Chunk: 87 | def __init__( 88 | self, 89 | path: Optional[str] = None, 90 | text: Optional[str] = None, 91 | images: Optional[Iterable[Image.Image]] = None, 92 | audios: Optional[Iterable] = None, 93 | videos: Optional[Iterable] = None, 94 | ): 95 | self.path = path 96 | self.text = text or "" 97 | self.images = [prepare_image(image) for image in images] if images else [] 98 | self.audios = list(audios) if audios else [] 99 | self.videos = list(videos) if videos else [] 100 | 101 | def __repr__(self) -> str: 102 | parts = [] 103 | if self.path is not None: 104 | parts.append(f"path={self.path!r}") 105 | if self.text: 106 | # Show a concise preview of the text 107 | snippet = self.text.replace("\n", " ") 108 | if len(snippet) > 50: 109 | snippet = snippet[:47] + "..." 110 | parts.append(f"text_snippet={snippet!r}") 111 | if self.images: 112 | parts.append(f"images_count={len(self.images)}") 113 | if self.audios: 114 | parts.append(f"audios_count={len(self.audios)}") 115 | if self.videos: 116 | parts.append(f"videos_count={len(self.videos)}") 117 | content = ", ".join(parts) or "empty" 118 | return f"Chunk({content})" 119 | 120 | def __str__(self) -> str: 121 | return self.__repr__() 122 | 123 | def to_llamaindex(self) -> Union[List["Document"], List["ImageDocument"]]: 124 | DocumentCls, ImageDocumentCls = _ensure_llama_index() 125 | document_text = self.text if self.text else "" 126 | metadata = {"filepath": self.path} if self.path else {} 127 | 128 | # If we have PIL Image objects in self.images, convert them to base64 strings 129 | if self.images: 130 | image_docs: List[ImageDocument] = [] 131 | for img in self.images: 132 | # Encode the image to JPEG (or use its original format if available) 133 | buffer = BytesIO() 134 | fmt = img.format or "JPEG" 135 | img = img.convert("RGB") # ensure RGB 136 | img.save(buffer, format=fmt) 137 | img_bytes = buffer.getvalue() 138 | 139 | # Base64‑encode and build MIME type 140 | img_b64 = base64.b64encode(img_bytes).decode("utf-8") 141 | 142 | image_docs.append( 143 | ImageDocumentCls( 144 | text=document_text, 145 | image=img_b64, 146 | extra_info=metadata, 147 | ) 148 | ) 149 | return image_docs 150 | 151 | # Fallback to plain text Document 152 | return [DocumentCls(text=document_text, extra_info=metadata)] 153 | 154 | def to_message( 155 | self, 156 | text_only: bool = False, 157 | host_images: bool = False, 158 | max_resolution: Optional[int] = None, 159 | include_paths: Optional[bool] = False, 160 | ) -> Dict: 161 | message_text = "" 162 | message = {"role": "user", "content": []} 163 | image_urls = ( 164 | [ 165 | make_image_url(image, host_images, max_resolution) 
166 | for image in self.images 167 | ] 168 | if self.images and not text_only 169 | else [] 170 | ) 171 | img_index = 0 172 | text = self.text if self.text else "" 173 | if host_images: 174 | 175 | def replace_image(match): 176 | nonlocal img_index 177 | if img_index < len(image_urls): 178 | url = image_urls[img_index] 179 | img_index += 1 180 | return f"![image]({url})" 181 | return match.group( 182 | 0 183 | ) # If we run out of images, leave the original text 184 | 185 | # Replace markdown image references with hosted URLs 186 | text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", replace_image, text) 187 | message_text += text + "\n\n" 188 | # clean up, add to message 189 | message_text = re.sub(r"\n{3,}", "\n\n", message_text).strip() 190 | # Wrap the text in a path html block if it exists 191 | if include_paths and self.path: 192 | message_text = f'\n{message_text}\n' 193 | message["content"].append({"type": "text", "text": message_text}) 194 | 195 | # Add remaining images that weren't referenced in the text 196 | for image_url in image_urls: 197 | message["content"].append({"type": "image_url", "image_url": image_url}) 198 | 199 | return message 200 | 201 | def to_json(self, host_images: bool = False, text_only: bool = False) -> Dict: 202 | data = { 203 | "path": self.path, 204 | "text": self.text.strip() if self.text else "", 205 | "images": ( 206 | [ 207 | make_image_url(image=image, host_images=host_images) 208 | for image in self.images 209 | if not text_only 210 | ] 211 | if self.images 212 | else [] 213 | ), 214 | "audios": self.audios, 215 | "videos": self.videos, 216 | } 217 | return data 218 | 219 | @staticmethod 220 | def from_json(data: Dict, host_images: bool = False) -> "Chunk": 221 | images = [] 222 | if "images" in data: 223 | for image_str in data["images"]: 224 | if host_images: 225 | image_data = requests.get(image_str).content 226 | image = Image.open(BytesIO(image_data)) 227 | images.append(image) 228 | else: 229 | remove_prefix = image_str.replace("data:image/jpeg;base64,", "") 230 | image_data = base64.b64decode(remove_prefix) 231 | image = Image.open(BytesIO(image_data)) 232 | images.append(image) 233 | text = data["text"].strip() if "text" in data else None 234 | return Chunk( 235 | path=data["path"], 236 | text=text, 237 | images=images, 238 | # audios=data['audios'], 239 | # videos=data['videos'], 240 | ) 241 | 242 | 243 | def make_image_url( 244 | image: Image.Image, host_images: bool = False, max_resolution: Optional[int] = None 245 | ) -> str: 246 | if max_resolution: 247 | width, height = image.size 248 | if width > max_resolution or height > max_resolution: 249 | scale = max_resolution / max(width, height) 250 | new_width = int(width * scale) 251 | new_height = int(height * scale) 252 | image = image.resize((new_width, new_height)) 253 | if host_images: 254 | if not os.path.exists("images"): 255 | os.makedirs("images") 256 | image_id = f"{time.time_ns()}.jpg" 257 | image_path = os.path.join("images", image_id) 258 | if image.mode in ("P", "RGBA"): 259 | image = image.convert("RGB") 260 | image.save(image_path) 261 | return f"{HOST_URL}/images/{image_id}" 262 | else: 263 | buffered = BytesIO() 264 | if image.mode != "RGB": 265 | image = image.convert("RGB") 266 | image.save(buffered, format="JPEG") 267 | img_str = base64.b64encode(buffered.getvalue()).decode() 268 | return f"data:image/jpeg;base64,{img_str}" 269 | 270 | 271 | def calculate_image_tokens(image: Image.Image, detail: str = "auto") -> int: 272 | width, height = image.size 273 | if detail == "low": 274 
| return 85 275 | elif detail == "high": 276 | width, height = min(width, 2048), min(height, 2048) 277 | short_side = min(width, height) 278 | scale = 768 / short_side 279 | scaled_width = int(width * scale) 280 | scaled_height = int(height * scale) 281 | tiles = (scaled_width // 512) * (scaled_height // 512) 282 | return 170 * tiles + 85 283 | else: 284 | if width <= 512 and height <= 512: 285 | return 85 286 | else: 287 | return calculate_image_tokens(image, detail="high") 288 | 289 | 290 | def calculate_tokens(chunks: List[Chunk], text_only: bool = False) -> int: 291 | n_tokens = 0 292 | for chunk in chunks: 293 | if chunk.text: 294 | n_tokens += len(chunk.text) / 4 295 | if chunk.images and not text_only: 296 | for image in chunk.images: 297 | n_tokens += calculate_image_tokens(image) 298 | return int(n_tokens) 299 | 300 | 301 | def chunks_to_messages( 302 | chunks: List[Chunk], 303 | text_only: bool = False, 304 | host_images: bool = False, 305 | max_resolution: Optional[int] = None, 306 | include_paths: Optional[bool] = False, 307 | ) -> List[Dict]: 308 | return [ 309 | chunk.to_message( 310 | text_only=text_only, 311 | host_images=host_images, 312 | max_resolution=max_resolution, 313 | include_paths=include_paths, 314 | ) 315 | for chunk in chunks 316 | ] 317 | 318 | 319 | def save_outputs( 320 | chunks: List[Chunk], 321 | output_folder: str, 322 | verbose: bool = False, 323 | text_only: bool = False, 324 | ) -> None: 325 | if not os.path.exists(output_folder): 326 | os.makedirs(output_folder) 327 | text = "" 328 | # Save the text and images to the outputs directory 329 | for i, chunk in enumerate(chunks): 330 | if chunk is None: 331 | continue 332 | if chunk.path is not None: 333 | text += f"{chunk.path}:\n" 334 | if chunk.text: 335 | text += f"```\n{chunk.text}\n```\n" 336 | if not text_only and chunk.images: 337 | for j, image in enumerate(chunk.images): 338 | image.convert("RGB").save(f"{output_folder}/{i}_{j}.jpg") 339 | # Save the text 340 | with open(f"{output_folder}/prompt.txt", "w", encoding="utf-8") as file: 341 | file.write(text) 342 | if verbose: 343 | print(f"[thepipe] {calculate_tokens(chunks)} tokens saved to {output_folder}") 344 | 345 | 346 | def parse_arguments() -> argparse.Namespace: 347 | parser = argparse.ArgumentParser( 348 | description="Compress project files into a context prompt." 349 | ) 350 | parser.add_argument( 351 | "source", type=str, help="The source file or directory to compress." 
352 | ) 353 | parser.add_argument( 354 | "--inclusion_pattern", 355 | type=str, 356 | default=None, 357 | help="Regex pattern to match in a directory.", 358 | ) 359 | parser.add_argument( 360 | "--ai_extraction", 361 | action="store_true", 362 | help="Use ai_extraction to extract text from images.", 363 | ) 364 | parser.add_argument("--text_only", action="store_true", help="Only store text.") 365 | parser.add_argument("--verbose", action="store_true", help="Print status messages.") 366 | parser.add_argument("--local", action="store_true", help="Print status messages.") 367 | args = parser.parse_args() 368 | return args 369 | -------------------------------------------------------------------------------- /tests/test_scraper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from typing import cast 4 | import unittest 5 | import os 6 | import sys 7 | import zipfile 8 | from PIL import Image 9 | import pandas as pd 10 | 11 | try: 12 | from openai import OpenAI 13 | except ImportError: # pragma: no cover - optional in CI 14 | OpenAI = None # type: ignore[assignment] 15 | 16 | try: 17 | import whisper # noqa: F401 # pragma: no cover - optional dependency 18 | 19 | HAS_WHISPER = True 20 | except ImportError: # pragma: no cover - optional dependency 21 | HAS_WHISPER = False 22 | 23 | SKIP_NETWORK_TESTS = os.getenv("THEPIPE_SKIP_NETWORK_TESTS") == "1" 24 | 25 | sys.path.append("..") 26 | import thepipe.core as core 27 | import thepipe.scraper as scraper 28 | 29 | 30 | class test_scraper(unittest.TestCase): 31 | def setUp(self): 32 | self.files_directory = os.path.join(os.path.dirname(__file__), "files") 33 | self.outputs_directory = "outputs" 34 | # create a client we can re-use for ai_extraction scenarios 35 | self.client = OpenAI() if OpenAI is not None else None 36 | 37 | def tearDown(self): 38 | # clean up outputs 39 | if os.path.exists(self.outputs_directory): 40 | for file in os.listdir(self.outputs_directory): 41 | os.remove(os.path.join(self.outputs_directory, file)) 42 | os.rmdir(self.outputs_directory) 43 | 44 | def test_scrape_directory(self): 45 | # verify scraping entire example directory, bar the 'unknown' file 46 | chunks = scraper.scrape_directory( 47 | dir_path=self.files_directory, inclusion_pattern="^(?!.*unknown).*" 48 | ) 49 | self.assertIsInstance(chunks, list) 50 | self.assertGreater(len(chunks), 0) 51 | for chunk in chunks: 52 | self.assertIsInstance(chunk, core.Chunk) 53 | # ensure at least one of text/images is non-empty 54 | if not (chunk.text or chunk.images): 55 | self.fail("Empty chunk found: {}".format(chunk.path)) 56 | self.assertTrue(chunk.text or chunk.images) 57 | 58 | def test_scrape_directory_inclusion_exclusion(self): 59 | with tempfile.TemporaryDirectory() as tmp: 60 | # ignored folder 61 | os.makedirs(os.path.join(tmp, "node_modules")) 62 | with open(os.path.join(tmp, "node_modules", "a.txt"), "w") as f: 63 | f.write("x") 64 | # ignored extension 65 | with open(os.path.join(tmp, "bad.pyc"), "w") as f: 66 | f.write("x") 67 | # valid file 68 | good = os.path.join(tmp, "good.txt") 69 | with open(good, "w") as f: 70 | f.write("Y") 71 | 72 | chunks = scraper.scrape_directory(tmp, inclusion_pattern="good") 73 | 74 | self.assertEqual(len(chunks), 1) 75 | 76 | # cast .text to str so Pylance knows it's not None 77 | text = cast(str, chunks[0].text) 78 | self.assertIn("Y", text) 79 | 80 | def test_scrape_html(self): 81 | filepath = os.path.join(self.files_directory, "example.html") 82 | chunks 
= scraper.scrape_file(filepath, verbose=True) 83 | # verify it scraped the url into chunks 84 | self.assertIsInstance(chunks, list) 85 | self.assertGreater(len(chunks), 0) 86 | # verify it scraped markdown data 87 | self.assertTrue(any(chunk.text and len(chunk.text) > 0 for chunk in chunks)) 88 | # verify it scraped to markdown correctly 89 | self.assertTrue(any("# Heading 1" in (chunk.text or "") for chunk in chunks)) 90 | self.assertTrue(any("## Heading 2" in (chunk.text or "") for chunk in chunks)) 91 | self.assertTrue(any("### Heading 3" in (chunk.text or "") for chunk in chunks)) 92 | self.assertTrue( 93 | any("| Name | Age | Country |" in (chunk.text or "") for chunk in chunks) 94 | ) 95 | # verify bold and italic 96 | self.assertTrue(any("**bold text**" in (chunk.text or "") for chunk in chunks)) 97 | self.assertTrue(any("*italic text*" in (chunk.text or "") for chunk in chunks)) 98 | # ensure javascript was not scraped 99 | self.assertFalse( 100 | any("function highlightText()" in (chunk.text or "") for chunk in chunks) 101 | ) 102 | 103 | def test_scrape_zip(self): 104 | with tempfile.TemporaryDirectory() as tmp: 105 | txt = os.path.join(tmp, "a.txt") 106 | with open(txt, "w") as f: 107 | f.write("TXT") 108 | imgf = os.path.join(tmp, "i.jpg") 109 | Image.new("RGB", (10, 10)).save(imgf) 110 | zf = os.path.join(tmp, "test.zip") 111 | with zipfile.ZipFile(zf, "w") as z: 112 | z.write(txt, arcname="a.txt") 113 | z.write(imgf, arcname="i.jpg") 114 | chunks = scraper.scrape_file(zf) 115 | 116 | self.assertTrue(any("TXT" in cast(str, c.text) for c in chunks)) 117 | self.assertTrue(any(c.images for c in chunks)) 118 | 119 | def test_scrape_spreadsheet(self): 120 | with tempfile.TemporaryDirectory() as tmp: 121 | df = pd.DataFrame({"a": [1, 2]}) 122 | csvp = os.path.join(tmp, "t.csv") 123 | df.to_csv(csvp, index=False) 124 | chunks_csv = scraper.scrape_spreadsheet(csvp, "application/vnd.ms-excel") 125 | self.assertEqual(len(chunks_csv), 2) 126 | for i, c in enumerate(chunks_csv): 127 | self.assertIsNotNone(c.text) 128 | rec = json.loads(cast(str, c.text)) 129 | self.assertEqual(rec["a"], i + 1) 130 | self.assertEqual(rec["row index"], i) 131 | 132 | xlsx = os.path.join(tmp, "t.xlsx") 133 | df.to_excel(xlsx, index=False) 134 | chunks_xlsx = scraper.scrape_spreadsheet( 135 | xlsx, 136 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 137 | ) 138 | self.assertEqual(len(chunks_xlsx), 2) 139 | 140 | def test_scrape_ipynb(self): 141 | chunks = scraper.scrape_file( 142 | os.path.join(self.files_directory, "example.ipynb"), verbose=True 143 | ) 144 | # verify it scraped the ipynb file into chunks 145 | self.assertIsInstance(chunks, list) 146 | self.assertGreater(len(chunks), 0) 147 | self.assertIsInstance(chunks[0], core.Chunk) 148 | # verify it scraped text data 149 | self.assertTrue( 150 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 151 | ) 152 | # verify it scraped image data 153 | self.assertTrue( 154 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 155 | ) 156 | 157 | # requires LLM server to be set up 158 | @unittest.skipIf( 159 | OpenAI is None or not os.getenv("OPENAI_API_KEY"), "OpenAI API key required" 160 | ) 161 | def test_scrape_pdf_with_ai_extraction(self): 162 | chunks = scraper.scrape_file( 163 | os.path.join(self.files_directory, "example.pdf"), 164 | verbose=True, 165 | openai_client=self.client, 166 | ) 167 | # verify it scraped the pdf file into chunks 168 | self.assertIsInstance(chunks, list) 169 | 
self.assertGreater(len(chunks), 0) 170 | self.assertIsInstance(chunks[0], core.Chunk) 171 | # verify it scraped the data 172 | for chunk in chunks: 173 | self.assertTrue( 174 | (chunk.text and len(chunk.text or "") > 0) 175 | or (chunk.images and len(chunk.images or []) > 0) 176 | ) 177 | 178 | def test_scrape_docx(self): 179 | chunks = scraper.scrape_file( 180 | os.path.join(self.files_directory, "example.docx"), verbose=True 181 | ) 182 | # verify it scraped the docx file into chunks 183 | self.assertIsInstance(chunks, list) 184 | self.assertGreater(len(chunks), 0) 185 | self.assertIsInstance(chunks[0], core.Chunk) 186 | # verify it scraped data 187 | self.assertTrue( 188 | any(len(chunk.text or "") or len(chunk.images or []) for chunk in chunks) 189 | ) 190 | 191 | def test_extract_pdf_without_ai_extraction(self): 192 | chunks = scraper.scrape_file( 193 | os.path.join(self.files_directory, "example.pdf"), 194 | verbose=True, 195 | ) 196 | # verify it scraped the pdf file into chunks 197 | self.assertIsInstance(chunks, list) 198 | self.assertGreater(len(chunks), 0) 199 | self.assertIsInstance(chunks[0], core.Chunk) 200 | # verify it scraped text data 201 | self.assertTrue( 202 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 203 | ) 204 | # verify it scraped image data 205 | self.assertTrue( 206 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 207 | ) 208 | 209 | @unittest.skipUnless(HAS_WHISPER, "Whisper extra is not installed") 210 | def test_scrape_audio(self): 211 | chunks = scraper.scrape_file( 212 | os.path.join(self.files_directory, "example.mp3"), verbose=True 213 | ) 214 | # verify it scraped the audio file into chunks 215 | self.assertIsInstance(chunks, list) 216 | self.assertGreater(len(chunks), 0) 217 | self.assertIsInstance(chunks[0], core.Chunk) 218 | # verify it scraped audio data 219 | self.assertTrue( 220 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 221 | ) 222 | # verify it transcribed the audio correctly 223 | self.assertTrue( 224 | any(chunk.text and "citizens" in chunk.text.lower() for chunk in chunks) 225 | ) 226 | 227 | @unittest.skipUnless(HAS_WHISPER, "Whisper extra is not installed") 228 | def test_scrape_video(self): 229 | chunks = scraper.scrape_file( 230 | os.path.join(self.files_directory, "example.mp4"), verbose=True 231 | ) 232 | # verify it scraped the video file into chunks 233 | self.assertIsInstance(chunks, list) 234 | self.assertGreater(len(chunks), 0) 235 | self.assertIsInstance(chunks[0], core.Chunk) 236 | # verify it scraped visual data 237 | self.assertTrue( 238 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 239 | ) 240 | # verify it scraped audio data 241 | self.assertTrue( 242 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 243 | ) 244 | # verify it transcribed the audio correctly 245 | self.assertTrue( 246 | any(chunk.text and "citizens" in chunk.text.lower() for chunk in chunks) 247 | ) 248 | 249 | def test_scrape_pptx(self): 250 | chunks = scraper.scrape_file( 251 | os.path.join(self.files_directory, "example.pptx"), verbose=True 252 | ) 253 | # verify it scraped the pptx file into chunks 254 | self.assertIsInstance(chunks, list) 255 | self.assertGreater(len(chunks), 0) 256 | self.assertIsInstance(chunks[0], core.Chunk) 257 | # verify it scraped text data 258 | self.assertTrue( 259 | any(chunk.text and len(chunk.text or "") > 0 for chunk in chunks) 260 | ) 261 | # verify it scraped image data 262 | self.assertTrue( 263 | 
any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 264 | ) 265 | 266 | @unittest.skipIf(SKIP_NETWORK_TESTS, "Network tests disabled") 267 | def test_scrape_tweet(self): 268 | tweet_url = "https://x.com/ylecun/status/1796734866156843480" 269 | chunks = scraper.scrape_url(tweet_url) 270 | # verify it returned chunks representing the tweet 271 | self.assertIsInstance(chunks, list) 272 | self.assertGreater(len(chunks), 0) 273 | self.assertIsInstance(chunks[0], core.Chunk) 274 | # verify it scraped the tweet contents 275 | self.assertTrue(chunks[0].text and len(chunks[0].text or "") > 0) 276 | self.assertTrue(chunks[0].images and len(chunks[0].images or []) > 0) 277 | 278 | @unittest.skipIf(SKIP_NETWORK_TESTS, "Network tests disabled") 279 | def test_scrape_url(self): 280 | # verify web page scrape result 281 | chunks = scraper.scrape_url("https://en.wikipedia.org/wiki/Piping") 282 | 283 | # Basic checks 284 | self.assertIsInstance(chunks, list) 285 | self.assertGreater(len(chunks), 0, "No chunks were extracted") 286 | 287 | for chunk in chunks: 288 | self.assertIsInstance(chunk, core.Chunk) 289 | self.assertEqual(chunk.path, "https://en.wikipedia.org/wiki/Piping") 290 | 291 | # Check that we got some content 292 | has_text = any(chunk.text and len(chunk.text.strip()) > 0 for chunk in chunks) 293 | has_images = any(chunk.images and len(chunk.images) > 0 for chunk in chunks) 294 | 295 | self.assertTrue(has_text or has_images, "No text or images were extracted") 296 | 297 | # More flexible search for piping-related content 298 | if has_text: 299 | all_text = " ".join(chunk.text or "" for chunk in chunks).lower() 300 | piping_keywords = [ 301 | "pipe", 302 | "piping", 303 | "pipeline", 304 | "plumbing", 305 | "tube", 306 | "conduit", 307 | ] 308 | found_keyword = any(keyword in all_text for keyword in piping_keywords) 309 | self.assertTrue( 310 | found_keyword, 311 | f"No piping-related keywords found in text. 
Text preview: {all_text[:200]}...", 312 | ) 313 | 314 | # verify if at least one image was scraped 315 | self.assertTrue( 316 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 317 | ) 318 | 319 | @unittest.skipIf(SKIP_NETWORK_TESTS, "Network tests disabled") 320 | @unittest.skipIf( 321 | OpenAI is None or not os.getenv("OPENAI_API_KEY"), "OpenAI API key required" 322 | ) 323 | def test_scrape_url_with_ai_extraction(self): 324 | # verify web page scrape result with ai extraction 325 | chunks = scraper.scrape_url( 326 | "https://en.wikipedia.org/wiki/Piping", openai_client=self.client 327 | ) 328 | for chunk in chunks: 329 | self.assertIsInstance(chunk, core.Chunk) 330 | self.assertEqual(chunk.path, "https://en.wikipedia.org/wiki/Piping") 331 | # assert if any of the texts contains 'pipe' 332 | print("test_scrape_url_with_ai_extraction chunks:", chunks) 333 | self.assertTrue( 334 | any(chunk.text and "pipe" in chunk.text.lower() for chunk in chunks) 335 | ) 336 | # verify if at least one image was scraped 337 | self.assertTrue( 338 | any(chunk.images and len(chunk.images or []) > 0 for chunk in chunks) 339 | ) 340 | 341 | @unittest.skipUnless(os.environ.get("GITHUB_TOKEN"), "requires GITHUB_TOKEN") 342 | def test_scrape_github(self): 343 | chunks = scraper.scrape_url("https://github.com/emcf/thepipe") 344 | self.assertIsInstance(chunks, list) 345 | self.assertGreater(len(chunks), 0) # should have some repo contents 346 | 347 | 348 | if __name__ == "__main__": 349 | unittest.main() 350 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 | Pipeline Illustration (logo)
4 | # thepi.pe
5 |
6 | Badges: python-gh-action, codecov, MIT license, PyPI
19 | 20 | ## Extract clean data from tricky documents ⚡ 21 | 22 | thepi.pe is a package that can scrape clean markdown, multimodal media, and structured data from complex documents. It uses vision-language models (VLMs) under the hood for superior output quality, and works out-of-the-box with any LLM, VLM, or vector database. It can extract well-formatted data from a wide range of sources, including PDFs, URLs, Word docs, Powerpoints, Python notebooks, videos, audio, and more. 23 | 24 | ## Features 🌟 25 | 26 | - Scrape clean markdown, tables, and images from any document 27 | - Scrape text, images, video, and audio from any file or URL 28 | - Works out-of-the-box with vision-language models, vector databases, and RAG frameworks 29 | - AI-native file-type detection, layout analysis, and structured data extraction 30 | - Accepts a wide range of sources, including PDFs, URLs, Word docs, Powerpoints, Python notebooks, GitHub repos, videos, audio, and more 31 | 32 | ## Get started in 5 minutes 🚀 33 | 34 | Thepipe can be installed via the command line: 35 | 36 | ```bash 37 | pip install thepipe-api 38 | ``` 39 | 40 | The default install only pulls in CPU-friendly dependencies so it is suitable for constrained environments and CI systems. GPU-enabled libraries such as PyTorch and Triton are left as optional extras. 41 | 42 | ### Optional extras 43 | 44 | The package exposes a set of extras so you can opt-in to heavier dependencies on demand: 45 | 46 | | Extra | Installs | When to use it | 47 | | -------------------------- | ----------------------------------------- | ----------------------------------------------------- | 48 | | `thepipe-api[audio]` | `openai-whisper` | Local audio/video transcription via Whisper. | 49 | | `thepipe-api[semantic]` | `sentence-transformers` | Semantic chunking with transformer embeddings. | 50 | | `thepipe-api[llama-index]` | `llama-index` | `Chunk.to_llamaindex()` conversions. | 51 | | `thepipe-api[gpu]` | PyTorch + Whisper + Sentence Transformers | Full GPU acceleration with VLM fine-tuning workloads. | 52 | 53 | If you are targeting CPU-only machines but still need the extras that depend on PyTorch, install the CPU wheels directly from the PyTorch index first and then add the extra. For example: 54 | 55 | ```bash 56 | pip install torch==2.5.1+cpu torchvision==0.20.1+cpu torchaudio==2.5.1+cpu \ 57 | --index-url https://download.pytorch.org/whl/cpu 58 | pip install thepipe-api[semantic] 59 | ``` 60 | 61 | If you need full functionality with media-rich sources such as webpages, video, and audio, you can choose to install the following system dependencies: 62 | 63 | ```bash 64 | apt-get update && apt-get install -y git ffmpeg 65 | python -m playwright install --with-deps chromium 66 | ``` 67 | 68 | and use the global installation with pip: 69 | 70 | ```bash 71 | pip install thepipe-api[all] 72 | ``` 73 | 74 | ### Default setup (OpenAI) 75 | 76 | By default, thepipe uses the [OpenAI API](https://platform.openai.com/docs/overview), so VLM features will work out-of-the-box provided you pass in an OpenAI client. 77 | 78 | ### Custom VLM server setup (OpenRouter, OpenLLM, etc.) 79 | 80 | If you wish to use a local vision-language model or a different cloud provider, you can provide a custom OpenAI client, for example, by setting the base url to `https://openrouter.ai/api/v1` for [OpenRouter](https://openrouter.ai/), or `http://localhost:3000/v1` for a local server such as [OpenLLM](https://github.com/bentoml/OpenLLM). 
Note that you must also pass your non-OpenAI provider's API key into the OpenAI client. The model name can be changed with the `model` parameter. By default, the model will be `gpt-4o`.
81 |
82 | ### Scraping
83 |
84 | ```python
85 | from thepipe.scraper import scrape_file
86 |
87 | # scrape text and page images from a PDF
88 | chunks = scrape_file(filepath="paper.pdf")
89 | ```
90 |
91 | For enhanced scraping with a vision-language model, you can pass in an OpenAI-compatible client and a model name.
92 |
93 | ```python
94 | from openai import OpenAI
95 | from thepipe.scraper import scrape_file
96 |
97 | # create an OpenAI-compatible client
98 | client = OpenAI()
99 |
100 | # scrape clean markdown and page images from a PDF
101 | chunks = scrape_file(
102 |     filepath="paper.pdf",
103 |     openai_client=client,
104 |     model="gpt-4o"
105 | )
106 | ```
107 |
108 | ### Chunking
109 |
110 | To satisfy token-limit constraints, the following chunking methods are available to split the content into smaller chunks.
111 |
112 | - `chunk_by_document`: Returns one chunk with the entire content of the file.
113 | - `chunk_by_page`: Returns one chunk for each page (for example: each webpage, PDF page, or PowerPoint slide).
114 | - `chunk_by_length`: Recursively splits chunks that exceed a maximum token length.
115 | - `chunk_by_section`: Splits chunks by markdown section.
116 | - `chunk_by_keywords`: Starts a new chunk at each line containing one of the given keywords.
117 | - `chunk_semantic` (experimental, requires [sentence-transformers](https://pypi.org/project/sentence-transformers/)): Returns chunks split by spikes in semantic changes, with a configurable threshold.
118 | - `chunk_agentic` (experimental, requires [OpenAI](https://pypi.org/project/openai/)): Returns chunks split by an LLM agent that attempts to find semantically meaningful sections.
119 |
120 | For example:
121 |
122 | ```python
123 | from thepipe.scraper import scrape_file
124 | from thepipe.chunker import chunk_by_document, chunk_by_page
125 |
126 | # optionally, pass in chunking_method
127 | # chunk_by_document returns one chunk for the entire document
128 | chunks = scrape_file(
129 |     filepath="paper.pdf",
130 |     chunking_method=chunk_by_document
131 | )
132 |
133 | # you can also re-chunk later.
134 | # chunk_by_page returns one chunk for each page (for example: each webpage, PDF page, or PowerPoint slide).
135 | chunks = chunk_by_page(chunks)
136 | ```
137 |
138 | ### OpenAI Chat Integration 🤖
139 |
140 | ```python
141 | from openai import OpenAI
142 | from thepipe.core import chunks_to_messages
143 |
144 | # Initialize OpenAI client
145 | client = OpenAI()
146 |
147 | # Use OpenAI-formatted chat messages
148 | messages = [{
149 |     "role": "user",
150 |     "content": [{
151 |         "type": "text",
152 |         "text": "What is the paper about?"
153 |     }]
154 | }]
155 |
156 | # Simply add the scraped chunks to the messages
157 | messages += chunks_to_messages(chunks)
158 |
159 | # Call LLM
160 | response = client.chat.completions.create(
161 |     model="gpt-4o",
162 |     messages=messages,
163 | )
164 | ```
165 |
166 | `chunks_to_messages` takes an optional `text_only` parameter to output only text from the source document. This is useful for downstream use with LLMs that lack multimodal capabilities.
167 |
168 | > ⚠️ **It is important to be mindful of your model's token limit.**
169 | > Be sure your prompt is within the token limit of your model. You can use chunking to split your messages into smaller chunks, as shown in the sketch below.
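For example, here is a minimal sketch of staying within a budget. It reuses `chunks` and `messages` from the examples above; the token budget shown is only an illustrative assumption and depends on your model.

```python
from thepipe.core import calculate_tokens, chunks_to_messages
from thepipe.chunker import chunk_by_length

# rough token estimate for the scraped chunks (counts text and images)
n_tokens = calculate_tokens(chunks)

# if the estimate exceeds the context budget, split oversized chunks
# (100k tokens is an arbitrary example budget, not a thepipe default)
if n_tokens > 100_000:
    chunks = chunk_by_length(chunks, max_tokens=10_000)

# rebuild the chat messages from the re-chunked content
messages += chunks_to_messages(chunks)
```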
170 |
171 | ### LlamaIndex Integration 🦙
172 |
173 | Install the optional extra and then call `.to_llamaindex`:
174 |
175 | ```bash
176 | pip install thepipe-api[llama-index]
177 | ```
178 |
179 | After installation, a chunk can be converted to LlamaIndex `Document`/`ImageDocument` objects with `.to_llamaindex`. Without the extra, a helpful error is raised instead of failing at import time.
180 |
181 | ### Structured extraction 🗂️
182 |
183 | Note that structured extraction is being deprecated and will be removed in future releases. The current implementation is a simple wrapper around OpenAI's chat API, which is not ideal for structured data extraction. We recommend OpenAI's [structured outputs](https://platform.openai.com/docs/guides/structured-outputs?api-mode=chat) for structured data extraction, or using [Trellis AI](https://runtrellis.com/) for automated workflows with structured data.
184 |
185 | ```python
186 | from thepipe.extract import extract
187 | from openai import OpenAI
188 |
189 | client = OpenAI()
190 |
191 | schema = {
192 |     "description": "string",
193 |     "amount_usd": "float"
194 | }
195 |
196 | results, tokens_used = extract(
197 |     chunks=chunks,
198 |     schema=schema,
199 |     multiple_extractions=True, # extract multiple rows of data per chunk
200 |     openai_client=client
201 | )
202 | ```
203 |
204 | ## Running the test suite 🧪
205 |
206 | Install the base requirements plus any extras you rely on, then execute:
207 |
208 | ```bash
209 | pip install -r requirements.txt
210 | python -m unittest discover
211 | ```
212 |
213 | Tests that depend on optional extras (Whisper, Sentence Transformers, LlamaIndex) or an OpenAI API key are skipped automatically when the corresponding dependency is unavailable.
214 |
215 | ## Sponsors
216 |
217 | Please consider supporting thepipe by [becoming a sponsor](mailto:emmett@thepi.pe).
218 | Your support helps me maintain and improve the project while helping the open-source community discover your work.
219 |
220 | Visit [Cal.com](https://cal.com/) for an open-source scheduling tool that helps you book meetings with ease. It's the perfect solution for busy professionals who want to streamline their scheduling process.
221 |
222 | Book us with Cal.com
223 |
224 | Looking for enterprise-ready document processing and intelligent automation? Discover how [Trellis AI](https://runtrellis.com/) can streamline your workflows and enhance productivity.
225 |
226 | ## How it works 🛠️
227 |
228 | thepipe uses a combination of computer-vision models and heuristics to scrape clean content from the source and process it for downstream use with [large language models](https://en.wikipedia.org/wiki/Large_language_model) or [vision-language models](https://en.wikipedia.org/wiki/Vision_transformer). You can feed these messages directly into the model, or chunk them for downstream storage in a vector database such as ChromaDB, or in a RAG framework such as LlamaIndex (see the sketch below).
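For example, here is a minimal sketch of the storage path. It assumes the third-party `chromadb` package (not a thepipe dependency) and lets Chroma embed each chunk's text with its default embedding model; the collection name and IDs are arbitrary.

```python
import chromadb  # third-party vector database, shown for illustration only

from thepipe.scraper import scrape_file

chunks = scrape_file(filepath="paper.pdf")

# store each chunk's text and source path in a local Chroma collection
client = chromadb.Client()
collection = client.get_or_create_collection("papers")
collection.add(
    ids=[f"paper-{i}" for i in range(len(chunks))],
    documents=[chunk.text or "" for chunk in chunks],
    metadatas=[{"path": chunk.path or ""} for chunk in chunks],
)
```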
229 |
230 | ## Supported File Types 📚
231 |
232 | | Source | Input types | Multimodal | Notes |
233 | | --- | --- | --- | --- |
234 | | Webpage | URLs starting with `http`, `https`, `ftp` | ✔️ | Scrapes markdown, images, and tables from web pages. AI extraction available by passing an OpenAI client for screenshot analysis |
235 | | PDF | `.pdf` | ✔️ | Extracts page markdown and page images. AI extraction available when an OpenAI client is supplied for complex or scanned documents |
236 | | Word Document | `.docx` | ✔️ | Extracts text, tables, and images |
237 | | PowerPoint | `.pptx` | ✔️ | Extracts text and images from slides |
238 | | Video | `.mp4`, `.mov`, `.wmv` | ✔️ | Uses Whisper for transcription and extracts frames |
239 | | Audio | `.mp3`, `.wav` | ✔️ | Uses Whisper for transcription |
240 | | Jupyter Notebook | `.ipynb` | ✔️ | Extracts markdown, code, outputs, and images |
241 | | Spreadsheet | `.csv`, `.xls`, `.xlsx` | ❌ | Converts each row to JSON format, including row index for each |
242 | | Plaintext | `.txt`, `.md`, `.rtf`, etc. | ❌ | Simple text extraction |
243 | | Image | `.jpg`, `.jpeg`, `.png` | ✔️ | Uses VLM for OCR in text-only mode |
244 | | ZIP File | `.zip` | ✔️ | Extracts and processes contained files |
245 | | Directory | any `path/to/folder` | ✔️ | Recursively processes all files in directory. Optionally use `inclusion_pattern` to pass regex strings for file inclusion rules. |
246 | | YouTube Video (known issues) | YouTube video URLs starting with `https://youtube.com` or `https://www.youtube.com` | ✔️ | Uses pytube for video download and Whisper for transcription. For consistent extraction, you may need to modify your `pytube` installation to send a valid user-agent header (see [this issue](https://github.com/pytube/pytube/issues/399)). |
247 | | Tweet | URLs starting with `https://twitter.com` or `https://x.com` | ✔️ | Uses unofficial API, may break unexpectedly |
248 | | GitHub Repository | GitHub repo URLs starting with `https://github.com` or `https://www.github.com` | ✔️ | Requires `GITHUB_TOKEN` environment variable |
249 |
250 | ## Configuration & Environment
251 |
252 | Set these environment variables to control API keys, hosting, and model defaults:
253 |
254 | ```bash
255 | # If you want longer-term image storage and hosting (saves to ./images and serves via HOST_URL)
256 | export HOST_IMAGES=true
257 |
258 | # GitHub token, used as the credential for scraping public/private repos via `scrape_url`
259 | export GITHUB_TOKEN=ghp_...
260 |
261 | # Control scraping defaults
262 | export DEFAULT_AI_MODEL=gpt-4o
263 | export DEFAULT_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
264 |
265 | # Filesize limit for scraped webpages, in MB (applies to URL scraping only)
266 | export FILESIZE_LIMIT_MB=50
267 |
268 | # Max duration (in seconds) for audio transcription
269 | export MAX_WHISPER_DURATION=600
274 | ``` 275 | 276 | ## CLI Usage 277 | 278 | `thepipe [options]` 279 | 280 | ### AI scraping options 281 | 282 | `--openai-api-key=KEY` To enable VLM scraping, pass in your OpenAI API key 283 | 284 | `--openai-model=MODEL` Model to use for scraping (default is `DEFAULT_AI_MODEL`, currently `gpt-4o`) 285 | 286 | `--openai-base-url=URL` Custom LLM endpoint, for local LLMs or hosted APIs like OpenRouter (default: https://api.openai.com/v1) 287 | 288 | `--ai_extraction` ⚠️ DEPRECATED; will get API key from `OPENAI_API_KEY` environment variable 289 | 290 | ### General scraping options 291 | 292 | `--text_only` Output text only (suppress images) 293 | 294 | `--inclusion_pattern=REGEX` Include only files whose \_full path\* matches REGEX (for dirs/zips) 295 | 296 | `--verbose` Print detailed progress messages 297 | 298 | ## Contributing 299 | 300 | This package is quite opinionated in its design and implementation. Some modules are tightly coupled to the overall architecture, while others are designed to be hacked. 301 | 302 | Before contributing, please create an issue on GitHub to discuss your ideas and how to best implement them. Pull requests that do not follow this process will be closed. 303 | -------------------------------------------------------------------------------- /thepipe/scraper.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast 2 | import base64 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from collections import OrderedDict 5 | from io import BytesIO, StringIO 6 | import math 7 | import re 8 | import fnmatch 9 | import os 10 | import tempfile 11 | from urllib.parse import urlparse 12 | import zipfile 13 | from PIL import Image 14 | import requests 15 | import json 16 | from .core import ( 17 | HOST_IMAGES, 18 | Chunk, 19 | make_image_url, 20 | DEFAULT_AI_MODEL, 21 | ) 22 | from .chunker import ( 23 | chunk_by_page, 24 | chunk_by_document, 25 | chunk_by_section, 26 | chunk_semantic, 27 | chunk_by_keywords, 28 | chunk_by_length, 29 | chunk_agentic, 30 | ) 31 | import tempfile 32 | import mimetypes 33 | import dotenv 34 | from magika import Magika 35 | import markdownify 36 | import fitz 37 | from openai import OpenAI 38 | from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam 39 | 40 | dotenv.load_dotenv() 41 | 42 | FOLDERS_TO_IGNORE = { 43 | "*node_modules*", 44 | "*.git*", 45 | "*venv*", 46 | "*.vscode*", 47 | "*pycache*", 48 | "*.ipynb_checkpoints", 49 | } 50 | FILES_TO_IGNORE = { 51 | ".gitignore", 52 | "*.bin", 53 | # Python compiled files 54 | "*.pyc", 55 | "*.pyo", 56 | "*.pyd", 57 | # Shared libraries and binaries 58 | "*.so", 59 | "*.dll", 60 | "*.exe", 61 | # Archives and packages 62 | "*.tar", 63 | "*.tar.gz", 64 | "*.egg-info", 65 | "package-lock.json", 66 | "package.json", 67 | # Lock, log, and metadata files 68 | "*.lock", 69 | "*.log", 70 | "Pipfile.lock", 71 | "requirements.lock", 72 | "*.exe", 73 | "*.dll", 74 | ".DS_Store", 75 | "Thumbs.db", 76 | } 77 | GITHUB_TOKEN: Optional[str] = os.getenv("GITHUB_TOKEN", None) 78 | USER_AGENT_STRING: str = os.getenv( 79 | "USER_AGENT_STRING", 80 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3", 81 | ) 82 | MAX_WHISPER_DURATION = int(os.getenv("MAX_WHISPER_DURATION", 600)) # 10 minutes 83 | 84 | TWITTER_DOMAINS = { 85 | "https://twitter.com", 86 | "https://www.twitter.com", 87 
| "https://x.com", 88 | "https://www.x.com", 89 | } 90 | YOUTUBE_DOMAINS = {"https://www.youtube.com", "https://youtube.com"} 91 | GITHUB_DOMAINS = {"https://github.com", "https://www.github.com"} 92 | SCRAPING_PROMPT = os.getenv( 93 | "SCRAPING_PROMPT", 94 | """A document is given. Please output the entire extracted contents from the document in detailed markdown format. 95 | Your accuracy is very important. Please be careful to not miss any content from the document. 96 | Be sure to correctly output a comprehensive format markdown for all the document contents (including, but not limited to, headers, paragraphs, lists, tables, menus, equations, full text contents, titles, subtitles, appendices, page breaks, columns, footers, page numbers, watermarks, footnotes, captions, annotations, images, figures, charts, shapes, form fields, content controls, signatures, etc.) 97 | Always reply immediately with only markdown. 98 | Do not give the markdown in a code block. Simply output the raw markdown immediately. 99 | Do not output anything else.""", 100 | ) 101 | FILESIZE_LIMIT_MB = int(os.getenv("FILESIZE_LIMIT_MB", 50)) # for url scraping only 102 | 103 | 104 | def _load_whisper(): 105 | try: 106 | import whisper 107 | except ImportError as exc: # pragma: no cover - optional dependency 108 | raise ImportError( 109 | "Audio and video transcription requires the optional dependency `openai-whisper`. " 110 | "Install it with `pip install thepipe-api[audio]` or include the `gpu` extra." 111 | ) from exc 112 | 113 | return whisper 114 | 115 | 116 | def detect_source_mimetype(source: str) -> str: 117 | # try to detect the file type by its extension 118 | _, extension = os.path.splitext(source) 119 | if extension: 120 | if extension == ".ipynb": 121 | # special case for notebooks, mimetypes is not familiar 122 | return "application/x-ipynb+json" 123 | guessed_mimetype, _ = mimetypes.guess_type(source) 124 | if guessed_mimetype: 125 | return guessed_mimetype 126 | # if that fails, try AI detection with Magika 127 | magika = Magika() 128 | with open(source, "rb") as file: 129 | result = magika.identify_bytes(file.read()) 130 | mimetype = result.output.mime_type 131 | return mimetype 132 | 133 | 134 | def scrape_file( 135 | filepath: str, 136 | verbose: bool = False, 137 | chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page, 138 | openai_client: Optional[OpenAI] = None, 139 | model: str = DEFAULT_AI_MODEL, 140 | include_input_images: bool = True, 141 | include_output_images: bool = True, 142 | ) -> List[Chunk]: 143 | """ 144 | Scrapes a file and returns a list of Chunk objects containing the text and images extracted from the file. 145 | 146 | Parameters 147 | ---------- 148 | filepath : str 149 | The path to the file to scrape. 150 | verbose : bool, optional 151 | If ``True``, prints verbose output. 152 | chunking_method : Callable, optional 153 | A function to chunk the scraped content. Defaults to chunk_by_page. 154 | openai_client : OpenAI, optional 155 | An OpenAI client instance for LLM processing. If provided, uses VLM to scrape PDFs. 156 | model : str, optional 157 | The LLM model name to use for processing. Defaults to DEFAULT_AI_MODEL. 158 | include_input_images : bool, optional 159 | If ``True``, includes input images in the messages sent to the LLM. 160 | include_output_images : bool, optional 161 | If ``True``, includes output images in the returned chunks. 162 | Returns 163 | ------- 164 | List[Chunk] 165 | A list of Chunk objects containing the scraped content. 
166 | """ 167 | # returns chunks of scraped content from any source (file, URL, etc.) 168 | scraped_chunks = [] 169 | source_mimetype = detect_source_mimetype(filepath) 170 | if source_mimetype is None: 171 | if verbose: 172 | print(f"[thepipe] Unsupported source type: {filepath}") 173 | return scraped_chunks 174 | if verbose: 175 | print(f"[thepipe] Scraping {source_mimetype}: {filepath}...") 176 | if source_mimetype == "application/pdf": 177 | scraped_chunks = scrape_pdf( 178 | file_path=filepath, 179 | verbose=verbose, 180 | model=model, 181 | openai_client=openai_client, 182 | include_input_images=include_input_images, 183 | include_output_images=include_output_images, 184 | ) 185 | elif ( 186 | source_mimetype 187 | == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 188 | ): 189 | scraped_chunks = scrape_docx( 190 | file_path=filepath, 191 | verbose=verbose, 192 | include_output_images=include_output_images, 193 | ) 194 | elif ( 195 | source_mimetype 196 | == "application/vnd.openxmlformats-officedocument.presentationml.presentation" 197 | ): 198 | scraped_chunks = scrape_pptx( 199 | file_path=filepath, 200 | verbose=verbose, 201 | include_output_images=include_output_images, 202 | ) 203 | elif source_mimetype.startswith("image/"): 204 | scraped_chunks = scrape_image(file_path=filepath) 205 | elif ( 206 | source_mimetype.startswith("application/vnd.ms-excel") 207 | or source_mimetype 208 | == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" 209 | ): 210 | scraped_chunks = scrape_spreadsheet( 211 | file_path=filepath, source_type=source_mimetype 212 | ) 213 | elif source_mimetype == "application/x-ipynb+json": 214 | scraped_chunks = scrape_ipynb( 215 | file_path=filepath, 216 | verbose=verbose, 217 | include_output_images=include_output_images, 218 | ) 219 | elif ( 220 | source_mimetype == "application/zip" 221 | or source_mimetype == "application/x-zip-compressed" 222 | ): 223 | scraped_chunks = scrape_zip( 224 | file_path=filepath, 225 | verbose=verbose, 226 | openai_client=openai_client, 227 | include_input_images=include_input_images, 228 | include_output_images=include_output_images, 229 | ) 230 | elif source_mimetype.startswith("video/"): 231 | scraped_chunks = scrape_video( 232 | file_path=filepath, 233 | verbose=verbose, 234 | include_output_images=include_output_images, 235 | ) 236 | elif source_mimetype.startswith("audio/"): 237 | scraped_chunks = scrape_audio(file_path=filepath, verbose=verbose) 238 | elif source_mimetype.startswith("text/html"): 239 | scraped_chunks = scrape_html( 240 | file_path=filepath, 241 | verbose=verbose, 242 | include_output_images=include_output_images, 243 | ) 244 | elif source_mimetype.startswith("text/"): 245 | scraped_chunks = scrape_plaintext(file_path=filepath) 246 | else: 247 | try: 248 | scraped_chunks = scrape_plaintext(file_path=filepath) 249 | except Exception as e: 250 | if verbose: 251 | print(f"[thepipe] Error extracting from {filepath}: {e}") 252 | if verbose: 253 | if scraped_chunks: 254 | print(f"[thepipe] Extracted from {filepath}") 255 | else: 256 | print(f"[thepipe] No content extracted from {filepath}") 257 | if chunking_method: 258 | scraped_chunks = chunking_method(scraped_chunks) 259 | return scraped_chunks 260 | 261 | 262 | def scrape_html( 263 | file_path: str, 264 | verbose: bool = False, 265 | include_output_images: bool = True, 266 | ) -> List[Chunk]: 267 | with open(file_path, "r", encoding="utf-8", errors="ignore") as file: 268 | html_content = file.read() 269 | 
markdown_content = markdownify.markdownify(html_content, heading_style="ATX") 270 | images = get_images_from_markdown(html_content) if include_output_images else [] 271 | return [Chunk(path=file_path, text=markdown_content, images=images)] 272 | 273 | 274 | def scrape_plaintext(file_path: str) -> List[Chunk]: 275 | with open(file_path, "r", encoding="utf-8", errors="ignore") as file: 276 | text = file.read() 277 | return [Chunk(path=file_path, text=text)] 278 | 279 | 280 | def scrape_directory( 281 | dir_path: str, 282 | inclusion_pattern: Optional[str] = None, 283 | verbose: bool = False, 284 | openai_client: Optional[OpenAI] = None, 285 | model: str = DEFAULT_AI_MODEL, 286 | include_input_images: bool = True, 287 | include_output_images: bool = True, 288 | ) -> List[Chunk]: 289 | """ 290 | inclusion_pattern: Optional regex string; only files whose path matches this pattern will be scraped. 291 | By default, ignores all files in baked-in constants FOLDERS_TO_IGNORE and FILES_TO_IGNORE. 292 | """ 293 | # compile the include pattern once 294 | pattern = re.compile(inclusion_pattern) if inclusion_pattern else None 295 | extraction: List[Chunk] = [] 296 | 297 | try: 298 | for entry in os.scandir(dir_path): 299 | path = entry.path 300 | 301 | # skip ignored directories 302 | if entry.is_dir() and any( 303 | fnmatch.fnmatch(entry.name, pat) for pat in FOLDERS_TO_IGNORE 304 | ): 305 | if verbose: 306 | print(f"[thepipe] Skipping ignored directory: {path}") 307 | continue 308 | 309 | # skip ignored files 310 | if entry.is_file() and any( 311 | fnmatch.fnmatch(entry.name, pat) for pat in FILES_TO_IGNORE 312 | ): 313 | if verbose: 314 | print(f"[thepipe] Skipping ignored file: {path}") 315 | continue 316 | 317 | if entry.is_file(): 318 | # if include_pattern is set, skip files that don't match 319 | if pattern and not pattern.search(path): 320 | if verbose: 321 | print(f"[thepipe] Skipping non-matching file: {path}") 322 | continue 323 | 324 | if verbose: 325 | print(f"[thepipe] Scraping file: {path}") 326 | extraction += scrape_file( 327 | filepath=path, 328 | verbose=verbose, 329 | openai_client=openai_client, 330 | model=model, 331 | include_input_images=include_input_images, 332 | include_output_images=include_output_images, 333 | ) 334 | 335 | elif entry.is_dir(): 336 | # recurse into subdirectory 337 | if verbose: 338 | print(f"[thepipe] Entering directory: {path}") 339 | extraction += scrape_directory( 340 | dir_path=path, 341 | inclusion_pattern=inclusion_pattern, 342 | verbose=verbose, 343 | openai_client=openai_client, 344 | model=model, 345 | include_input_images=include_input_images, 346 | include_output_images=include_output_images, 347 | ) 348 | except PermissionError as e: 349 | if verbose: 350 | print(f"[thepipe] Skipping {dir_path} (permission denied): {e}") 351 | 352 | return extraction 353 | 354 | 355 | def scrape_zip( 356 | file_path: str, 357 | inclusion_pattern: Optional[str] = None, 358 | verbose: bool = False, 359 | openai_client: Optional[OpenAI] = None, 360 | include_input_images: bool = True, 361 | include_output_images: bool = True, 362 | ) -> List[Chunk]: 363 | chunks = [] 364 | with tempfile.TemporaryDirectory() as temp_dir: 365 | with zipfile.ZipFile(file_path, "r") as zip_ref: 366 | zip_ref.extractall(temp_dir) 367 | chunks = scrape_directory( 368 | dir_path=temp_dir, 369 | inclusion_pattern=inclusion_pattern, 370 | verbose=verbose, 371 | openai_client=openai_client, 372 | include_input_images=include_input_images, 373 | include_output_images=include_output_images, 
374 | ) 375 | return chunks 376 | 377 | 378 | def scrape_pdf( 379 | file_path: str, 380 | openai_client: Optional[OpenAI] = None, 381 | model: str = DEFAULT_AI_MODEL, 382 | verbose: Optional[bool] = False, 383 | include_input_images: bool = True, 384 | include_output_images: bool = True, 385 | image_scale: float = 1.0, 386 | ) -> List[Chunk]: 387 | chunks: List[Chunk] = [] 388 | 389 | # Branch 1 – VLM path (OpenAI client supplied) 390 | if openai_client is not None: 391 | with open(file_path, "rb") as fp: 392 | pdf_bytes = fp.read() 393 | doc = fitz.open(stream=pdf_bytes, filetype="pdf") 394 | num_pages = len(doc) 395 | 396 | if verbose: 397 | print( 398 | f"[thepipe] Scraping PDF: {file_path} " 399 | f"({num_pages} pages) with model {model}" 400 | ) 401 | 402 | # Inner worker – processes one page 403 | def _process_page(page_num: int) -> Tuple[int, str, Optional[Image.Image]]: 404 | page = doc[page_num] 405 | text = page.get_text() # type: ignore[attr-defined] 406 | 407 | # Build message for the LLM 408 | msg_content: List[Dict[str, Union[Dict[str, str], str]]] = [ 409 | { 410 | "type": "text", 411 | "text": f"```\n{text}\n```\n{SCRAPING_PROMPT}", 412 | } 413 | ] 414 | 415 | image: Optional[Image.Image] = None 416 | if include_input_images or include_output_images: 417 | mat = fitz.Matrix(image_scale, image_scale) 418 | pix = page.get_pixmap(matrix=mat, alpha=False) # type: ignore[attr-defined] # noqa: E501 419 | image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) 420 | 421 | if include_input_images: 422 | encoded = make_image_url(image, host_images=HOST_IMAGES) 423 | msg_content.append( 424 | { 425 | "type": "image_url", 426 | "image_url": {"url": encoded, "detail": "high"}, 427 | } 428 | ) 429 | 430 | messages = cast( 431 | Iterable[ChatCompletionMessageParam], 432 | [{"role": "user", "content": msg_content}], 433 | ) 434 | 435 | response = openai_client.chat.completions.create( 436 | model=model, messages=messages 437 | ) 438 | 439 | llm_response = response.choices[0].message.content 440 | if not llm_response: 441 | raise RuntimeError("Empty LLM response.") 442 | 443 | llm_response = llm_response.strip() 444 | if llm_response.startswith("```markdown"): 445 | llm_response = llm_response[len("```markdown") :] 446 | elif llm_response.startswith("```"): 447 | llm_response = llm_response[len("```") :] 448 | if llm_response.endswith("```"): 449 | llm_response = llm_response[: -len("```")] 450 | 451 | return ( 452 | page_num, 453 | llm_response, 454 | image if include_output_images else None, 455 | ) 456 | 457 | # Parallel extraction 458 | max_workers = (os.cpu_count() or 1) * 2 459 | if verbose: 460 | print(f"[thepipe] Using {max_workers} threads for PDF extraction") 461 | 462 | page_results: OrderedDict[int, Tuple[str, Optional[Image.Image]]] = ( 463 | OrderedDict() 464 | ) 465 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 466 | futures = [executor.submit(_process_page, p) for p in range(num_pages)] 467 | for fut in as_completed(futures): 468 | pg, txt, img = fut.result() 469 | page_results[pg] = (txt, img) 470 | 471 | for pg in sorted(page_results): 472 | txt, img = page_results[pg] 473 | chunks.append(Chunk(path=file_path, text=txt, images=[img] if img else [])) 474 | 475 | return chunks 476 | 477 | # Branch 2 – no OpenAI client – text-only offline mode 478 | from pymupdf4llm.helpers.pymupdf_rag import to_markdown # local import 479 | 480 | doc = fitz.open(file_path) 481 | md_pages = cast(List[Dict[str, Any]], to_markdown(file_path, page_chunks=True)) 
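    # Note (illustrative comment, not in the original source): with
    # page_chunks=True, pymupdf4llm's to_markdown returns one dict per page;
    # each dict's "text" entry holds that page's markdown, which is consumed
    # as md_pages[i]["text"] in the loop below and optionally paired with a
    # rendered page image.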
482 | 483 | for i in range(doc.page_count): 484 | text = re.sub(r"\n{3,}", "\n\n", md_pages[i]["text"]).strip() 485 | 486 | images: List[Image.Image] = [] 487 | if include_output_images: 488 | mat = fitz.Matrix(image_scale, image_scale) 489 | pix = doc[i].get_pixmap(matrix=mat, alpha=False) # type: ignore[attr-defined] # noqa: E501 490 | images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples)) 491 | 492 | chunks.append(Chunk(path=file_path, text=text, images=images)) 493 | 494 | doc.close() 495 | return chunks 496 | 497 | 498 | def get_images_from_markdown(text: str) -> List[Image.Image]: 499 | image_urls = re.findall(r"!\[.*?\]\((.*?)\)", text) 500 | images = [] 501 | for url in image_urls: 502 | extension = os.path.splitext(urlparse(url).path)[1] 503 | if extension not in {".jpg", ".jpeg", ".png"}: 504 | # ignore incompatible image extractions 505 | continue 506 | 507 | try: 508 | response = requests.get( 509 | url, 510 | timeout=10, 511 | headers={"User-Agent": USER_AGENT_STRING}, 512 | ) 513 | response.raise_for_status() 514 | except Exception: 515 | continue 516 | 517 | img = Image.open(BytesIO(response.content)) 518 | images.append(img) 519 | return images 520 | 521 | 522 | def scrape_image(file_path: str) -> List[Chunk]: 523 | img = Image.open(file_path) 524 | img.load() # needed to close the file 525 | chunk = Chunk(path=file_path, images=[img]) 526 | return [chunk] 527 | 528 | 529 | def scrape_spreadsheet(file_path: str, source_type: str) -> List[Chunk]: 530 | import pandas as pd 531 | 532 | if source_type == "application/vnd.ms-excel": 533 | df = pd.read_csv(file_path) 534 | elif ( 535 | source_type 536 | == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" 537 | ): 538 | df = pd.read_excel(file_path) 539 | else: 540 | raise ValueError("Unsupported file format") 541 | dicts = df.to_dict(orient="records") 542 | chunks = [] 543 | for i, item in enumerate(dicts): 544 | # format each row as json along with the row index 545 | item["row index"] = i 546 | item_json = json.dumps(item, indent=4) 547 | chunks.append(Chunk(path=file_path, text=item_json)) 548 | return chunks 549 | 550 | 551 | def parse_webpage_with_vlm( 552 | url: str, 553 | model: str = DEFAULT_AI_MODEL, 554 | verbose: Optional[bool] = False, 555 | openai_client: Optional[OpenAI] = None, 556 | include_output_images: bool = True, 557 | ) -> Chunk: 558 | if openai_client is None: 559 | raise ValueError("parse_webpage_with_vlm requires an openai_client argument.") 560 | from playwright.sync_api import sync_playwright 561 | 562 | with sync_playwright() as p: 563 | browser = p.chromium.launch() 564 | context = browser.new_context(user_agent=USER_AGENT_STRING) 565 | page = context.new_page() 566 | page.goto(url, wait_until="domcontentloaded") 567 | if not page.viewport_size: 568 | page.set_viewport_size({"width": 800, "height": 600}) 569 | if not page.viewport_size: 570 | raise ValueError( 571 | "Failed to set viewport size after finding no viewport size" 572 | ) 573 | viewport_height = page.viewport_size.get("height", 800) 574 | total_height = page.evaluate("document.body.scrollHeight") 575 | current_scroll_position = 0 576 | scrolldowns, max_scrolldowns = 0, 3 577 | images: List[Image.Image] = [] 578 | 579 | while current_scroll_position < total_height and scrolldowns < max_scrolldowns: 580 | page.wait_for_timeout(200) # wait for content to load 581 | screenshot = page.screenshot(full_page=False) 582 | img = Image.open(BytesIO(screenshot)) 583 | images.append(img) 584 | 585 | 
current_scroll_position += viewport_height 586 | page.evaluate(f"window.scrollTo(0, {current_scroll_position})") 587 | scrolldowns += 1 588 | total_height = page.evaluate("document.body.scrollHeight") 589 | if verbose: 590 | print( 591 | f"[thepipe] Scrolled to {current_scroll_position} of {total_height}. Waiting for content to load..." 592 | ) 593 | 594 | browser.close() 595 | 596 | if images: 597 | # Vertically stack the images 598 | total_height = sum(img.height for img in images) 599 | max_width = max(img.width for img in images) 600 | stacked_image = Image.new("RGB", (max_width, total_height)) 601 | y_offset = 0 602 | for img in images: 603 | stacked_image.paste(img, (0, y_offset)) 604 | y_offset += img.height 605 | 606 | # Process the stacked image with VLM 607 | messages = [ 608 | { 609 | "role": "user", 610 | "content": [ 611 | { 612 | "type": "image_url", 613 | "image_url": { 614 | "url": make_image_url( 615 | stacked_image, host_images=HOST_IMAGES 616 | ), 617 | "detail": "high", 618 | }, 619 | }, 620 | {"type": "text", "text": SCRAPING_PROMPT}, 621 | ], 622 | }, 623 | ] 624 | response = openai_client.chat.completions.create( 625 | model=model, 626 | messages=cast(Iterable[ChatCompletionMessageParam], messages), 627 | ) 628 | llm_response = response.choices[0].message.content 629 | if not llm_response: 630 | raise Exception( 631 | f"Failed to receive a message content from LLM Response: {response}" 632 | ) 633 | if verbose: 634 | print(f"[thepipe] LLM response: {llm_response}") 635 | chunk = Chunk( 636 | path=url, 637 | text=llm_response, 638 | images=[stacked_image] if include_output_images else [], 639 | ) 640 | else: 641 | raise ValueError("Model received 0 images from webpage") 642 | 643 | return chunk 644 | 645 | 646 | def extract_page_content( 647 | url: str, verbose: bool = False, include_output_images: bool = True 648 | ) -> Chunk: 649 | from bs4 import BeautifulSoup 650 | from playwright.sync_api import sync_playwright 651 | import base64 652 | import requests 653 | 654 | texts: List[str] = [] 655 | images: List[Image.Image] = [] 656 | 657 | with sync_playwright() as p: 658 | browser = p.chromium.launch() 659 | context = browser.new_context(user_agent=USER_AGENT_STRING) 660 | page = context.new_page() 661 | 662 | try: 663 | page.goto(url, wait_until="domcontentloaded", timeout=10000) 664 | 665 | # Wait for content to load 666 | page.wait_for_timeout(1000) 667 | 668 | # Scroll to load dynamic content 669 | if not page.viewport_size: 670 | page.set_viewport_size({"width": 1200, "height": 800}) 671 | 672 | viewport_height = page.viewport_size["height"] 673 | total_height = page.evaluate("document.body.scrollHeight") 674 | current_scroll_position = 0 675 | scrolldowns, max_scrolldowns = 0, 5 676 | 677 | while ( 678 | current_scroll_position < total_height and scrolldowns < max_scrolldowns 679 | ): 680 | page.wait_for_timeout(500) 681 | current_scroll_position += viewport_height 682 | page.evaluate(f"window.scrollTo(0, {current_scroll_position})") 683 | scrolldowns += 1 684 | new_height = page.evaluate("document.body.scrollHeight") 685 | if new_height == total_height: 686 | break 687 | total_height = new_height 688 | 689 | # Extract HTML content 690 | html_content = page.content() 691 | 692 | # Parse with BeautifulSoup and clean up 693 | soup = BeautifulSoup(html_content, "html.parser") 694 | 695 | # Remove script and style elements 696 | for script in soup(["script", "style", "nav", "footer", "header"]): 697 | script.decompose() 698 | 699 | # Convert to markdown 700 | 
markdown_content = markdownify.markdownify(str(soup), heading_style="ATX") 701 | 702 | # Clean up markdown 703 | markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content) 704 | markdown_content = markdown_content.strip() 705 | 706 | if verbose: 707 | print( 708 | f"[thepipe] Extracted {len(markdown_content)} characters from {url}" 709 | ) 710 | 711 | texts.append(markdown_content) 712 | 713 | # Extract images from the page using heuristics 714 | if include_output_images: 715 | for img in page.query_selector_all("img"): 716 | img_path = img.get_attribute("src") 717 | if not img_path: 718 | continue 719 | if img_path.startswith("data:image"): 720 | # Save base64 image to PIL Image 721 | try: 722 | decoded_data = base64.b64decode(img_path.split(",")[1]) 723 | image = Image.open(BytesIO(decoded_data)) 724 | images.append(image) 725 | except Exception as e: 726 | if verbose: 727 | print( 728 | f"[thepipe] Ignoring error loading base64 image: {e}" 729 | ) 730 | continue 731 | else: 732 | try: 733 | # Try direct URL first 734 | response = requests.get( 735 | img_path, 736 | timeout=10, 737 | headers={"User-Agent": USER_AGENT_STRING}, 738 | ) 739 | response.raise_for_status() 740 | image = Image.open(BytesIO(response.content)) 741 | images.append(image) 742 | except Exception as e: 743 | if verbose: 744 | print(f"[thepipe] Error loading image {img_path}: {e}") 745 | print("[thepipe] Attempting to load path with schema.") 746 | 747 | # Try with schema if path is relative 748 | if not img_path.startswith(("http://", "https://")): 749 | try: 750 | # Remove leading slashes 751 | while img_path.startswith("/"): 752 | img_path = img_path[1:] 753 | 754 | # Try with just the scheme 755 | parsed_url = urlparse(url) 756 | path_with_schema = ( 757 | f"{parsed_url.scheme}://{img_path}" 758 | ) 759 | response = requests.get( 760 | path_with_schema, 761 | timeout=10, 762 | headers={"User-Agent": USER_AGENT_STRING}, 763 | ) 764 | response.raise_for_status() 765 | image = Image.open(BytesIO(response.content)) 766 | images.append(image) 767 | except Exception as e: 768 | if verbose: 769 | print( 770 | f"[thepipe] Error loading image {img_path} with schema: {e}" 771 | ) 772 | print( 773 | "[thepipe] Attempting to load with schema and netloc." 
774 | ) 775 | 776 | try: 777 | # Try with scheme and netloc 778 | path_with_schema_and_netloc = f"{parsed_url.scheme}://{parsed_url.netloc}/{img_path}" 779 | response = requests.get( 780 | path_with_schema_and_netloc, 781 | timeout=10, 782 | headers={"User-Agent": USER_AGENT_STRING}, 783 | ) 784 | response.raise_for_status() 785 | image = Image.open(BytesIO(response.content)) 786 | images.append(image) 787 | except Exception as e: 788 | if verbose: 789 | print( 790 | f"[thepipe] Final attempt failed for image {img_path}: {e}" 791 | ) 792 | continue 793 | else: 794 | if verbose: 795 | print( 796 | f"[thepipe] Skipping image {img_path} - all attempts failed" 797 | ) 798 | continue 799 | 800 | except Exception as e: 801 | if verbose: 802 | print(f"[thepipe] Error scraping {url}: {e}") 803 | # Fallback to simple requests 804 | try: 805 | response = requests.get( 806 | url, headers={"User-Agent": USER_AGENT_STRING}, timeout=30 807 | ) 808 | response.raise_for_status() 809 | soup = BeautifulSoup(response.content, "html.parser") 810 | 811 | # Remove unwanted elements 812 | for script in soup(["script", "style", "nav", "footer", "header"]): 813 | script.decompose() 814 | 815 | markdown_content = markdownify.markdownify( 816 | str(soup), heading_style="ATX" 817 | ) 818 | markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content).strip() 819 | texts.append(markdown_content) 820 | 821 | if verbose: 822 | print( 823 | f"[thepipe] Fallback extraction got {len(markdown_content)} characters" 824 | ) 825 | 826 | except Exception as fallback_e: 827 | if verbose: 828 | print(f"[thepipe] Fallback also failed: {fallback_e}") 829 | texts.append("") 830 | 831 | finally: 832 | browser.close() 833 | 834 | text = "\n".join(texts).strip() 835 | return Chunk(path=url, text=text, images=images) 836 | 837 | 838 | def scrape_url( 839 | url: str, 840 | verbose: bool = False, 841 | chunking_method: Callable[[List[Chunk]], List[Chunk]] = chunk_by_page, 842 | openai_client: Optional[OpenAI] = None, 843 | model: str = DEFAULT_AI_MODEL, 844 | include_input_images: bool = True, 845 | include_output_images: bool = True, 846 | ) -> List[Chunk]: 847 | if any(url.startswith(domain) for domain in TWITTER_DOMAINS): 848 | extraction = scrape_tweet(url=url, include_output_images=include_output_images) 849 | return extraction 850 | elif any(url.startswith(domain) for domain in YOUTUBE_DOMAINS): 851 | extraction = scrape_youtube( 852 | youtube_url=url, 853 | verbose=verbose, 854 | include_output_images=include_output_images, 855 | ) 856 | return extraction 857 | elif any(url.startswith(domain) for domain in GITHUB_DOMAINS): 858 | extraction = scrape_github( 859 | github_url=url, 860 | verbose=verbose, 861 | openai_client=openai_client, 862 | model=model, 863 | include_input_images=include_input_images, 864 | include_output_images=include_output_images, 865 | ) 866 | return extraction 867 | _, extension = os.path.splitext(urlparse(url).path) 868 | if extension and extension not in {".html", ".htm", ".php", ".asp", ".aspx"}: 869 | # if url leads to a file, attempt to download it and scrape it 870 | with tempfile.TemporaryDirectory() as temp_dir: 871 | file_path = os.path.join(temp_dir, os.path.basename(url)) 872 | response = requests.get(url) 873 | # verify the ingress/egress with be within limits, if there are any set 874 | response_length = int(response.headers.get("Content-Length", 0)) 875 | if FILESIZE_LIMIT_MB and response_length > FILESIZE_LIMIT_MB * 1024 * 1024: 876 | raise ValueError(f"File size exceeds {FILESIZE_LIMIT_MB} MB 
limit.") 877 | with open(file_path, "wb") as file: 878 | file.write(response.content) 879 | chunks = scrape_file( 880 | filepath=file_path, 881 | verbose=verbose, 882 | chunking_method=chunking_method, 883 | openai_client=openai_client, 884 | model=model, 885 | include_input_images=include_input_images, 886 | include_output_images=include_output_images, 887 | ) 888 | return chunks 889 | else: 890 | # if url leads to web content, scrape it directly 891 | if openai_client and include_input_images: 892 | chunk = parse_webpage_with_vlm( 893 | url=url, 894 | verbose=verbose, 895 | model=model, 896 | openai_client=openai_client, 897 | include_output_images=include_output_images, 898 | ) 899 | else: 900 | chunk = extract_page_content( 901 | url=url, verbose=verbose, include_output_images=include_output_images 902 | ) 903 | chunks = chunking_method([chunk]) 904 | # if no text or images were extracted, return error 905 | if not any(chunk.text for chunk in chunks) and not any( 906 | chunk.images for chunk in chunks 907 | ): 908 | raise ValueError("No content extracted from URL.") 909 | return chunks 910 | 911 | 912 | def format_timestamp(seconds, chunk_index, chunk_duration): 913 | # helper function to format the timestamp. 914 | total_seconds = chunk_index * chunk_duration + seconds 915 | hours = int(total_seconds // 3600) 916 | minutes = int((total_seconds % 3600) // 60) 917 | seconds = total_seconds % 60 918 | milliseconds = int((seconds - int(seconds)) * 1000) 919 | return f"{hours:02}:{minutes:02}:{int(seconds):02}.{milliseconds:03}" 920 | 921 | 922 | def scrape_video( 923 | file_path: str, 924 | verbose: bool = False, 925 | include_output_images: bool = True, 926 | ) -> List[Chunk]: 927 | whisper = _load_whisper() 928 | from moviepy.editor import VideoFileClip 929 | 930 | # Splits the video into chunks of length MAX_WHISPER_DURATION, extracts 931 | # one representative frame from the start of each chunk, and then transcribes 932 | # that chunk. 
933 | model = whisper.load_model("base") 934 | video = VideoFileClip(file_path) 935 | num_chunks = math.ceil(video.duration / MAX_WHISPER_DURATION) 936 | chunks = [] 937 | 938 | try: 939 | for i in range(num_chunks): 940 | # Calculate the start and end time of the chunk 941 | start_time = i * MAX_WHISPER_DURATION 942 | end_time = start_time + MAX_WHISPER_DURATION 943 | if end_time > video.duration: 944 | end_time = video.duration 945 | 946 | # Extract a frame from the start of the chunk 947 | image = None 948 | if include_output_images: 949 | frame = video.get_frame(start_time) 950 | image = Image.fromarray(frame) 951 | 952 | # Save the audio to a temporary .wav file 953 | with tempfile.NamedTemporaryFile( 954 | suffix=".wav", delete=False 955 | ) as temp_audio_file: 956 | audio_path = temp_audio_file.name 957 | 958 | audio = video.subclip(start_time, end_time).audio # type: ignore[attr-defined] 959 | transcription = None 960 | 961 | if audio is not None: 962 | audio.write_audiofile(audio_path, codec="pcm_s16le") 963 | result = model.transcribe(audio=audio_path, verbose=verbose) 964 | 965 | # Format transcription with timestamps 966 | formatted_transcription = [] 967 | for segment in cast(List[Dict[str, Any]], result["segments"]): 968 | seg_start = format_timestamp( 969 | segment["start"], i, MAX_WHISPER_DURATION 970 | ) 971 | seg_end = format_timestamp(segment["end"], i, MAX_WHISPER_DURATION) 972 | formatted_transcription.append( 973 | f"[{seg_start} --> {seg_end}] {segment['text']}" 974 | ) 975 | 976 | transcription = "\n".join(formatted_transcription) 977 | os.remove(audio_path) 978 | 979 | # Only add chunks if there is either text or images 980 | if transcription or image: 981 | chunks.append( 982 | Chunk( 983 | path=file_path, 984 | text=transcription if transcription else None, 985 | images=[image] if image else [], 986 | ) 987 | ) 988 | finally: 989 | video.close() 990 | 991 | return chunks 992 | 993 | 994 | def scrape_youtube( 995 | youtube_url: str, 996 | verbose: bool = False, 997 | include_output_images: bool = True, 998 | ) -> List[Chunk]: 999 | from pytube import YouTube 1000 | 1001 | with tempfile.TemporaryDirectory() as temp_dir: 1002 | filename = "temp_video.mp4" 1003 | yt = YouTube(youtube_url) 1004 | stream = yt.streams.filter(progressive=True, file_extension="mp4").first() 1005 | if stream is None: 1006 | raise ValueError("No progressive stream for video found.") 1007 | stream.download(temp_dir, filename=filename) 1008 | video_path = os.path.join(temp_dir, filename) 1009 | chunks = scrape_video( 1010 | file_path=video_path, 1011 | verbose=verbose, 1012 | include_output_images=include_output_images, 1013 | ) 1014 | return chunks 1015 | 1016 | 1017 | def scrape_audio(file_path: str, verbose: bool = False) -> List[Chunk]: 1018 | whisper = _load_whisper() 1019 | 1020 | model = whisper.load_model("base") 1021 | result = model.transcribe(audio=file_path, verbose=verbose) 1022 | segments = cast(List[Dict[str, Any]], result.get("segments", [])) 1023 | 1024 | transcript: List[str] = [] 1025 | for segment in segments: 1026 | start = format_timestamp(segment["start"], 0, 0) 1027 | end = format_timestamp(segment["end"], 0, 0) 1028 | if segment["text"].strip(): 1029 | transcript.append(f"[{start} --> {end}] {segment['text']}") 1030 | # join the formatted transcription into a single string 1031 | transcript_text = "\n".join(transcript) 1032 | return [Chunk(path=file_path, text=transcript_text)] 1033 | 1034 | 1035 | def scrape_github( 1036 | github_url: str, 1037 | inclusion_pattern: 
Optional[str] = None, 1038 | branch: str = "main", 1039 | verbose: bool = False, 1040 | openai_client: Optional[OpenAI] = None, 1041 | model: str = DEFAULT_AI_MODEL, 1042 | include_input_images: bool = True, 1043 | include_output_images: bool = True, 1044 | ) -> List[Chunk]: 1045 | files_contents: List[Chunk] = [] 1046 | if not GITHUB_TOKEN: 1047 | raise ValueError("GITHUB_TOKEN environment variable is not set.") 1048 | # make new tempdir for cloned repo 1049 | with tempfile.TemporaryDirectory() as temp_dir: 1050 | # requires git 1051 | exit_code = os.system( 1052 | f'git clone --branch "{branch}" --single-branch {github_url} "{temp_dir}" --quiet' 1053 | ) 1054 | if exit_code != 0: 1055 | raise RuntimeError( 1056 | f"git clone failed for {github_url} at branch '{branch}'. " 1057 | "Verify the repository URL and branch name." 1058 | ) 1059 | files_contents = scrape_directory( 1060 | dir_path=temp_dir, 1061 | inclusion_pattern=inclusion_pattern, 1062 | verbose=verbose, 1063 | openai_client=openai_client, 1064 | model=model, 1065 | include_input_images=include_input_images, 1066 | include_output_images=include_output_images, 1067 | ) 1068 | return files_contents 1069 | 1070 | 1071 | def scrape_docx( 1072 | file_path: str, 1073 | verbose: bool = False, 1074 | include_output_images: bool = True, 1075 | ) -> List[Chunk]: 1076 | from docx import Document 1077 | from docx.oxml.table import CT_Tbl 1078 | from docx.oxml.text.paragraph import CT_P 1079 | from docx.table import Table, _Cell 1080 | from docx.text.paragraph import Paragraph 1081 | import csv 1082 | import io 1083 | 1084 | # helper function to iterate through blocks in the document 1085 | def iter_block_items(parent): 1086 | if parent.__class__.__name__ == "Document": 1087 | parent_elm = parent.element.body 1088 | elif parent.__class__.__name__ == "_Cell": 1089 | parent_elm = parent._tc 1090 | else: 1091 | raise ValueError("Unsupported parent type") 1092 | # iterate through each child element in the parent element 1093 | for child in parent_elm.iterchildren(): 1094 | child_elem_class_name = child.__class__.__name__ 1095 | if verbose: 1096 | print(f"[thepipe] Found element in docx: {child_elem_class_name}") 1097 | if child_elem_class_name == "CT_P": 1098 | yield Paragraph(child, parent) 1099 | elif child_elem_class_name == "CT_Tbl": 1100 | yield Table(child, parent) 1101 | 1102 | # helper function to read tables in the document 1103 | def read_docx_tables(tab): 1104 | vf = StringIO() 1105 | writer = csv.writer(vf) 1106 | for row in tab.rows: 1107 | writer.writerow(cell.text for cell in row.cells) 1108 | vf.seek(0) 1109 | return vf.getvalue() 1110 | 1111 | # read the document 1112 | document = Document(file_path) 1113 | chunks = [] 1114 | image_counter = 0 1115 | 1116 | # Define namespaces 1117 | nsmap = { 1118 | "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", 1119 | "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture", 1120 | "a": "http://schemas.openxmlformats.org/drawingml/2006/main", 1121 | } 1122 | 1123 | try: 1124 | # scrape each block in the document to create chunks 1125 | # A block can be a paragraph, table, or image 1126 | for block in iter_block_items(document): 1127 | block_texts = [] 1128 | block_images = [] 1129 | if isinstance(block, Paragraph): 1130 | block_texts.append(block.text) 1131 | # "runs" are the smallest units in a paragraph 1132 | for run in block.runs: 1133 | if "pic:pic" in run.element.xml and include_output_images: 1134 | # extract images from the paragraph 1135 | for pic in 
run.element.findall(".//pic:pic", nsmap): 1136 | cNvPr = pic.find(".//pic:cNvPr", nsmap) 1137 | name_attr = ( 1138 | cNvPr.get("name") 1139 | if cNvPr is not None 1140 | else f"image_{image_counter}" 1141 | ) 1142 | blip = pic.find(".//a:blip", nsmap) 1143 | if blip is not None: 1144 | embed_attr = blip.get( 1145 | "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" 1146 | ) 1147 | if embed_attr: 1148 | image_part = document.part.related_parts[embed_attr] 1149 | image_data = BytesIO(image_part._blob) 1150 | image = Image.open(image_data) 1151 | image.load() 1152 | block_images.append(image) 1153 | image_counter += 1 1154 | elif isinstance(block, Table): 1155 | table_text = read_docx_tables(block) 1156 | block_texts.append(table_text) 1157 | if block_texts or block_images: 1158 | block_text = "\n".join(block_texts).strip() 1159 | if block_text or block_images: 1160 | chunks.append( 1161 | Chunk(path=file_path, text=block_text, images=block_images) 1162 | ) 1163 | except Exception as e: 1164 | raise ValueError(f"Error processing DOCX file {file_path}: {e}") 1165 | return chunks 1166 | 1167 | 1168 | def scrape_pptx( 1169 | file_path: str, 1170 | verbose: bool = False, 1171 | include_output_images: bool = True, 1172 | ) -> List[Chunk]: 1173 | from pptx import Presentation 1174 | from pptx.enum.shapes import MSO_SHAPE_TYPE 1175 | from pptx.shapes.picture import Picture 1176 | from pptx.shapes.autoshape import Shape as AutoShape 1177 | 1178 | prs = Presentation(file_path) 1179 | chunks = [] 1180 | # iterate through each slide in the presentation 1181 | for slide in prs.slides: 1182 | slide_texts = [] 1183 | slide_images = [] 1184 | # iterate through each shape in the slide 1185 | for shape in slide.shapes: 1186 | if shape.has_text_frame: 1187 | auto_shape = cast(AutoShape, shape) 1188 | for paragraph in auto_shape.text_frame.paragraphs: 1189 | text = paragraph.text 1190 | if len(slide_texts) == 0: 1191 | text = "# " + text # header for first text of a slide 1192 | slide_texts.append(text) 1193 | # extract images from shapes 1194 | if include_output_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE: 1195 | pic = cast(Picture, shape) 1196 | image_data = pic.image.blob 1197 | image = Image.open(BytesIO(image_data)) 1198 | slide_images.append(image) 1199 | # add slide to chunks if it has text or images 1200 | if slide_texts or slide_images: 1201 | text = "\n".join(slide_texts).strip() 1202 | if not include_output_images: 1203 | slide_images = [] 1204 | chunks.append(Chunk(path=file_path, text=text, images=slide_images)) 1205 | # return all chunks 1206 | return chunks 1207 | 1208 | 1209 | def scrape_ipynb( 1210 | file_path: str, 1211 | verbose: bool = False, 1212 | include_output_images: bool = True, 1213 | ) -> List[Chunk]: 1214 | with open(file_path, "r", encoding="utf-8") as file: 1215 | notebook = json.load(file) 1216 | chunks = [] 1217 | # parse cells in the notebook 1218 | for cell in notebook["cells"]: 1219 | texts = [] 1220 | images: List[Image.Image] = [] 1221 | cell_type = cell["cell_type"] 1222 | # parse cell content based on type 1223 | if verbose: 1224 | print(f"[thepipe] Scraping cell {cell_type} from {file_path}") 1225 | if cell_type == "markdown": 1226 | text = "".join(cell["source"]) 1227 | if include_output_images: 1228 | images = get_images_from_markdown(text) 1229 | texts.append(text) 1230 | elif cell_type == "code": 1231 | source = "".join(cell["source"]) 1232 | texts.append(source) 1233 | output_texts = [] 1234 | # code cells can have outputs 1235 | 
if "outputs" in cell: 1236 | for output in cell["outputs"]: 1237 | if ( 1238 | include_output_images 1239 | and "data" in output 1240 | and "image/png" in output["data"] 1241 | ): 1242 | image_data = output["data"]["image/png"] 1243 | image = Image.open(BytesIO(base64.b64decode(image_data))) 1244 | images.append(image) 1245 | elif "data" in output and "text/plain" in output["data"]: 1246 | output_text = "".join(output["data"]["text/plain"]) 1247 | output_texts.append(output_text) 1248 | if output_texts: 1249 | texts.extend(output_texts) 1250 | elif cell_type == "raw": 1251 | text = "".join(cell["source"]) 1252 | texts.append(text) 1253 | if texts or images: 1254 | text = "\n".join(texts).strip() 1255 | chunks.append(Chunk(path=file_path, text=text, images=images)) 1256 | return chunks 1257 | 1258 | 1259 | def scrape_tweet(url: str, include_output_images: bool = True) -> List[Chunk]: 1260 | """ 1261 | Magic function from https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts 1262 | unofficial, could break at any time 1263 | """ 1264 | 1265 | def get_token(id: str) -> str: 1266 | result = (float(id) / 1e15) * math.pi 1267 | base_36_result = "" 1268 | characters = "0123456789abcdefghijklmnopqrstuvwxyz" 1269 | while result > 0: 1270 | remainder = int(result % (6**2)) 1271 | base_36_result = characters[remainder] + base_36_result 1272 | result = (result - remainder) // (6**2) 1273 | base_36_result = re.sub(r"(0+|\.)", "", base_36_result) 1274 | return base_36_result 1275 | 1276 | tweet_id = url.split("status/")[-1].split("?")[0] 1277 | token = get_token(tweet_id) 1278 | tweet_api_url = "https://cdn.syndication.twimg.com/tweet-result" 1279 | params = {"id": tweet_id, "language": "en", "token": token} 1280 | response = requests.get(tweet_api_url, params=params) 1281 | if response.status_code != 200: 1282 | raise ValueError(f"Failed to fetch tweet. Status code: {response.status_code}") 1283 | tweet_data = response.json() 1284 | # Extract tweet text 1285 | tweet_text = tweet_data.get("text", "") 1286 | # Extract images from tweet 1287 | images: List[Image.Image] = [] 1288 | if include_output_images and "mediaDetails" in tweet_data: 1289 | for media in tweet_data["mediaDetails"]: 1290 | image_url = media.get("media_url_https") 1291 | if image_url: 1292 | image_response = requests.get(image_url) 1293 | img = Image.open(BytesIO(image_response.content)) 1294 | images.append(img) 1295 | # Create chunks for text and images 1296 | chunk = Chunk(path=url, text=tweet_text, images=images) 1297 | return [chunk] 1298 | --------------------------------------------------------------------------------