├── .github └── workflows │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── COMMAND_ORDER_FIX.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── assets ├── PathikvPlaywright.png ├── pathik_logo.jpg ├── pathik_logo.png └── script.py ├── benchmarks ├── concurrency │ ├── batch_bench.py │ └── batch_results.png └── speed │ ├── benchmark.py │ └── comparison.png ├── build_binary.py ├── commit_message.txt ├── crawler └── crawler.go ├── docs ├── safe_api_example.py └── type_safe_api.md ├── examples ├── README.md ├── basic_usage.py ├── command_ordering_test.py ├── diagnose_imports.py ├── docker-compose.yml ├── example.py ├── example2.py ├── example_kafka.py ├── kafka_consumer.go ├── kafka_consumer.js ├── kafka_consumer.py ├── kafka_real_test.py ├── native_kafka_demo.py ├── news_aggregator.py ├── package.json ├── pathik ├── pathik_bin ├── safe_kafka_demo.py ├── simple_crawl.py ├── simple_kafka_example.py └── test_secure_kafka.py ├── go.mod ├── go.sum ├── kafka_consumer_direct.py ├── main.go ├── new-version.sh ├── package-lock.json ├── package.json ├── parallel_test.js ├── parallel_test.py ├── pathik-js ├── README.md ├── benchmark │ ├── benchmark.js │ ├── create-comparison-chart.js │ └── results │ │ ├── benchmark_data.json │ │ └── memory-comparison.png ├── bin │ ├── pathik-cli.js │ └── pathik_bin ├── bun.lock ├── examples │ ├── basic.js │ └── kafka.js ├── package.json ├── scripts │ ├── build.js │ └── install.js ├── src │ ├── crawler.js │ ├── index.js │ └── utils.js └── types │ └── index.d.ts ├── pathik.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── entry_points.txt ├── requires.txt └── top_level.txt ├── pathik ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-312.pyc │ ├── cli.cpython-312.pyc │ ├── crawler.cpython-312.pyc │ ├── safe_api.cpython-312.pyc │ └── schema.cpython-312.pyc ├── bin │ ├── .gitkeep │ ├── darwin_amd64 │ │ └── pathik_bin │ ├── darwin_arm64 │ │ └── pathik_bin │ ├── linux_amd64 │ │ └── pathik_bin │ ├── linux_arm64 │ │ └── pathik_bin │ ├── pathik_crawler │ └── windows_amd64 │ │ └── pathik_bin.exe ├── cli.py ├── crawler.py ├── safe_api.py ├── schema.py └── simple.py ├── pathik_bin ├── publish_to_pypi.py ├── setup.py ├── storage ├── kafka.go └── storage.go ├── test.py ├── test ├── cli_test.sh ├── debug_command_order.py ├── direct_test.py ├── python_test.py ├── run_all_tests.sh └── simple_test.py └── test_secure_kafka.py /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Push events to matching v*, i.e. 
v1.0, v20.15.10 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | packages: write 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Set up Go 18 | uses: actions/setup-go@v4 19 | with: 20 | go-version: '1.24' 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.10' 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install setuptools wheel twine build requests 31 | 32 | - name: Extract version from tag 33 | id: get_version 34 | run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV 35 | 36 | - name: Set up Go module 37 | run: | 38 | # Change module name in go.mod while preserving dependencies 39 | # First, save the original content excluding the module line 40 | grep -v "^module " go.mod > go.mod.temp 41 | 42 | # Create new go.mod with pathik module and original dependencies 43 | echo "module pathik" > go.mod 44 | cat go.mod.temp >> go.mod 45 | rm go.mod.temp 46 | 47 | # Run go mod tidy to ensure dependencies are set correctly 48 | go mod tidy 49 | 50 | # Verify module setup 51 | cat go.mod 52 | 53 | - name: Build binaries for all platforms 54 | run: | 55 | # Set environment variable for use in build_binary.py 56 | export PATHIK_VERSION="${VERSION}" 57 | 58 | # Note: The build script creates a simplified Windows binary 59 | # to avoid cross-compilation issues with local imports 60 | python build_binary.py --all 61 | 62 | - name: Check output structure 63 | run: | 64 | find . -name "pathik_bin*" | sort 65 | 66 | - name: Debug pathik directory 67 | run: | 68 | echo "Contents of the bin directory:" 69 | ls -la pathik/bin/ || echo "Directory not found" 70 | echo "Searching for all binary files:" 71 | find pathik -type f -name "pathik_bin*" || echo "No binaries found" 72 | 73 | - name: Prepare binaries with unique names 74 | run: | 75 | mkdir -p release_assets 76 | cp pathik/bin/darwin_amd64/pathik_bin release_assets/pathik_bin_darwin_amd64 77 | cp pathik/bin/darwin_arm64/pathik_bin release_assets/pathik_bin_darwin_arm64 78 | cp pathik/bin/linux_amd64/pathik_bin release_assets/pathik_bin_linux_amd64 79 | cp pathik/bin/linux_arm64/pathik_bin release_assets/pathik_bin_linux_arm64 80 | cp pathik/bin/windows_amd64/pathik_bin.exe release_assets/pathik_bin_windows_amd64.exe 81 | 82 | ls -la release_assets 83 | 84 | - name: Create GitHub Release 85 | id: create_release 86 | uses: softprops/action-gh-release@v1 87 | with: 88 | name: Pathik ${{ env.VERSION }} 89 | draft: false 90 | prerelease: false 91 | token: ${{ secrets.GITHUB_TOKEN }} 92 | files: release_assets/* 93 | 94 | - name: Build Python package 95 | run: | 96 | # Ensure version numbers match 97 | sed -i "s/VERSION = '.*'/VERSION = '${{ env.VERSION }}'/g" setup.py 98 | sed -i "s/__version__ = \".*\"/__version__ = \"${{ env.VERSION }}\"/g" pathik/__init__.py 99 | 100 | # Build the package 101 | python -m build 102 | 103 | - name: Upload to PyPI 104 | env: 105 | TWINE_USERNAME: __token__ 106 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 107 | run: | 108 | twine upload dist/* 109 | 110 | publish-npm: 111 | needs: build 112 | runs-on: ubuntu-latest 113 | steps: 114 | - uses: actions/checkout@v3 115 | 116 | - name: Set up Node.js 117 | uses: actions/setup-node@v3 118 | with: 119 | node-version: '18' 120 | registry-url: 'https://registry.npmjs.org' 121 | 122 | - name: Install GitHub CLI 123 | run: | 124 | curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | sudo dd 
of=/usr/share/keyrings/githubcli-archive-keyring.gpg 125 | echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null 126 | sudo apt update 127 | sudo apt install gh 128 | 129 | - name: Extract version from tag 130 | run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV 131 | 132 | - name: Create JS package binaries directory 133 | run: mkdir -p pathik-js/bin 134 | 135 | - name: Download released binaries 136 | run: | 137 | # Create a temporary directory to download the binaries 138 | mkdir -p temp_binaries 139 | 140 | # Download binaries from the GitHub release 141 | gh release download v${{ env.VERSION }} --dir temp_binaries 142 | 143 | # List downloaded files 144 | ls -la temp_binaries 145 | env: 146 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 147 | 148 | - name: Copy binaries to JS package 149 | run: | 150 | # Create platform-specific directories 151 | mkdir -p pathik-js/bin/darwin-amd64 152 | mkdir -p pathik-js/bin/darwin-arm64 153 | mkdir -p pathik-js/bin/linux-amd64 154 | mkdir -p pathik-js/bin/linux-arm64 155 | mkdir -p pathik-js/bin/win32-amd64 156 | 157 | # Copy binaries to the correct locations 158 | cp temp_binaries/pathik_bin_darwin_amd64 pathik-js/bin/darwin-amd64/pathik_bin 159 | cp temp_binaries/pathik_bin_darwin_arm64 pathik-js/bin/darwin-arm64/pathik_bin 160 | cp temp_binaries/pathik_bin_linux_amd64 pathik-js/bin/linux-amd64/pathik_bin 161 | cp temp_binaries/pathik_bin_linux_arm64 pathik-js/bin/linux-arm64/pathik_bin 162 | cp temp_binaries/pathik_bin_windows_amd64.exe pathik-js/bin/win32-amd64/pathik_bin.exe 163 | 164 | # Make binaries executable 165 | chmod +x pathik-js/bin/darwin-amd64/pathik_bin 166 | chmod +x pathik-js/bin/darwin-arm64/pathik_bin 167 | chmod +x pathik-js/bin/linux-amd64/pathik_bin 168 | chmod +x pathik-js/bin/linux-arm64/pathik_bin 169 | 170 | # Verify the structure 171 | find pathik-js/bin -type f | sort 172 | 173 | - name: Update package version 174 | run: | 175 | cd pathik-js 176 | npm version ${{ env.VERSION }} --no-git-tag-version 177 | cat package.json | grep version 178 | 179 | - name: Install dependencies 180 | run: | 181 | cd pathik-js 182 | npm install 183 | 184 | - name: Publish to npm 185 | run: | 186 | cd pathik-js 187 | npm publish 188 | env: 189 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | node_modules 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # UV 101 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | #uv.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 119 | .pdm.toml 120 | .pdm-python 121 | .pdm-build/ 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 171 | #.idea/ 172 | 173 | # Ruff stuff: 174 | .ruff_cache/ 175 | 176 | # PyPI configuration file 177 | .pypirc -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to Pathik will be documented in this file. 4 | 5 | ## [0.3.1] - 2023-10-28 6 | 7 | ### Added 8 | - New example scripts for Kafka streaming: `native_kafka_demo.py` and `safe_kafka_demo.py` 9 | - Improved documentation for Kafka integration 10 | - Added more compression options (gzip, snappy, lz4, zstd) for Kafka streaming 11 | - Added max message size and buffer memory controls for Kafka performance tuning 12 | 13 | ### Changed 14 | - Fixed command line flags to align with Go binary expectations 15 | - Corrected argument ordering for Kafka flags 16 | - Simplified Kafka streaming interface in `__init__.py` 17 | - Updated Go binary to support compression parameters 18 | - Modified stream_to_kafka to support compression options in Python SDK 19 | 20 | ### Fixed 21 | - Fixed binary command flag format issues in CLI module 22 | - Fixed argument mismatches between Python wrapper and Go binary 23 | - Fixed session ID handling in Kafka streaming functions 24 | - Addressed compression codec selection in Kafka producer configuration 25 | - Fixed missing compression flags in Go binary for Kafka streaming 26 | 27 | ## [0.3.0] - 2023-10-28 28 | 29 | ### Added 30 | - Enhanced URL validation to prevent security vulnerabilities 31 | - Support for customizable buffer sizes in Kafka streaming 32 | - Improved error handling for crawler operations 33 | - New `kafka_consumer_direct.py` script with better security 34 | - Configurable compression options (Gzip and Snappy support) 35 | - Added Snappy compression library installation instructions 36 | 37 | ### Changed 38 | - Updated Kafka producer to use Gzip compression instead of Snappy by default 39 | - Improved input validation across all user-provided parameters 40 | - Enhanced session ID tracking for better multi-user support 41 | - Refactored code for better security and performance 42 | 43 | ### Fixed 44 | - Fixed URL validation to properly handle invalid URLs 45 | - Fixed command execution when arguments are missing 46 | - Fixed UnsupportedCodecError by adding proper compression library support 47 | - Resolved buffer size issues when streaming large web pages 48 | - Fixed message encoding problems in Kafka consumer 49 | 50 | ## [0.2.6] - 2025-03-27 51 | 52 | ### Added 53 | - Kafka streaming functionality 54 | - Session-based message tracking 55 | - Support for R2 storage integration 56 | - Binary version management system 57 | - Parallel URL processing 58 | 59 | ### Changed 60 | - Refactored crawler implementation for better performance 61 | - Improved HTML content extraction 62 | - Enhanced Markdown conversion quality 63 | 64 | ### Fixed 65 | - Fixed memory leaks in long-running operations 66 | - Resolved concurrent processing issues 67 | -------------------------------------------------------------------------------- /COMMAND_ORDER_FIX.md: -------------------------------------------------------------------------------- 1 | # Pathik Command Ordering Fix 2 | 3 | ## Issue Description 4 | 5 | Pathik versions up to 0.3.0 had a critical issue with how command-line arguments were ordered when calling the Go binary. 
The issue caused flags to be incorrectly interpreted as URLs, resulting in errors like: 6 | 7 | ``` 8 | Invalid URL '-outdir': only HTTP and HTTPS schemes are allowed 9 | ``` 10 | 11 | ## Root Cause 12 | 13 | The Go binary expects a specific order of command-line arguments: 14 | 15 | ``` 16 | pathik_bin [flags] -crawl [urls] 17 | ``` 18 | 19 | However, the Python wrapper was incorrectly constructing commands with URLs before flags: 20 | 21 | ``` 22 | pathik_bin -crawl [urls] [flags] 23 | ``` 24 | 25 | ## Fix Applied 26 | 27 | The following files have been updated to fix the issue: 28 | 29 | 1. `pathik/cli.py`: Reordered command arguments to place all flags before `-crawl` 30 | 2. `pathik/crawler.py`: Fixed command ordering in multiple places: 31 | - In the `crawl()` function (both parallel and sequential modes) 32 | - In the `stream_to_kafka()` function 33 | 34 | ## Command-Line Arguments Order Rules 35 | 36 | When using the Go binary directly or through the Python API, the following rules must be followed: 37 | 38 | 1. All flags MUST come BEFORE the `-crawl` flag 39 | 2. The `-crawl` flag must come immediately before the URLs 40 | 3. No flags can appear after the URLs 41 | 42 | Correct pattern: 43 | ``` 44 | pathik_bin [flags] -crawl [urls] 45 | ``` 46 | 47 | Example: 48 | ``` 49 | pathik_bin -outdir ./output -parallel -kafka -crawl https://example.com https://example.org 50 | ``` 51 | 52 | ## Testing the Fix 53 | 54 | The fix has been tested with various flag combinations and URLs to ensure correct behavior. 55 | 56 | ```python 57 | import pathik 58 | 59 | # Now works correctly 60 | result = pathik.crawl( 61 | urls=["https://example.com"], 62 | output_dir="./output", 63 | parallel=True 64 | ) 65 | ``` 66 | 67 | ## Version Notes 68 | 69 | This fix will be included in the next release. If you encounter any issues with the fix, please report them. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include go.mod 4 | include go.sum 5 | include main.go 6 | include build_binary.py 7 | 8 | # Include all binary files 9 | include pathik/pathik_bin* 10 | recursive-include pathik/bin * 11 | 12 | # Exclude git, development and cache files 13 | exclude .git 14 | exclude .gitignore 15 | exclude __pycache__ 16 | exclude *.pyc -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pathik - A Python package for web crawling with Go backend capabilities. 
3 | """ 4 | print("Loading pathik package root") 5 | 6 | # Try importing from the subpackage 7 | try: 8 | from pathik.pathik.crawler import crawl, crawl_to_r2 9 | print("Successfully imported functions from pathik.pathik.crawler") 10 | except ImportError as e: 11 | print(f"Failed to import from pathik.pathik.crawler: {e}") 12 | # Try importing from the simple implementation 13 | try: 14 | from pathik.pathik.simple import crawl, crawl_to_r2 15 | print("Using simple Python implementation") 16 | except ImportError as e: 17 | print(f"Failed to import from pathik.pathik.simple: {e}") 18 | # Fallback implementation 19 | import tempfile 20 | import os 21 | import uuid 22 | from typing import List, Dict, Optional 23 | 24 | print("Using fallback implementation") 25 | 26 | def crawl(urls: List[str], output_dir: Optional[str] = None) -> Dict[str, Dict[str, str]]: 27 | """Emergency fallback crawler implementation""" 28 | if output_dir is None: 29 | output_dir = tempfile.mkdtemp(prefix="pathik_") 30 | else: 31 | os.makedirs(output_dir, exist_ok=True) 32 | 33 | print(f"FALLBACK CRAWLER - Would crawl: {urls} to {output_dir}") 34 | 35 | # Just create empty files as placeholders 36 | results = {} 37 | for url in urls: 38 | domain = url.replace("https://", "").replace("http://", "").replace("/", "_") 39 | html_file = os.path.join(output_dir, f"{domain}.html") 40 | md_file = os.path.join(output_dir, f"{domain}.md") 41 | 42 | # Create empty files 43 | open(html_file, 'w').close() 44 | open(md_file, 'w').close() 45 | 46 | results[url] = {"html": html_file, "markdown": md_file} 47 | 48 | return results 49 | 50 | def crawl_to_r2(urls: List[str], uuid_str: Optional[str] = None) -> Dict[str, Dict[str, str]]: 51 | """Emergency fallback R2 implementation""" 52 | if uuid_str is None: 53 | uuid_str = str(uuid.uuid4()) 54 | 55 | results = crawl(urls) 56 | return { 57 | url: { 58 | "uuid": uuid_str, 59 | "r2_html_key": "", 60 | "r2_markdown_key": "", 61 | "local_html_file": files["html"], 62 | "local_markdown_file": files["markdown"] 63 | } for url, files in results.items() 64 | } 65 | 66 | # Export the functions 67 | __all__ = ["crawl", "crawl_to_r2"] -------------------------------------------------------------------------------- /assets/PathikvPlaywright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/assets/PathikvPlaywright.png -------------------------------------------------------------------------------- /assets/pathik_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/assets/pathik_logo.jpg -------------------------------------------------------------------------------- /assets/pathik_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/assets/pathik_logo.png -------------------------------------------------------------------------------- /assets/script.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | # Set up the figure with dark background 5 | plt.figure(figsize=(10, 4), facecolor='#1a202c') 6 | ax = plt.subplot() 7 | ax.set_facecolor('#1a202c') 8 | 9 | # Data from your benchmark - memory usage in MB 10 | tools = ['Pathik', 
'Playwright'] 11 | memory = [0.17, 17.44] # Memory usage from batch size 5 12 | ratio = memory[1] / memory[0] # About 103x less memory 13 | 14 | # Create horizontal bars 15 | y_pos = np.arange(len(tools)) 16 | bars = ax.barh(y_pos, memory, height=0.6, color=['#4fc3f7', '#9e9e9e']) 17 | 18 | # Add labels and styling 19 | ax.set_yticks(y_pos) 20 | ax.set_yticklabels(tools, fontsize=12, color='white') 21 | ax.invert_yaxis() # Invert to match your example image 22 | 23 | # Add memory values at the end of each bar 24 | for i, bar in enumerate(bars): 25 | width = bar.get_width() 26 | ax.text(width + 0.5, bar.get_y() + bar.get_height()/2, 27 | f'{width:.2f}MB', ha='left', va='center', color='white', fontsize=12) 28 | 29 | # Set title showing memory efficiency 30 | plt.title(f'Memory usage - {ratio:.0f}x less', fontsize=20, color='white', pad=20) 31 | 32 | # Remove axes 33 | ax.spines['top'].set_visible(False) 34 | ax.spines['right'].set_visible(False) 35 | ax.spines['bottom'].set_visible(False) 36 | ax.spines['left'].set_visible(False) 37 | 38 | # Remove tick marks 39 | ax.tick_params(axis='both', which='both', length=0) 40 | ax.set_xticks([]) 41 | 42 | # Add vertical grid lines to match example image 43 | ax.grid(axis='x', linestyle='--', alpha=0.3, color='white') 44 | 45 | # Add icons (if needed, you would need to import images for this) 46 | # This is just a placeholder for where you'd add icon logic 47 | 48 | plt.tight_layout() 49 | plt.savefig('benchmarks/concurrency/batch_results.png', dpi=300, bbox_inches='tight') 50 | plt.show() -------------------------------------------------------------------------------- /benchmarks/concurrency/batch_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/benchmarks/concurrency/batch_results.png -------------------------------------------------------------------------------- /benchmarks/speed/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/benchmarks/speed/comparison.png -------------------------------------------------------------------------------- /commit_message.txt: -------------------------------------------------------------------------------- 1 | Fix Kafka streaming compression options and update binary 2 | 3 | Fixed an issue with Kafka streaming compression options where the Go binary was rejecting the compression flags. Added compression, max-message-size, and buffer-memory flags to the Go binary and updated the Python wrapper to support these options. 4 | 5 | Changes include: 6 | - Added compression_type, max_message_size, and buffer_memory parameters to the KafkaConfig struct in Go 7 | - Added corresponding flags to the Go binary command line interface 8 | - Updated the Kafka writer creation code to use the compression options 9 | - Modified stream_to_kafka function in Python to support the new parameters 10 | - Updated both safe_kafka_demo.py and native_kafka_demo.py examples to use compression options 11 | - Updated documentation to reflect new compression and performance tuning options 12 | 13 | The fix allows users to specify Kafka compression algorithm, message size, and buffer memory for optimized streaming performance. 
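
For illustration only, a minimal Python sketch of how these options could be passed through the updated wrapper. The keyword names (compression_type, max_message_size, buffer_memory) are taken from docs/type_safe_api.md and examples/basic_usage.py elsewhere in this repo and are assumed to match the released stream_to_kafka signature:

```python
# Hypothetical usage sketch of the compression options described above.
# Parameter names are assumptions based on docs/type_safe_api.md and
# examples/basic_usage.py; verify against the actual stream_to_kafka signature.
import uuid
import pathik

session_id = str(uuid.uuid4())
result = pathik.stream_to_kafka(
    urls=["https://example.com"],
    content_type="both",          # "html", "markdown", or "both"
    session=session_id,
    compression_type="gzip",      # gzip, snappy, lz4, or zstd
    max_message_size=15728640,    # ~15 MB, for large pages
    buffer_memory=157286400,      # ~150 MB producer buffer
)
for url, status in result.items():
    print(url, "OK" if status.get("success") else status.get("error", "unknown error"))
```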
-------------------------------------------------------------------------------- /docs/safe_api_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Example of using Pathik's type-safe API. 4 | 5 | This example demonstrates how to use the safe_crawl function 6 | for better type safety and error handling. 7 | """ 8 | import os 9 | import sys 10 | import json 11 | from typing import Dict, Any 12 | 13 | # Add parent directory to path to find local pathik 14 | sys.path.insert(0, os.path.abspath('..')) 15 | sys.path.insert(0, os.path.abspath('.')) 16 | 17 | try: 18 | # Try to import from local module first 19 | from pathik.safe_api import safe_crawl 20 | print("Imported safe_crawl from local pathik module") 21 | except ImportError: 22 | try: 23 | # Fall back to installed module 24 | from pathik import safe_crawl 25 | print("Imported safe_crawl from installed pathik module") 26 | except ImportError: 27 | print("Pathik is not installed. Install with: pip install pathik") 28 | exit(1) 29 | 30 | def crawl_with_validation(url: str, output_dir: str = None) -> Dict[str, Any]: 31 | """ 32 | Crawl a URL using the type-safe API. 33 | 34 | Args: 35 | url: The URL to crawl 36 | output_dir: Directory to save crawled files 37 | 38 | Returns: 39 | Dictionary with crawl results 40 | """ 41 | print(f"Crawling {url} with type-safe API...") 42 | 43 | # Create output directory if it doesn't exist 44 | if output_dir and not os.path.exists(output_dir): 45 | os.makedirs(output_dir) 46 | 47 | try: 48 | # Call the type-safe API 49 | result = safe_crawl( 50 | urls=url, 51 | output_dir=output_dir, 52 | parallel=False, 53 | num_workers=4, 54 | timeout=60 55 | ) 56 | 57 | # Check if the crawl was successful 58 | if url in result and result[url].get("success", False): 59 | print(f"✅ Successfully crawled {url}") 60 | # Extract file paths 61 | html_file = result[url].get("html", "No HTML file") 62 | md_file = result[url].get("markdown", "No Markdown file") 63 | print(f"HTML file: {html_file}") 64 | print(f"Markdown file: {md_file}") 65 | 66 | # Check if files exist 67 | if os.path.exists(html_file): 68 | print(f"HTML file size: {os.path.getsize(html_file)} bytes") 69 | if os.path.exists(md_file): 70 | print(f"Markdown file size: {os.path.getsize(md_file)} bytes") 71 | else: 72 | error = result[url].get("error", "Unknown error") 73 | print(f"❌ Failed to crawl {url}: {error}") 74 | 75 | return result 76 | except ValueError as e: 77 | # Handle validation errors 78 | print(f"❌ Validation error: {e}") 79 | return {url: {"success": False, "error": str(e)}} 80 | except Exception as e: 81 | # Handle other errors 82 | print(f"❌ Error: {e}") 83 | return {url: {"success": False, "error": str(e)}} 84 | 85 | def demonstrate_validation_error(): 86 | """Demonstrate parameter validation error handling""" 87 | print("\n=== Demonstrating Validation Error Handling ===") 88 | 89 | try: 90 | # Invalid URL, should be caught by validation 91 | result = safe_crawl( 92 | urls="not-a-url", 93 | output_dir="/tmp/pathik_example" 94 | ) 95 | print("This shouldn't happen - validation should catch the invalid URL") 96 | except ValueError as e: 97 | print(f"✅ Validation caught the error: {e}") 98 | 99 | def demonstrate_multiple_urls(): 100 | """Demonstrate crawling multiple URLs""" 101 | print("\n=== Demonstrating Multiple URL Crawling ===") 102 | 103 | output_dir = "/tmp/pathik_example_multi" 104 | if not os.path.exists(output_dir): 105 | os.makedirs(output_dir) 106 | 107 | try: 
108 | # Crawl multiple URLs 109 | result = safe_crawl( 110 | urls=["https://example.com", "https://httpbin.org/html"], 111 | output_dir=output_dir, 112 | parallel=True 113 | ) 114 | 115 | print("Results:") 116 | for url, data in result.items(): 117 | success = data.get("success", False) 118 | if success: 119 | print(f"✅ {url}: Success") 120 | print(f" HTML: {data.get('html', 'No HTML file')}") 121 | print(f" Markdown: {data.get('markdown', 'No Markdown file')}") 122 | else: 123 | print(f"❌ {url}: Failed - {data.get('error', 'Unknown error')}") 124 | except Exception as e: 125 | print(f"❌ Error: {e}") 126 | 127 | def main(): 128 | """Main function to demonstrate the type-safe API""" 129 | # Simple crawl with valid parameters 130 | output_dir = "/tmp/pathik_example" 131 | result = crawl_with_validation("https://example.com", output_dir) 132 | 133 | # Print the full result 134 | print("\nFull result:") 135 | print(json.dumps(result, indent=2)) 136 | 137 | # Demonstrate validation error handling 138 | demonstrate_validation_error() 139 | 140 | # Demonstrate multiple URLs 141 | demonstrate_multiple_urls() 142 | 143 | if __name__ == "__main__": 144 | main() -------------------------------------------------------------------------------- /docs/type_safe_api.md: -------------------------------------------------------------------------------- 1 | ## Kafka Streaming with Compression Options 2 | 3 | The `safe_stream_to_kafka` function supports several compression options to optimize message size and network bandwidth when streaming content to Kafka: 4 | 5 | ```python 6 | from pathik.safe_api import safe_stream_to_kafka, KafkaParams 7 | 8 | # Create parameters with compression options 9 | params = KafkaParams( 10 | brokers="localhost:9092", 11 | topic="my_crawl_data", 12 | content_type="both", 13 | session_id="compression-test", 14 | username="", password="", 15 | compression_type="gzip", # Specify compression algorithm 16 | max_message_size=15728640, # ~15MB max message size 17 | buffer_memory=157286400 # ~150MB buffer memory 18 | ) 19 | 20 | # Stream with compression 21 | result = safe_stream_to_kafka(["https://example.com"], params) 22 | ``` 23 | 24 | ### Compression Options 25 | 26 | The following compression options are available: 27 | 28 | | Parameter | Type | Description | Default | Valid Values | 29 | |-----------|------|-------------|---------|--------------| 30 | | `compression_type` | string | Compression algorithm to use | "gzip" | "gzip", "snappy", "lz4", "zstd" | 31 | | `max_message_size` | int | Maximum message size in bytes | 1048576 (1MB) | any positive integer | 32 | | `buffer_memory` | int | Producer buffer memory in bytes | 0 (default) | any positive integer | 33 | 34 | ### Compression Algorithms 35 | 36 | - **gzip**: Best compression ratio but slower; good for text content 37 | - **snappy**: Moderate compression with good speed; balanced option 38 | - **lz4**: Fast compression with moderate ratio; good for high throughput 39 | - **zstd**: Excellent compression ratio with good speed; best overall choice if available 40 | 41 | ### Command Line Usage 42 | 43 | ```bash 44 | # Stream with gzip compression and increased message size 45 | pathik kafka -c both -t pathik_data --compression gzip --max-message-size 15728640 --buffer-memory 157286400 https://example.com 46 | ``` 47 | 48 | ### Choosing Compression Options 49 | 50 | 1. For very large pages, increase `max_message_size` (default is 1MB) 51 | 2. 
For high throughput requirements, choose faster compression like `lz4` or `snappy` 52 | 3. If bandwidth is limited, increase compression and reduce message size 53 | 4. For optimal performance with good compression, use `zstd` if available 54 | 55 | ### Performance Tuning 56 | 57 | The `buffer_memory` option controls the amount of memory used by the Kafka producer for buffering messages 58 | before sending them to the broker. A larger buffer can improve throughput when streaming many URLs. 59 | 60 | ```python 61 | # High-performance configuration for large batch processing 62 | params = KafkaParams( 63 | brokers="kafka1:9092,kafka2:9092", 64 | topic="bulk_crawl_data", 65 | compression_type="zstd", 66 | max_message_size=20971520, # 20MB 67 | buffer_memory=314572800 # 300MB 68 | ) 69 | ``` 70 | 71 | For most use cases, the default values will be sufficient, but these options provide flexibility 72 | for high-performance or constrained environments. -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Pathik Examples 2 | 3 | This directory contains example code to demonstrate how to use Pathik. 4 | 5 | ## Kafka Consumer Examples 6 | 7 | These examples show how to consume data that Pathik has streamed to Kafka. 8 | 9 | ### Prerequisites 10 | 11 | To run the Kafka examples, you'll need: 12 | 13 | 1. A running Kafka broker (see setup options below) 14 | 2. Data streamed to Kafka using Pathik's Kafka feature 15 | 16 | ### Go Example 17 | 18 | #### Requirements 19 | 20 | ```bash 21 | # Install dependencies 22 | go get github.com/segmentio/kafka-go 23 | go get github.com/joho/godotenv 24 | ``` 25 | 26 | #### Usage 27 | 28 | ```bash 29 | # Run with default settings (localhost:9092, topic: pathik_crawl_data) 30 | go run kafka_consumer.go 31 | 32 | # Specify brokers and topic 33 | go run kafka_consumer.go --brokers=localhost:9092 --topic=my-topic 34 | 35 | # Filter by content type 36 | go run kafka_consumer.go --type=html 37 | go run kafka_consumer.go --type=markdown 38 | 39 | # With authentication 40 | go run kafka_consumer.go --username=user --password=pass 41 | ``` 42 | 43 | ### Python Example 44 | 45 | #### Requirements 46 | 47 | ```bash 48 | # Install dependencies 49 | pip install kafka-python python-dotenv 50 | ``` 51 | 52 | #### Usage 53 | 54 | ```bash 55 | # Run with default settings (localhost:9092, topic: pathik_crawl_data) 56 | python kafka_consumer.py 57 | 58 | # Specify brokers and topic 59 | python kafka_consumer.py --brokers=localhost:9092 --topic=my-topic 60 | 61 | # Filter by content type 62 | python kafka_consumer.py --type=html 63 | python kafka_consumer.py --type=markdown 64 | 65 | # Filter by session ID (useful for multi-user environments) 66 | python kafka_consumer.py --session=user123 67 | 68 | # Combine filters 69 | python kafka_consumer.py --type=html --session=user123 70 | 71 | # Consume from the beginning of the topic 72 | python kafka_consumer.py --from-beginning 73 | 74 | # With authentication 75 | python kafka_consumer.py --username=user --password=pass 76 | ``` 77 | 78 | ### JavaScript Example 79 | 80 | #### Requirements 81 | 82 | ```bash 83 | # Install dependencies 84 | cd examples 85 | npm install 86 | ``` 87 | 88 | #### Usage 89 | 90 | ```bash 91 | # Run with default settings (localhost:9092, topic: pathik_crawl_data) 92 | node kafka_consumer.js 93 | 94 | # Specify brokers and topic 95 | node kafka_consumer.js --brokers=localhost:9092 
--topic=my-topic 96 | 97 | # Filter by content type 98 | node kafka_consumer.js --type=html 99 | node kafka_consumer.js --type=markdown 100 | 101 | # Filter by session ID (useful for multi-user environments) 102 | node kafka_consumer.js --session=user123 103 | 104 | # Combine filters 105 | node kafka_consumer.js --type=html --session=user123 106 | 107 | # Consume from the beginning of the topic 108 | node kafka_consumer.js --from-beginning 109 | 110 | # With authentication 111 | node kafka_consumer.js --username=user --password=pass 112 | ``` 113 | 114 | ## Setting Up Kafka for Local Development 115 | 116 | There are several ways to run Kafka locally: 117 | 118 | ### Option 1: Using Docker Compose 119 | 120 | Create a `docker-compose.yml` file: 121 | 122 | ```yaml 123 | version: '3' 124 | services: 125 | zookeeper: 126 | image: confluentinc/cp-zookeeper:latest 127 | environment: 128 | ZOOKEEPER_CLIENT_PORT: 2181 129 | ZOOKEEPER_TICK_TIME: 2000 130 | ports: 131 | - "2181:2181" 132 | 133 | kafka: 134 | image: confluentinc/cp-kafka:latest 135 | depends_on: 136 | - zookeeper 137 | ports: 138 | - "9092:9092" 139 | environment: 140 | KAFKA_BROKER_ID: 1 141 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 142 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 143 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 144 | ``` 145 | 146 | Then start the containers: 147 | 148 | ```bash 149 | docker-compose up -d 150 | ``` 151 | 152 | ### Option 2: Using Redpanda 153 | 154 | [Redpanda](https://redpanda.com/) is a Kafka API-compatible streaming platform that's easier to set up. 155 | 156 | ```bash 157 | docker run -d --name=redpanda --net=host \ 158 | -e REDPANDA_RPC_SERVER_LISTEN_ADDR=0.0.0.0 \ 159 | -e REDPANDA_ADVERTISED_KAFKA_ADDR=127.0.0.1:9092 \ 160 | -e REDPANDA_SEED_SERVERS='[]' \ 161 | docker.redpanda.com/vectorized/redpanda:latest 162 | ``` 163 | 164 | ### Option 3: Kafka on Kubernetes with Strimzi 165 | 166 | If you're using Kubernetes (e.g., with minikube, kind, or k3s), you can use [Strimzi](https://strimzi.io/): 167 | 168 | ```bash 169 | # Install Strimzi operator 170 | kubectl create namespace kafka 171 | kubectl create -f 'https://strimzi.io/install/latest?namespace=kafka' 172 | 173 | # Deploy a Kafka cluster 174 | kubectl apply -f https://strimzi.io/examples/latest/kafka/kafka-persistent-single.yaml -n kafka 175 | ``` 176 | 177 | ## Workflow Example 178 | 179 | 1. Start Kafka locally: 180 | ```bash 181 | docker-compose up -d 182 | ``` 183 | 184 | 2. Stream content to Kafka using Pathik: 185 | ```bash 186 | ./pathik -kafka https://example.com 187 | ``` 188 | 189 | 3. 
Consume the streamed data: 190 | ```bash 191 | go run kafka_consumer.go 192 | # or 193 | python kafka_consumer.py 194 | # or 195 | node kafka_consumer.js 196 | ``` 197 | 198 | ### Crawling Multiple URLs in Parallel 199 | 200 | Pathik uses parallel crawling by default when multiple URLs are provided: 201 | 202 | ```bash 203 | # Crawling multiple sites in parallel (default behavior) 204 | pathik kafka https://example.com https://huewheel.com https://ycombinator.com 205 | 206 | # For Go binary direct usage: 207 | # Explicitly enable parallel crawling (redundant, as it's on by default) 208 | ./pathik -kafka -parallel https://example.com https://huewheel.com https://ycombinator.com 209 | 210 | # Disable parallel crawling in Go binary 211 | ./pathik -kafka -parallel=false https://example.com https://huewheel.com https://ycombinator.com 212 | 213 | # For Python CLI: 214 | # Disable parallel crawling with -s/--sequential flag 215 | pathik kafka -s https://example.com https://huewheel.com https://ycombinator.com 216 | ``` 217 | 218 | ## Multi-URL and Multi-User Examples -------------------------------------------------------------------------------- /examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Basic usage examples for Pathik 4 | 5 | This script demonstrates how to use Pathik for crawling web pages 6 | and handling the results properly. 7 | """ 8 | 9 | import os 10 | import sys 11 | import pathlib 12 | import uuid 13 | from pprint import pprint 14 | 15 | # Add the parent directory to sys.path to find pathik if running from the examples directory 16 | sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent)) 17 | 18 | try: 19 | import pathik 20 | from pathik.crawler import CrawlerError 21 | except ImportError: 22 | print("Pathik not found. 
Install it with: pip install pathik") 23 | sys.exit(1) 24 | 25 | def crawl_single_url(): 26 | """Example of crawling a single URL""" 27 | print("\n=== Crawling a single URL ===") 28 | 29 | try: 30 | # Basic crawl of a single URL 31 | result = pathik.crawl("https://example.com") 32 | 33 | # Check if the URL is in the result 34 | if "https://example.com" in result: 35 | url_result = result["https://example.com"] 36 | 37 | if url_result.get("success", False): 38 | print(f"✅ Successfully crawled https://example.com") 39 | print(f"HTML saved to: {url_result.get('html', 'N/A')}") 40 | print(f"Markdown saved to: {url_result.get('markdown', 'N/A')}") 41 | else: 42 | print(f"❌ Failed to crawl https://example.com") 43 | print(f"Error: {url_result.get('error', 'Unknown error')}") 44 | else: 45 | print("❌ URL not found in results") 46 | pprint(result) 47 | 48 | except CrawlerError as e: 49 | print(f"❌ Crawler error: {e}") 50 | except Exception as e: 51 | print(f"❌ Unexpected error: {e}") 52 | 53 | def crawl_multiple_urls(): 54 | """Example of crawling multiple URLs in parallel""" 55 | print("\n=== Crawling multiple URLs in parallel ===") 56 | 57 | urls = [ 58 | "https://example.com", 59 | "https://httpbin.org/html", 60 | "https://jsonplaceholder.typicode.com" 61 | ] 62 | 63 | try: 64 | # Crawl multiple URLs in parallel (default behavior) 65 | results = pathik.crawl(urls) 66 | 67 | # Process each URL result 68 | for url in urls: 69 | if url in results: 70 | url_result = results[url] 71 | 72 | if url_result.get("success", False): 73 | print(f"✅ Successfully crawled {url}") 74 | print(f" HTML saved to: {url_result.get('html', 'N/A')}") 75 | print(f" Markdown saved to: {url_result.get('markdown', 'N/A')}") 76 | else: 77 | print(f"❌ Failed to crawl {url}") 78 | print(f" Error: {url_result.get('error', 'Unknown error')}") 79 | else: 80 | print(f"❌ URL not found in results: {url}") 81 | 82 | except CrawlerError as e: 83 | print(f"❌ Crawler error: {e}") 84 | except Exception as e: 85 | print(f"❌ Unexpected error: {e}") 86 | 87 | def crawl_with_custom_output(): 88 | """Example of crawling with a custom output directory""" 89 | print("\n=== Crawling with custom output directory ===") 90 | 91 | # Create a custom output directory 92 | output_dir = "pathik_output" 93 | os.makedirs(output_dir, exist_ok=True) 94 | 95 | try: 96 | # Crawl with custom output directory 97 | result = pathik.crawl("https://example.com", output_dir=output_dir) 98 | 99 | if "https://example.com" in result: 100 | url_result = result["https://example.com"] 101 | 102 | if url_result.get("success", False): 103 | print(f"✅ Successfully crawled with custom output directory") 104 | print(f"HTML saved to: {url_result.get('html', 'N/A')}") 105 | print(f"Markdown saved to: {url_result.get('markdown', 'N/A')}") 106 | else: 107 | print(f"❌ Failed to crawl with custom output") 108 | print(f"Error: {url_result.get('error', 'Unknown error')}") 109 | else: 110 | print("❌ URL not found in results") 111 | 112 | except CrawlerError as e: 113 | print(f"❌ Crawler error: {e}") 114 | except Exception as e: 115 | print(f"❌ Unexpected error: {e}") 116 | 117 | def kafka_streaming_example(): 118 | """Example of streaming to Kafka""" 119 | print("\n=== Streaming to Kafka ===") 120 | 121 | # Generate a session ID 122 | session_id = str(uuid.uuid4()) 123 | print(f"Session ID: {session_id}") 124 | 125 | try: 126 | # Stream to Kafka 127 | result = pathik.stream_to_kafka( 128 | "https://example.com", 129 | session=session_id, 130 | # Increase buffer sizes for large pages 
131 | max_message_size=15728640, # 15MB 132 | buffer_memory=104857600 # 100MB 133 | ) 134 | 135 | if "https://example.com" in result: 136 | url_result = result["https://example.com"] 137 | 138 | if url_result.get("success", False): 139 | print(f"✅ Successfully streamed to Kafka") 140 | print(f"Session ID: {session_id}") 141 | 142 | # Instructions for consuming 143 | print("\nTo consume the messages, run:") 144 | print(f"python kafka_consumer_direct.py --session={session_id}") 145 | else: 146 | print(f"❌ Failed to stream to Kafka") 147 | print(f"Error: {url_result.get('error', 'Unknown error')}") 148 | else: 149 | print("❌ URL not found in results") 150 | 151 | except CrawlerError as e: 152 | print(f"❌ Crawler error: {e}") 153 | except Exception as e: 154 | print(f"❌ Unexpected error: {e}") 155 | 156 | def main(): 157 | """Run all examples""" 158 | print(f"Pathik version: {pathik.__version__}") 159 | 160 | # Run examples 161 | crawl_single_url() 162 | crawl_multiple_urls() 163 | crawl_with_custom_output() 164 | 165 | # Uncomment to run Kafka example (requires Kafka setup) 166 | # kafka_streaming_example() 167 | 168 | print("\nAll examples completed!") 169 | 170 | if __name__ == "__main__": 171 | main() -------------------------------------------------------------------------------- /examples/command_ordering_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test script to verify that the command ordering fix works properly. 4 | """ 5 | import os 6 | import sys 7 | import json 8 | 9 | # Add the parent directory to the Python path 10 | sys.path.insert(0, os.path.abspath('..')) 11 | 12 | try: 13 | import pathik 14 | print(f"Successfully imported pathik v{pathik.__version__}") 15 | except ImportError as e: 16 | print(f"Error importing pathik: {e}") 17 | sys.exit(1) 18 | 19 | def test_command_ordering(): 20 | """Run tests to verify the command ordering fix""" 21 | print("\n=== Testing Command Ordering Fix ===") 22 | 23 | # Create an output directory 24 | output_dir = os.path.abspath("test_output") 25 | os.makedirs(output_dir, exist_ok=True) 26 | print(f"Output directory: {output_dir}") 27 | 28 | # Test URLs 29 | urls = [ 30 | "https://example.com", 31 | "https://httpbin.org/html" 32 | ] 33 | 34 | # Test 1: Basic crawl with output directory 35 | print("\nTest 1: Basic crawl with output directory") 36 | try: 37 | results = pathik.crawl(urls[0], output_dir=output_dir) 38 | if urls[0] in results and "html" in results[urls[0]]: 39 | print(f"✅ Test 1 passed: Successfully crawled {urls[0]}") 40 | print(f" HTML file: {results[urls[0]]['html']}") 41 | else: 42 | print(f"❌ Test 1 failed: Could not find expected output for {urls[0]}") 43 | print(f" Result: {results}") 44 | except Exception as e: 45 | print(f"❌ Test 1 failed with exception: {e}") 46 | 47 | # Test 2: Multiple URLs with parallel processing 48 | print("\nTest 2: Multiple URLs with parallel processing") 49 | try: 50 | results = pathik.crawl(urls, output_dir=output_dir, parallel=True) 51 | success = True 52 | for url in urls: 53 | if url not in results or "html" not in results[url]: 54 | success = False 55 | print(f"❌ Missing output for {url}") 56 | 57 | if success: 58 | print(f"✅ Test 2 passed: Successfully crawled multiple URLs in parallel") 59 | for url in urls: 60 | print(f" {url} -> {results[url]['html']}") 61 | else: 62 | print(f"❌ Test 2 failed: Some URLs did not complete successfully") 63 | print(f" Result: {results}") 64 | except Exception as e: 65 | print(f"❌ 
Test 2 failed with exception: {e}") 66 | 67 | # Test 3: Advanced options 68 | print("\nTest 3: Advanced options") 69 | try: 70 | results = pathik.crawl( 71 | urls[0], 72 | output_dir=output_dir, 73 | validate=True, 74 | timeout=30, 75 | limit=100 76 | ) 77 | if urls[0] in results and "html" in results[urls[0]]: 78 | print(f"✅ Test 3 passed: Successfully crawled with advanced options") 79 | print(f" HTML file: {results[urls[0]]['html']}") 80 | else: 81 | print(f"❌ Test 3 failed: Could not find expected output") 82 | print(f" Result: {results}") 83 | except Exception as e: 84 | print(f"❌ Test 3 failed with exception: {e}") 85 | 86 | print("\n=== All Tests Completed ===") 87 | 88 | if __name__ == "__main__": 89 | test_command_ordering() -------------------------------------------------------------------------------- /examples/diagnose_imports.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Diagnostic script to identify import issues with pathik 4 | """ 5 | import sys 6 | import os 7 | import importlib 8 | 9 | # Print Python path 10 | print("Python path:") 11 | for p in sys.path: 12 | print(f" - {p}") 13 | 14 | # Look for pathik in site-packages 15 | site_packages = [p for p in sys.path if 'site-packages' in p] 16 | for sp in site_packages: 17 | pathik_path = os.path.join(sp, 'pathik') 18 | if os.path.exists(pathik_path): 19 | print(f"\nFound pathik in: {pathik_path}") 20 | print("Contents:") 21 | for root, dirs, files in os.walk(pathik_path): 22 | level = root.replace(pathik_path, '').count(os.sep) 23 | indent = ' ' * 4 * level 24 | print(f"{indent}{os.path.basename(root)}/") 25 | sub_indent = ' ' * 4 * (level + 1) 26 | for f in files: 27 | print(f"{sub_indent}{f}") 28 | 29 | # Try importing pathik 30 | print("\nImporting pathik...") 31 | try: 32 | import pathik 33 | print(f"pathik imported from: {pathik.__file__}") 34 | print(f"pathik version: {getattr(pathik, '__version__', 'Unknown')}") 35 | print("\nPathik module dir:") 36 | for attr in dir(pathik): 37 | if not attr.startswith('_'): 38 | print(f" - {attr}") 39 | except Exception as e: 40 | print(f"Error importing pathik: {e}") 41 | 42 | # Try loading from local source 43 | print("\nTrying to load from local source...") 44 | # Add parent directory to path 45 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 46 | if parent_dir not in sys.path: 47 | sys.path.insert(0, parent_dir) 48 | print(f"Added {parent_dir} to sys.path") 49 | 50 | try: 51 | import pathik as local_pathik 52 | print(f"Local pathik imported from: {local_pathik.__file__}") 53 | print(f"Local pathik version: {getattr(local_pathik, '__version__', 'Unknown')}") 54 | 55 | # Forcefully reload the module 56 | importlib.reload(local_pathik) 57 | print("Reloaded local pathik module") 58 | 59 | print("\nLocal pathik module dir after reload:") 60 | for attr in dir(local_pathik): 61 | if not attr.startswith('_'): 62 | print(f" - {attr}") 63 | 64 | # Try accessing stream_to_kafka 65 | if hasattr(local_pathik, 'stream_to_kafka'): 66 | print("\nstream_to_kafka function exists!") 67 | func_source = getattr(local_pathik.stream_to_kafka, '__code__', None) 68 | if func_source: 69 | print(f"Function defined in: {func_source.co_filename}") 70 | else: 71 | print("\nstream_to_kafka function not found in local module") 72 | 73 | # Try accessing crawl 74 | if hasattr(local_pathik, 'crawl'): 75 | print("\ncrawl function exists!") 76 | func_source = getattr(local_pathik.crawl, '__code__', None) 77 | if 
func_source: 78 | print(f"Function defined in: {func_source.co_filename}") 79 | else: 80 | print("\ncrawl function not found in local module") 81 | 82 | except Exception as e: 83 | print(f"Error with local pathik: {e}") -------------------------------------------------------------------------------- /examples/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | zookeeper: 4 | image: confluentinc/cp-zookeeper:7.5.0 5 | hostname: zookeeper 6 | container_name: zookeeper 7 | ports: 8 | - "2181:2181" 9 | environment: 10 | ZOOKEEPER_CLIENT_PORT: 2181 11 | ZOOKEEPER_TICK_TIME: 2000 12 | 13 | kafka: 14 | image: confluentinc/cp-kafka:7.5.0 15 | hostname: kafka 16 | container_name: kafka 17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - "9092:9092" 21 | environment: 22 | KAFKA_BROKER_ID: 1 23 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 24 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 25 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT 26 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 27 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 28 | KAFKA_AUTO_CREATE_TOPICS_ENABLE: true 29 | 30 | # Optional Kafka UI 31 | kafka-ui: 32 | image: provectuslabs/kafka-ui:latest 33 | container_name: kafka-ui 34 | depends_on: 35 | - kafka 36 | ports: 37 | - "8080:8080" 38 | environment: 39 | KAFKA_CLUSTERS_0_NAME: local 40 | KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:9092 41 | KAFKA_CLUSTERS_0_ZOOKEEPER: zookeeper:2181 -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | # Add the parent directory to the Python path if needed 6 | #sys.path.insert(0, os.path.abspath('..')) 7 | 8 | try: 9 | import pathik 10 | print(f"Successfully imported pathik from {pathik.__file__}") 11 | print(f"Available attributes: {dir(pathik)}") 12 | except ImportError as e: 13 | print(f"Error importing pathik: {e}") 14 | sys.exit(1) 15 | 16 | # Create an output directory with an absolute path 17 | output_dir = os.path.abspath("output_data") 18 | os.makedirs(output_dir, exist_ok=True) 19 | print(f"Output directory: {output_dir}") 20 | 21 | # List of URLs to crawl 22 | urls = [ 23 | "https://jan.ai/docs/quickstart" 24 | ] 25 | 26 | # Crawl the URLs and save to the output directory 27 | print(f"Crawling {len(urls)} URLs...") 28 | try: 29 | results = pathik.crawl(urls, output_dir=output_dir) 30 | 31 | # Print the results 32 | print("\nCrawling results:") 33 | for url, files in results.items(): 34 | print(f"\nURL: {url}") 35 | print(f"HTML file: {files['html']}") 36 | print(f"Markdown file: {files['markdown']}") 37 | 38 | # Print sample content from markdown file 39 | if files['markdown'] and os.path.exists(files['markdown']): 40 | with open(files['markdown'], 'r', encoding='utf-8') as f: 41 | content = f.read(500) # First 500 characters 42 | print(f"\nSample markdown content:") 43 | print(f"{content}...") 44 | else: 45 | print(f"WARNING: Markdown file not found or empty!") 46 | except Exception as e: 47 | print(f"Error during crawling: {e}") 48 | traceback.print_exc() -------------------------------------------------------------------------------- /examples/example2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import requests 5 | import xml.etree.ElementTree as ET 6 | 7 | # 
Function to fetch URLs from sitemap.xml 8 | def fetch_urls_from_sitemap(sitemap_url): 9 | response = requests.get(sitemap_url) 10 | response.raise_for_status() # Raise an error for bad responses 11 | 12 | # Parse the XML content 13 | root = ET.fromstring(response.content) 14 | 15 | # Extract the namespace from the root element 16 | ns = {'ns': root.tag.split('}')[0].strip('{')} if '}' in root.tag else '' 17 | 18 | # If there's a namespace, we need to use it to find elements 19 | if ns: 20 | urls = [url_elem.find('ns:loc', ns).text for url_elem in root.findall('ns:url', ns)] 21 | else: 22 | # Default namespace handling 23 | urls = [url_elem.find('loc').text for url_elem in root.findall('url')] 24 | 25 | print(f"Found {len(urls)} URLs in the sitemap") 26 | return urls 27 | 28 | # Add the parent directory to the Python path if needed 29 | #sys.path.insert(0, os.path.abspath('..')) 30 | 31 | try: 32 | import pathik 33 | print(f"Successfully imported pathik from {pathik.__file__}") 34 | print(f"Available attributes: {dir(pathik)}") 35 | except ImportError as e: 36 | print(f"Error importing pathik: {e}") 37 | sys.exit(1) 38 | 39 | # Create an output directory with an absolute path 40 | output_dir = os.path.abspath("output_data") 41 | os.makedirs(output_dir, exist_ok=True) 42 | print(f"Output directory: {output_dir}") 43 | 44 | # Fetch URLs from sitemap.xml 45 | sitemap_url = "https://jan.ai/sitemap-0.xml" # The sitemap URL 46 | print(f"Fetching URLs from {sitemap_url}...") 47 | urls = fetch_urls_from_sitemap(sitemap_url) 48 | 49 | # Limit the number of URLs to crawl if there are too many 50 | max_urls = 10 # Adjust this number as needed 51 | if len(urls) > max_urls: 52 | print(f"Limiting to {max_urls} URLs out of {len(urls)} total") 53 | urls = urls[:max_urls] 54 | 55 | # Crawl the URLs and save to the output directory 56 | print(f"Crawling {len(urls)} URLs...") 57 | try: 58 | results = pathik.crawl(urls, output_dir=output_dir) 59 | 60 | # Print the results 61 | print("\nCrawling results:") 62 | for url, files in results.items(): 63 | print(f"\nURL: {url}") 64 | print(f"HTML file: {files['html']}") 65 | print(f"Markdown file: {files['markdown']}") 66 | 67 | # Print sample content from markdown file 68 | if files['markdown'] and os.path.exists(files['markdown']): 69 | with open(files['markdown'], 'r', encoding='utf-8') as f: 70 | content = f.read(500) # First 500 characters 71 | print(f"\nSample markdown content:") 72 | print(f"{content}...") 73 | else: 74 | print(f"WARNING: Markdown file not found or empty!") 75 | except Exception as e: 76 | print(f"Error during crawling: {e}") 77 | traceback.print_exc() -------------------------------------------------------------------------------- /examples/example_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Example of using Pathik to crawl websites and stream content to Kafka, 4 | and then consuming that content from Kafka. 
5 | """ 6 | import pathik 7 | import uuid 8 | import time 9 | import argparse 10 | import json 11 | from typing import Optional 12 | 13 | 14 | def stream_to_kafka(urls, content_type="both", topic=None, parallel=True): 15 | """ 16 | Stream crawled content to Kafka 17 | """ 18 | # Generate a unique session ID to identify this streaming batch 19 | session_id = str(uuid.uuid4()) 20 | print(f"Generated session ID: {session_id}") 21 | 22 | print(f"\nStreaming {len(urls)} URLs to Kafka...") 23 | print(f"Content type: {content_type}") 24 | if topic: 25 | print(f"Topic: {topic}") 26 | print(f"Parallel processing: {parallel}") 27 | 28 | # Stream content to Kafka 29 | results = pathik.stream_to_kafka( 30 | urls=urls, 31 | content_type=content_type, 32 | topic=topic, 33 | session=session_id, 34 | parallel=parallel 35 | ) 36 | 37 | # Print results 38 | success_count = 0 39 | for url, status in results.items(): 40 | if status.get("success", False): 41 | print(f"✓ {url}: Successfully streamed to Kafka") 42 | success_count += 1 43 | else: 44 | print(f"✗ {url}: Failed to stream - {status.get('error', 'Unknown error')}") 45 | 46 | print(f"\n{success_count}/{len(urls)} URLs successfully streamed to Kafka") 47 | print(f"Session ID: {session_id} (use this to filter messages when consuming)") 48 | 49 | return session_id 50 | 51 | 52 | def consume_from_kafka(topic="pathik_crawl_data", session_id=None, 53 | content_type=None, max_messages=10, timeout=30): 54 | """ 55 | Consume messages from Kafka with filtering options 56 | """ 57 | try: 58 | # Import Kafka consumer - requires kafka-python package 59 | from kafka import KafkaConsumer 60 | except ImportError: 61 | print("\nERROR: kafka-python package not installed.") 62 | print("Install it with: pip install kafka-python") 63 | return 64 | 65 | print(f"\nConnecting to Kafka topic: {topic}") 66 | if session_id: 67 | print(f"Filtering by session ID: {session_id}") 68 | if content_type: 69 | print(f"Filtering by content type: {content_type}") 70 | print(f"Will show up to {max_messages} messages or stop after {timeout} seconds") 71 | 72 | # Create consumer 73 | try: 74 | consumer = KafkaConsumer( 75 | topic, 76 | bootstrap_servers='localhost:9092', # Change if your server is different 77 | auto_offset_reset='earliest', # Start from beginning of topic 78 | enable_auto_commit=True, 79 | group_id='pathik-example-consumer', 80 | consumer_timeout_ms=timeout * 1000 # Convert seconds to milliseconds 81 | ) 82 | except Exception as e: 83 | print(f"Error connecting to Kafka: {e}") 84 | print("\nMake sure Kafka is running and properly configured in .env file") 85 | return 86 | 87 | # Process messages with filtering 88 | print("\nWaiting for messages...") 89 | count = 0 90 | try: 91 | for message in consumer: 92 | # Extract message details 93 | msg_key = message.key.decode('utf-8') if message.key else None 94 | 95 | # Process headers 96 | headers = {} 97 | for key, value in message.headers: 98 | headers[key] = value.decode('utf-8') if value else None 99 | 100 | # Apply filters 101 | if session_id and headers.get('session') != session_id: 102 | continue 103 | 104 | msg_content_type = headers.get('contentType', '') 105 | if content_type: 106 | if content_type == 'html' and 'html' not in msg_content_type: 107 | continue 108 | if content_type == 'markdown' and 'markdown' not in msg_content_type: 109 | continue 110 | 111 | # Print message details 112 | print("\n" + "-" * 60) 113 | print(f"URL: {msg_key}") 114 | print(f"Content Type: {msg_content_type}") 115 | print(f"Session ID: 
{headers.get('session', 'N/A')}") 116 | print(f"Timestamp: {headers.get('timestamp', 'N/A')}") 117 | 118 | # Print a sample of the content 119 | content = message.value.decode('utf-8') 120 | content_preview = content[:500] + "..." if len(content) > 500 else content 121 | print("\nContent Preview:") 122 | print(content_preview) 123 | 124 | count += 1 125 | if count >= max_messages: 126 | print(f"\nReached maximum message count ({max_messages})") 127 | break 128 | 129 | except Exception as e: 130 | print(f"Error consuming messages: {e}") 131 | finally: 132 | consumer.close() 133 | 134 | if count == 0: 135 | print("\nNo messages received. Possible reasons:") 136 | print("- No messages matching your filters") 137 | print("- Kafka topic is empty") 138 | print("- Timeout reached before messages arrived") 139 | else: 140 | print(f"\nReceived {count} messages from Kafka") 141 | 142 | 143 | def main(): 144 | """ 145 | Main function to parse arguments and run the example 146 | """ 147 | parser = argparse.ArgumentParser( 148 | description="Pathik Kafka streaming example", 149 | formatter_class=argparse.RawDescriptionHelpFormatter, 150 | epilog=""" 151 | Examples: 152 | # Stream content to Kafka: 153 | python example_kafka.py stream https://example.com https://news.ycombinator.com 154 | 155 | # Stream only HTML content: 156 | python example_kafka.py stream -c html https://example.com 157 | 158 | # Stream to a custom topic: 159 | python example_kafka.py stream -t custom_topic https://example.com 160 | 161 | # Consume from Kafka: 162 | python example_kafka.py consume 163 | 164 | # Consume with session filter: 165 | python example_kafka.py consume -s your-session-id 166 | 167 | # Consume only markdown content: 168 | python example_kafka.py consume -c markdown 169 | """ 170 | ) 171 | 172 | subparsers = parser.add_subparsers(dest="command", help="Command to run") 173 | 174 | # Stream command 175 | stream_parser = subparsers.add_parser("stream", help="Stream content to Kafka") 176 | stream_parser.add_argument("urls", nargs="+", help="URLs to crawl and stream") 177 | stream_parser.add_argument("-c", "--content", choices=["html", "markdown", "both"], 178 | default="both", help="Content type to stream") 179 | stream_parser.add_argument("-t", "--topic", help="Kafka topic to use") 180 | stream_parser.add_argument("-s", "--sequential", action="store_true", 181 | help="Use sequential (non-parallel) crawling") 182 | 183 | # Consume command 184 | consume_parser = subparsers.add_parser("consume", help="Consume content from Kafka") 185 | consume_parser.add_argument("-t", "--topic", default="pathik_crawl_data", 186 | help="Kafka topic to consume from") 187 | consume_parser.add_argument("-s", "--session", help="Filter by session ID") 188 | consume_parser.add_argument("-c", "--content", choices=["html", "markdown"], 189 | help="Filter by content type") 190 | consume_parser.add_argument("-m", "--max", type=int, default=10, 191 | help="Maximum number of messages to consume") 192 | consume_parser.add_argument("--timeout", type=int, default=30, 193 | help="Timeout in seconds") 194 | 195 | args = parser.parse_args() 196 | 197 | if not args.command: 198 | parser.print_help() 199 | return 200 | 201 | if args.command == "stream": 202 | session_id = stream_to_kafka( 203 | urls=args.urls, 204 | content_type=args.content, 205 | topic=args.topic, 206 | parallel=not args.sequential 207 | ) 208 | 209 | # Offer to consume the messages just streamed 210 | choice = input("\nDo you want to consume the messages you just streamed? 
(y/n): ") 211 | if choice.lower() in ['y', 'yes']: 212 | consume_from_kafka( 213 | topic=args.topic or "pathik_crawl_data", 214 | session_id=session_id, 215 | content_type=args.content if args.content != "both" else None 216 | ) 217 | 218 | elif args.command == "consume": 219 | consume_from_kafka( 220 | topic=args.topic, 221 | session_id=args.session, 222 | content_type=args.content, 223 | max_messages=args.max, 224 | timeout=args.timeout 225 | ) 226 | 227 | 228 | if __name__ == "__main__": 229 | main() -------------------------------------------------------------------------------- /examples/kafka_consumer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "os/signal" 10 | "strings" 11 | "syscall" 12 | "time" 13 | 14 | "github.com/joho/godotenv" 15 | "github.com/segmentio/kafka-go" 16 | "github.com/segmentio/kafka-go/sasl/plain" 17 | ) 18 | 19 | func main() { 20 | // Load .env file if it exists 21 | godotenv.Load() 22 | 23 | // Command line flags 24 | brokers := flag.String("brokers", getEnvWithDefault("KAFKA_BROKERS", "localhost:9092"), "Kafka brokers (comma-separated)") 25 | topic := flag.String("topic", getEnvWithDefault("KAFKA_TOPIC", "pathik_crawl_data"), "Kafka topic to consume from") 26 | username := flag.String("username", os.Getenv("KAFKA_USERNAME"), "SASL username") 27 | password := flag.String("password", os.Getenv("KAFKA_PASSWORD"), "SASL password") 28 | contentType := flag.String("type", "", "Filter by content type (html or markdown)") 29 | sessionID := flag.String("session", "", "Filter by session ID") 30 | flag.Parse() 31 | 32 | // Setup signal handling for graceful shutdown 33 | ctx, cancel := context.WithCancel(context.Background()) 34 | defer cancel() 35 | 36 | signals := make(chan os.Signal, 1) 37 | signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) 38 | go func() { 39 | <-signals 40 | fmt.Println("\nShutting down gracefully...") 41 | cancel() 42 | }() 43 | 44 | // Create Kafka reader 45 | brokerList := strings.Split(*brokers, ",") 46 | fmt.Printf("Connecting to Kafka brokers: %s\n", *brokers) 47 | fmt.Printf("Consuming from topic: %s\n", *topic) 48 | if *contentType != "" { 49 | fmt.Printf("Filtering for content type: %s\n", *contentType) 50 | } 51 | if *sessionID != "" { 52 | fmt.Printf("Filtering for session ID: %s\n", *sessionID) 53 | } 54 | 55 | // Configure reader 56 | readerConfig := kafka.ReaderConfig{ 57 | Brokers: brokerList, 58 | Topic: *topic, 59 | MinBytes: 10e3, // 10KB 60 | MaxBytes: 10e6, // 10MB 61 | StartOffset: kafka.LastOffset, 62 | Logger: kafka.LoggerFunc(logKafkaInfo), 63 | ErrorLogger: kafka.LoggerFunc(logKafkaError), 64 | } 65 | 66 | // Add SASL authentication if credentials provided 67 | if *username != "" && *password != "" { 68 | dialer := &kafka.Dialer{ 69 | Timeout: 10 * time.Second, 70 | DualStack: true, 71 | SASLMechanism: plain.Mechanism{ 72 | Username: *username, 73 | Password: *password, 74 | }, 75 | } 76 | readerConfig.Dialer = dialer 77 | fmt.Println("Using SASL authentication") 78 | } 79 | 80 | reader := kafka.NewReader(readerConfig) 81 | defer reader.Close() 82 | 83 | fmt.Println("Consumer started. 
Press Ctrl+C to exit.") 84 | fmt.Println("-----------------------------------------") 85 | 86 | // Consume messages 87 | for { 88 | select { 89 | case <-ctx.Done(): 90 | return 91 | default: 92 | m, err := reader.ReadMessage(ctx) 93 | if err != nil { 94 | if ctx.Err() != context.Canceled { 95 | log.Printf("Error reading message: %v", err) 96 | } 97 | continue 98 | } 99 | 100 | // Extract headers 101 | var msgURL, msgContentType, msgTimestamp, msgSessionID string 102 | for _, header := range m.Headers { 103 | switch header.Key { 104 | case "url": 105 | msgURL = string(header.Value) 106 | case "contentType": 107 | msgContentType = string(header.Value) 108 | case "timestamp": 109 | msgTimestamp = string(header.Value) 110 | case "sessionID": 111 | msgSessionID = string(header.Value) 112 | } 113 | } 114 | 115 | // Skip if content type filter is set and doesn't match 116 | if *contentType != "" && !strings.Contains(msgContentType, *contentType) { 117 | continue 118 | } 119 | 120 | // Skip if session ID filter is set and doesn't match 121 | if *sessionID != "" && msgSessionID != *sessionID { 122 | continue 123 | } 124 | 125 | // Display message 126 | fmt.Println("-----------------------------------------") 127 | fmt.Printf("Message received at partition %d, offset %d\n", m.Partition, m.Offset) 128 | fmt.Printf("Key: %s\n", string(m.Key)) 129 | fmt.Printf("URL: %s\n", msgURL) 130 | fmt.Printf("Content Type: %s\n", msgContentType) 131 | fmt.Printf("Timestamp: %s\n", msgTimestamp) 132 | if msgSessionID != "" { 133 | fmt.Printf("Session ID: %s\n", msgSessionID) 134 | } 135 | 136 | // Print preview of the content (first 200 chars) 137 | content := string(m.Value) 138 | preview := content 139 | if len(content) > 200 { 140 | preview = content[:200] + "... [truncated]" 141 | } 142 | fmt.Printf("Content Preview (%d bytes total):\n%s\n", len(content), preview) 143 | } 144 | } 145 | } 146 | 147 | func getEnvWithDefault(key, defaultValue string) string { 148 | value := os.Getenv(key) 149 | if value == "" { 150 | return defaultValue 151 | } 152 | return value 153 | } 154 | 155 | func logKafkaInfo(msg string, args ...interface{}) { 156 | // Uncomment to see verbose Kafka client logs 157 | // log.Printf("INFO: "+msg, args...) 158 | } 159 | 160 | func logKafkaError(msg string, args ...interface{}) { 161 | log.Printf("ERROR: "+msg, args...) 
162 | } 163 | -------------------------------------------------------------------------------- /examples/kafka_consumer.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * Example Kafka consumer for Pathik-streamed content in JavaScript/Node.js 4 | * 5 | * Requirements: 6 | * npm install kafkajs dotenv yargs 7 | */ 8 | 9 | const { Kafka } = require('kafkajs'); 10 | const fs = require('fs'); 11 | const path = require('path'); 12 | const dotenv = require('dotenv'); 13 | const yargs = require('yargs/yargs'); 14 | const { hideBin } = require('yargs/helpers'); 15 | 16 | // Load environment variables from .env file 17 | dotenv.config(); 18 | 19 | // Parse command line arguments 20 | const argv = yargs(hideBin(process.argv)) 21 | .option('brokers', { 22 | description: 'Kafka brokers (comma-separated)', 23 | default: process.env.KAFKA_BROKERS || 'localhost:9092', 24 | type: 'string' 25 | }) 26 | .option('topic', { 27 | description: 'Kafka topic to consume from', 28 | default: process.env.KAFKA_TOPIC || 'pathik_crawl_data', 29 | type: 'string' 30 | }) 31 | .option('username', { 32 | description: 'SASL username', 33 | default: process.env.KAFKA_USERNAME, 34 | type: 'string' 35 | }) 36 | .option('password', { 37 | description: 'SASL password', 38 | default: process.env.KAFKA_PASSWORD, 39 | type: 'string' 40 | }) 41 | .option('type', { 42 | description: 'Filter by content type (html or markdown)', 43 | choices: ['html', 'markdown'], 44 | type: 'string' 45 | }) 46 | .option('session', { 47 | description: 'Filter by session ID', 48 | type: 'string' 49 | }) 50 | .option('from-beginning', { 51 | description: 'Consume from the beginning of the topic', 52 | type: 'boolean', 53 | default: false 54 | }) 55 | .help() 56 | .alias('help', 'h') 57 | .argv; 58 | 59 | async function run() { 60 | // Log connection info 61 | console.log(`Connecting to Kafka brokers: ${argv.brokers}`); 62 | console.log(`Consuming from topic: ${argv.topic}`); 63 | if (argv.type) { 64 | console.log(`Filtering for content type: ${argv.type}`); 65 | } 66 | console.log(`Starting from: ${argv['from-beginning'] ? 
'beginning' : 'most recent'}`); 67 | 68 | // Configure Kafka client 69 | const kafkaConfig = { 70 | clientId: 'pathik-example-consumer', 71 | brokers: argv.brokers.split(','), 72 | }; 73 | 74 | // Add SASL authentication if credentials provided 75 | if (argv.username && argv.password) { 76 | kafkaConfig.sasl = { 77 | mechanism: 'plain', 78 | username: argv.username, 79 | password: argv.password 80 | }; 81 | kafkaConfig.ssl = true; 82 | console.log('Using SASL authentication'); 83 | } 84 | 85 | const kafka = new Kafka(kafkaConfig); 86 | const consumer = kafka.consumer({ 87 | groupId: 'pathik-example-consumer-js', 88 | maxWaitTimeInMs: 500 89 | }); 90 | 91 | // Handle graceful shutdown 92 | const errorTypes = ['unhandledRejection', 'uncaughtException']; 93 | const signalTraps = ['SIGTERM', 'SIGINT', 'SIGUSR2']; 94 | 95 | errorTypes.forEach(type => { 96 | process.on(type, async e => { 97 | try { 98 | console.log(`Process.on ${type}`); 99 | console.error(e); 100 | await consumer.disconnect(); 101 | process.exit(0); 102 | } catch (_) { 103 | process.exit(1); 104 | } 105 | }); 106 | }); 107 | 108 | signalTraps.forEach(type => { 109 | process.once(type, async () => { 110 | try { 111 | console.log('\nShutting down gracefully...'); 112 | await consumer.disconnect(); 113 | process.exit(0); 114 | } catch (_) { 115 | process.exit(1); 116 | } 117 | }); 118 | }); 119 | 120 | // Connect and subscribe 121 | await consumer.connect(); 122 | await consumer.subscribe({ 123 | topic: argv.topic, 124 | fromBeginning: argv['from-beginning'] 125 | }); 126 | 127 | // Display messages 128 | console.log('Consumer started. Press Ctrl+C to exit.'); 129 | console.log('-----------------------------------------'); 130 | 131 | await consumer.run({ 132 | eachMessage: async ({ topic, partition, message }) => { 133 | // Extract headers 134 | const headers = {}; 135 | if (message.headers) { 136 | Object.entries(message.headers).forEach(([key, value]) => { 137 | headers[key] = value ? value.toString() : null; 138 | }); 139 | } 140 | 141 | const contentType = headers.contentType || ''; 142 | const sessionId = headers.sessionID || ''; 143 | 144 | // Skip if content type filter is set and doesn't match 145 | if (argv.type && !contentType.toLowerCase().includes(argv.type.toLowerCase())) { 146 | return; 147 | } 148 | 149 | // Skip if session ID filter is set and doesn't match 150 | if (argv.session && sessionId !== argv.session) { 151 | return; 152 | } 153 | 154 | // Display message 155 | console.log('-----------------------------------------'); 156 | console.log(`Partition: ${partition}, Offset: ${message.offset}`); 157 | console.log(`Key: ${message.key ? message.key.toString() : 'null'}`); 158 | console.log(`URL: ${headers.url || 'unknown'}`); 159 | console.log(`Content Type: ${contentType}`); 160 | console.log(`Timestamp: ${headers.timestamp || 'unknown'}`); 161 | if (sessionId) { 162 | console.log(`Session ID: ${sessionId}`); 163 | } 164 | 165 | // Print preview of content 166 | const content = message.value ? message.value.toString() : ''; 167 | const contentLen = content.length; 168 | const preview = contentLen > 200 ? `${content.substring(0, 200)}... 
[truncated]` : content; 169 | console.log(`Content Preview (${contentLen} bytes total):`); 170 | console.log(preview); 171 | }, 172 | }); 173 | } 174 | 175 | run().catch(e => { 176 | console.error(`Error: ${e.message}`); 177 | process.exit(1); 178 | }); -------------------------------------------------------------------------------- /examples/kafka_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Example Kafka consumer for Pathik-streamed content. 4 | This script reads messages from a Kafka topic where Pathik has streamed crawled content. 5 | 6 | Requirements: 7 | pip install kafka-python python-dotenv 8 | """ 9 | 10 | import os 11 | import sys 12 | import signal 13 | import argparse 14 | import json 15 | from datetime import datetime 16 | import re 17 | import ssl 18 | from typing import Optional, Dict, Any, List 19 | 20 | try: 21 | from dotenv import load_dotenv 22 | from kafka import KafkaConsumer 23 | except ImportError: 24 | print("Required packages not found. Install with:") 25 | print("pip install kafka-python python-dotenv") 26 | sys.exit(1) 27 | 28 | def get_env_with_default(key: str, default: str) -> str: 29 | """Get environment variable with a default value.""" 30 | return os.environ.get(key, default) 31 | 32 | def validate_broker_string(broker_str: str) -> bool: 33 | """Validate broker string format.""" 34 | if not broker_str: 35 | return False 36 | 37 | # Simple validation - check format of each broker 38 | for broker in broker_str.split(','): 39 | # Check for hostname/IP:port format 40 | if not re.match(r'^[a-zA-Z0-9.-]+:\d+$', broker.strip()): 41 | return False 42 | 43 | return True 44 | 45 | def validate_topic_name(topic: str) -> bool: 46 | """Validate Kafka topic name.""" 47 | if not topic: 48 | return False 49 | 50 | # Kafka topics have restrictions on characters 51 | return bool(re.match(r'^[a-zA-Z0-9._-]+$', topic)) 52 | 53 | def validate_session_id(session_id: str) -> bool: 54 | """Validate session ID for security.""" 55 | if not session_id: 56 | return True # Empty is valid 57 | 58 | # Only allow alphanumeric and some special chars 59 | return bool(re.match(r'^[a-zA-Z0-9._-]+$', session_id)) 60 | 61 | def create_ssl_context() -> ssl.SSLContext: 62 | """Create a secure SSL context for Kafka.""" 63 | context = ssl.create_default_context() 64 | context.check_hostname = True 65 | context.verify_mode = ssl.CERT_REQUIRED 66 | 67 | # Optional: Load custom CA certificate if specified 68 | ca_path = os.environ.get('KAFKA_CA_CERT') 69 | if ca_path and os.path.exists(ca_path): 70 | context.load_verify_locations(cafile=ca_path) 71 | 72 | return context 73 | 74 | def main(): 75 | # Load environment variables from .env file if it exists 76 | load_dotenv() 77 | 78 | # Parse command line arguments 79 | parser = argparse.ArgumentParser(description="Consume messages from Kafka topic where Pathik streamed content") 80 | parser.add_argument("--brokers", default=get_env_with_default("KAFKA_BROKERS", "localhost:9092"), 81 | help="Kafka brokers (comma-separated)") 82 | parser.add_argument("--topic", default=get_env_with_default("KAFKA_TOPIC", "pathik_crawl_data"), 83 | help="Kafka topic to consume from") 84 | parser.add_argument("--username", default=os.environ.get("KAFKA_USERNAME"), 85 | help="SASL username") 86 | parser.add_argument("--password", default=os.environ.get("KAFKA_PASSWORD"), 87 | help="SASL password") 88 | parser.add_argument("--type", choices=["html", "markdown"], 89 | help="Filter by content 
type (html or markdown)") 90 | parser.add_argument("--from-beginning", action="store_true", 91 | help="Consume from the beginning of the topic") 92 | parser.add_argument("--session", help="Filter messages by session ID") 93 | parser.add_argument("--use-ssl", action="store_true", default=(os.environ.get("KAFKA_USE_SSL", "false").lower() == "true"), 94 | help="Use SSL/TLS for Kafka connection") 95 | parser.add_argument("--max-content-size", type=int, default=int(os.environ.get("MAX_CONTENT_SIZE", "10000")), 96 | help="Maximum content size to display (in characters)") 97 | args = parser.parse_args() 98 | 99 | # Validate inputs 100 | if not validate_broker_string(args.brokers): 101 | print("Error: Invalid broker string format", file=sys.stderr) 102 | sys.exit(1) 103 | 104 | if not validate_topic_name(args.topic): 105 | print("Error: Invalid topic name", file=sys.stderr) 106 | sys.exit(1) 107 | 108 | if args.session and not validate_session_id(args.session): 109 | print("Error: Invalid session ID format", file=sys.stderr) 110 | sys.exit(1) 111 | 112 | if args.max_content_size < 0 or args.max_content_size > 100000: 113 | print("Error: Content size must be between 0 and 100000 characters", file=sys.stderr) 114 | sys.exit(1) 115 | 116 | # Configure consumer 117 | config: Dict[str, Any] = { 118 | 'bootstrap_servers': args.brokers.split(','), 119 | 'auto_offset_reset': 'earliest' if args.from_beginning else 'latest', 120 | 'group_id': 'pathik-example-consumer', 121 | 'value_deserializer': lambda x: x.decode('utf-8', errors='replace'), 122 | 'key_deserializer': lambda x: x.decode('utf-8', errors='replace'), 123 | 'enable_auto_commit': True, 124 | 'consumer_timeout_ms': 60000, # 60 seconds timeout 125 | } 126 | 127 | # Add SASL authentication if credentials provided 128 | if args.username and args.password: 129 | config.update({ 130 | 'security_protocol': 'SASL_SSL' if args.use_ssl else 'SASL_PLAINTEXT', 131 | 'sasl_mechanism': 'PLAIN', 132 | 'sasl_plain_username': args.username, 133 | 'sasl_plain_password': args.password, 134 | }) 135 | print("Using SASL authentication") 136 | 137 | # Add SSL if requested 138 | if args.use_ssl: 139 | ssl_context = create_ssl_context() 140 | config.update({ 141 | 'security_protocol': 'SSL' if not args.username else 'SASL_SSL', 142 | 'ssl_context': ssl_context, 143 | }) 144 | print("Using SSL/TLS encryption") 145 | 146 | # Print connection info 147 | print(f"Connecting to Kafka brokers: {args.brokers}") 148 | print(f"Consuming from topic: {args.topic}") 149 | if args.type: 150 | print(f"Filtering for content type: {args.type}") 151 | print(f"Starting from: {'beginning' if args.from_beginning else 'most recent'}") 152 | 153 | # Set up consumer 154 | try: 155 | consumer = KafkaConsumer(args.topic, **config) 156 | except Exception as e: 157 | print(f"Failed to connect to Kafka: {e}", file=sys.stderr) 158 | sys.exit(1) 159 | 160 | # Setup signal handler for graceful shutdown 161 | def handle_signal(sig, frame): 162 | print("\nShutting down gracefully...") 163 | consumer.close() 164 | sys.exit(0) 165 | 166 | signal.signal(signal.SIGINT, handle_signal) 167 | signal.signal(signal.SIGTERM, handle_signal) 168 | 169 | # Consume messages 170 | print("Consumer started. 
Press Ctrl+C to exit.") 171 | print("-----------------------------------------") 172 | 173 | try: 174 | for message in consumer: 175 | # Extract headers 176 | headers = {} 177 | for key, value in message.headers: 178 | try: 179 | headers[key] = value.decode('utf-8') 180 | except (AttributeError, UnicodeDecodeError): 181 | headers[key] = str(value) 182 | 183 | content_type = headers.get('contentType', '') 184 | session_id = headers.get('sessionID', '') 185 | 186 | # Skip if content type filter is set and doesn't match 187 | if args.type and args.type not in content_type.lower(): 188 | continue 189 | 190 | # Skip if session ID filter is set and doesn't match 191 | if args.session and args.session != session_id: 192 | continue 193 | 194 | # Display message 195 | print("-----------------------------------------") 196 | print(f"Partition: {message.partition}, Offset: {message.offset}") 197 | print(f"Key: {message.key}") 198 | print(f"URL: {headers.get('url', 'unknown')}") 199 | print(f"Content Type: {content_type}") 200 | print(f"Timestamp: {headers.get('timestamp', 'unknown')}") 201 | 202 | # Print preview of content with size limit 203 | content = message.value 204 | content_len = len(content) 205 | preview_len = min(content_len, args.max_content_size) 206 | preview = content[:preview_len] + "... [truncated]" if content_len > preview_len else content 207 | print(f"Content Preview ({content_len} bytes total):") 208 | print(preview) 209 | except Exception as e: 210 | print(f"Error consuming messages: {e}", file=sys.stderr) 211 | consumer.close() 212 | sys.exit(1) 213 | 214 | if __name__ == "__main__": 215 | main() -------------------------------------------------------------------------------- /examples/native_kafka_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Native Kafka Demo using pathik's built-in streaming capabilities 4 | 5 | This script demonstrates real-world usage of pathik's native Kafka 6 | streaming functionality without additional implementation. 7 | """ 8 | import os 9 | import sys 10 | import uuid 11 | import argparse 12 | from typing import List, Optional, Dict, Any 13 | 14 | # Add the parent directory to the path to ensure we can import pathik 15 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 16 | if parent_dir not in sys.path: 17 | sys.path.insert(0, parent_dir) 18 | 19 | from pathik import stream_to_kafka 20 | 21 | def stream_urls_to_kafka( 22 | urls: List[str], 23 | kafka_brokers: str = "localhost:9092", 24 | kafka_topic: str = "pathik_crawl_data", 25 | content_type: str = "both", 26 | session_id: Optional[str] = None, 27 | compression_type: Optional[str] = None, 28 | max_message_size: Optional[int] = None, 29 | buffer_memory: Optional[int] = None 30 | ) -> Dict[str, Any]: 31 | """ 32 | Stream a list of URLs to Kafka using pathik's native streaming functionality. 
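    A minimal call, assuming a broker reachable at localhost:9092, might look like:

        results = stream_urls_to_kafka(
            ["https://example.com"],
            kafka_topic="pathik_crawl_data",
            compression_type="gzip",
        )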
33 | 34 | Args: 35 | urls: List of URLs to crawl and stream 36 | kafka_brokers: Comma-separated list of Kafka brokers 37 | kafka_topic: Kafka topic to stream to 38 | content_type: Type of content to stream ('html', 'markdown', or 'both') 39 | session_id: Optional session ID (generated if not provided) 40 | compression_type: Compression algorithm to use ('gzip', 'snappy', 'lz4', 'zstd') 41 | max_message_size: Maximum message size in bytes 42 | buffer_memory: Kafka producer buffer memory in bytes 43 | 44 | Returns: 45 | Dictionary with streaming results 46 | """ 47 | # Set environment variables for Kafka configuration 48 | os.environ["KAFKA_BROKERS"] = kafka_brokers 49 | os.environ["KAFKA_TOPIC"] = kafka_topic 50 | 51 | # Generate a session ID if not provided 52 | if not session_id: 53 | session_id = str(uuid.uuid4()) 54 | 55 | print(f"Streaming {len(urls)} URLs to Kafka:") 56 | print(f"Kafka Brokers: {kafka_brokers}") 57 | print(f"Kafka Topic: {kafka_topic}") 58 | print(f"Content Type: {content_type}") 59 | print(f"Session ID: {session_id}") 60 | if compression_type: 61 | print(f"Compression: {compression_type}") 62 | if max_message_size: 63 | print(f"Max Message Size: {max_message_size} bytes") 64 | if buffer_memory: 65 | print(f"Buffer Memory: {buffer_memory} bytes") 66 | print("="*50) 67 | 68 | # Stream URLs to Kafka using pathik's built-in functionality 69 | results = stream_to_kafka( 70 | urls=urls, 71 | content_type=content_type, 72 | topic=kafka_topic, 73 | session=session_id, 74 | parallel=True, 75 | compression_type=compression_type, 76 | max_message_size=max_message_size, 77 | buffer_memory=buffer_memory 78 | ) 79 | 80 | # Return results 81 | return results 82 | 83 | def print_results(results: Dict[str, Any]) -> None: 84 | """ 85 | Print formatted results of the Kafka streaming operation. 
86 | 87 | Args: 88 | results: Results dictionary from stream_to_kafka 89 | """ 90 | if not results: 91 | print("No results returned") 92 | return 93 | 94 | # Print results 95 | print("\nStreaming Results:") 96 | print("="*50) 97 | successful = 0 98 | failed = 0 99 | 100 | for url, result in results.items(): 101 | status = "✅ Success" if result.get("success", False) else "❌ Failed" 102 | if result.get("success", False): 103 | successful += 1 104 | print(f"{status} - {url}") 105 | if "details" in result: 106 | details = result["details"] 107 | print(f" Topic: {details.get('topic')}") 108 | if "compression_type" in details: 109 | print(f" Compression: {details.get('compression_type')}") 110 | html_file = details.get('html_file', 'N/A') 111 | md_file = details.get('markdown_file', 'N/A') 112 | print(f" HTML Content: {os.path.basename(html_file) if html_file != 'N/A' else 'N/A'}") 113 | print(f" Markdown Content: {os.path.basename(md_file) if md_file != 'N/A' else 'N/A'}") 114 | else: 115 | failed += 1 116 | print(f"{status} - {url}") 117 | if "error" in result: 118 | print(f" Error: {result['error']}") 119 | 120 | print("\nSummary:") 121 | print(f"Successfully streamed: {successful}/{len(results)}") 122 | print(f"Failed to stream: {failed}/{len(results)}") 123 | 124 | # Get session ID from the first successful result 125 | session_id = None 126 | for result in results.values(): 127 | if result.get("success", False) and "details" in result: 128 | session_id = result["details"].get("session_id") 129 | break 130 | 131 | # Or get it from the first result's details if available 132 | if not session_id and len(results) > 0: 133 | url = list(results.keys())[0] 134 | result = results[url] 135 | if "details" in result: 136 | session_id = result["details"].get("session_id") 137 | 138 | if session_id: 139 | print("\nTo consume these messages from Kafka:") 140 | print(f" python examples/kafka_consumer.py --session={session_id}") 141 | 142 | def main(): 143 | # Set up argument parser 144 | parser = argparse.ArgumentParser(description="Stream web content to Kafka using pathik's native streaming capability") 145 | parser.add_argument("--urls", type=str, required=True, help="Comma-separated list of URLs to stream") 146 | parser.add_argument("--brokers", type=str, default="localhost:9092", help="Kafka broker list (comma-separated)") 147 | parser.add_argument("--topic", type=str, default="pathik_crawl_data", help="Kafka topic to stream to") 148 | parser.add_argument("--content", type=str, choices=["html", "markdown", "both"], default="both", 149 | help="Type of content to stream") 150 | parser.add_argument("--session", type=str, help="Session ID (generated if not provided)") 151 | parser.add_argument("--compression", type=str, choices=["gzip", "snappy", "lz4", "zstd"], 152 | help="Compression algorithm to use") 153 | parser.add_argument("--max-message-size", type=int, help="Maximum message size in bytes") 154 | parser.add_argument("--buffer-memory", type=int, help="Buffer memory in bytes for Kafka producer") 155 | 156 | args = parser.parse_args() 157 | 158 | # Parse URLs 159 | url_list = [url.strip() for url in args.urls.split(",")] 160 | 161 | # Stream URLs to Kafka 162 | results = stream_urls_to_kafka( 163 | urls=url_list, 164 | kafka_brokers=args.brokers, 165 | kafka_topic=args.topic, 166 | content_type=args.content, 167 | session_id=args.session, 168 | compression_type=args.compression, 169 | max_message_size=args.max_message_size, 170 | buffer_memory=args.buffer_memory 171 | ) 172 | 173 | # Print results 
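    # Assumed shape of `results`, inferred from how print_results() reads it
    # below (this is not a documented schema):
    #   {"https://example.com": {"success": True,
    #                            "details": {"topic": "...", "session_id": "..."}}}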
174 | print_results(results) 175 | 176 | if __name__ == "__main__": 177 | main() -------------------------------------------------------------------------------- /examples/news_aggregator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | A simple news aggregator that crawls news sites and streams content to Kafka 4 | using the Pathik API 5 | """ 6 | import pathik 7 | import uuid 8 | import time 9 | import datetime 10 | import os 11 | from typing import List, Dict, Any 12 | 13 | # List of news sites to crawl 14 | NEWS_SOURCES = [ 15 | "https://news.ycombinator.com", 16 | "https://lobste.rs", 17 | "https://www.techmeme.com", 18 | "https://news.google.com/technology", 19 | "https://techcrunch.com" 20 | ] 21 | 22 | def stream_news_sites(sources: List[str], topic: str = None) -> Dict[str, Dict[str, Any]]: 23 | """ 24 | Crawl and stream content from news sites to Kafka 25 | 26 | Args: 27 | sources: List of news site URLs 28 | topic: Optional custom Kafka topic 29 | 30 | Returns: 31 | Dictionary of results 32 | """ 33 | # Generate a session ID that includes timestamp for easy organization 34 | timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 35 | session_id = f"news_agg_{timestamp}" 36 | 37 | print(f"=== News Aggregator - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===") 38 | print(f"Session ID: {session_id}") 39 | print(f"Processing {len(sources)} news sources") 40 | 41 | # Use Pathik to stream content to Kafka 42 | results = pathik.stream_to_kafka( 43 | urls=sources, 44 | content_type="both", # Stream both HTML and markdown 45 | topic=topic, # Use custom topic if provided 46 | session=session_id, # Use our timestamped session ID 47 | parallel=True # Crawl sites in parallel for speed 48 | ) 49 | 50 | # Process results 51 | successful = [] 52 | failed = [] 53 | 54 | for url, status in results.items(): 55 | if status.get("success", False): 56 | successful.append(url) 57 | else: 58 | failed.append((url, status.get("error", "Unknown error"))) 59 | 60 | # Print summary 61 | print(f"\nSuccessfully crawled and streamed {len(successful)}/{len(sources)} sites") 62 | 63 | if failed: 64 | print("\nFailed sites:") 65 | for url, error in failed: 66 | print(f" - {url}: {error}") 67 | 68 | # Print session information for consumers 69 | print(f"\nKafka stream information:") 70 | print(f" - Session ID: {session_id}") 71 | print(f" - Topic: {topic or 'default (KAFKA_TOPIC from env)'}") 72 | 73 | return { 74 | "session_id": session_id, 75 | "timestamp": timestamp, 76 | "successful": successful, 77 | "failed": failed, 78 | "raw_results": results 79 | } 80 | 81 | def run_scheduled(interval_mins: int = 60, run_once: bool = False): 82 | """ 83 | Run the news aggregator on a schedule 84 | 85 | Args: 86 | interval_mins: Minutes between runs 87 | run_once: If True, only run once then exit 88 | """ 89 | topic = os.environ.get("NEWS_KAFKA_TOPIC", "pathik_news_feed") 90 | 91 | try: 92 | while True: 93 | print("\n" + "=" * 60) 94 | results = stream_news_sites(NEWS_SOURCES, topic=topic) 95 | 96 | if run_once: 97 | return results 98 | 99 | next_run = datetime.datetime.now() + datetime.timedelta(minutes=interval_mins) 100 | print(f"\nNext run scheduled at: {next_run.strftime('%Y-%m-%d %H:%M:%S')}") 101 | print("Press Ctrl+C to exit") 102 | 103 | time.sleep(interval_mins * 60) 104 | except KeyboardInterrupt: 105 | print("\nExiting news aggregator") 106 | 107 | if __name__ == "__main__": 108 | import argparse 109 | 110 | parser = 
argparse.ArgumentParser(description="News aggregator using Pathik API") 111 | parser.add_argument("--once", action="store_true", help="Run once and exit") 112 | parser.add_argument("--interval", type=int, default=60, 113 | help="Interval between runs in minutes (default: 60)") 114 | parser.add_argument("--topic", type=str, help="Custom Kafka topic") 115 | 116 | args = parser.parse_args() 117 | 118 | if args.topic: 119 | os.environ["NEWS_KAFKA_TOPIC"] = args.topic 120 | 121 | run_scheduled(interval_mins=args.interval, run_once=args.once) -------------------------------------------------------------------------------- /examples/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pathik-kafka-examples", 3 | "version": "1.0.0", 4 | "description": "Examples for Pathik Kafka streaming", 5 | "main": "kafka_consumer.js", 6 | "scripts": { 7 | "consumer": "node kafka_consumer.js" 8 | }, 9 | "dependencies": { 10 | "dotenv": "^16.3.1", 11 | "kafkajs": "^2.2.4", 12 | "yargs": "^17.7.2" 13 | }, 14 | "engines": { 15 | "node": ">=14.0.0" 16 | }, 17 | "author": "", 18 | "license": "MIT" 19 | } -------------------------------------------------------------------------------- /examples/pathik: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/examples/pathik -------------------------------------------------------------------------------- /examples/pathik_bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/examples/pathik_bin -------------------------------------------------------------------------------- /examples/simple_crawl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Simple example demonstrating Pathik's basic crawling 4 | """ 5 | import sys 6 | import os 7 | import uuid 8 | 9 | # Ensure we're using the local pathik module 10 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 11 | if parent_dir not in sys.path: 12 | sys.path.insert(0, parent_dir) 13 | 14 | # Now import pathik 15 | import pathik 16 | from pathik.crawler import crawl 17 | 18 | # URLs to crawl 19 | urls = ["https://example.com"] 20 | 21 | # Create a temporary directory for output 22 | import tempfile 23 | output_dir = tempfile.mkdtemp(prefix="pathik_simple_crawl_") 24 | print(f"Output directory: {output_dir}") 25 | 26 | # Use the basic crawl function 27 | results = crawl(urls=urls, output_dir=output_dir, parallel=False) 28 | 29 | # Print results 30 | print("\nCrawl Results:") 31 | print("--------------") 32 | for url, info in results.items(): 33 | if "error" in info: 34 | print(f"❌ {url}: Error - {info['error']}") 35 | else: 36 | print(f"✅ {url}: Success") 37 | print(f" HTML: {info.get('html', 'Not found')}") 38 | print(f" Markdown: {info.get('markdown', 'Not found')}") 39 | 40 | # Clean up 41 | print(f"\nFiles in {output_dir}:") 42 | for file in os.listdir(output_dir): 43 | print(f" {file}") 44 | 45 | # Optionally display file contents 46 | for url, info in results.items(): 47 | if "html" in info and os.path.exists(info["html"]): 48 | with open(info["html"], "r", encoding="utf-8") as f: 49 | contents = f.read() 50 | preview = contents[:100] + "..." 
if len(contents) > 100 else contents 51 | print(f"\nHTML preview for {url}:") 52 | print(preview) -------------------------------------------------------------------------------- /examples/simple_kafka_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Simple example showing real Kafka streaming with Pathik 4 | """ 5 | import sys 6 | import os 7 | import uuid 8 | 9 | # Ensure we're using the local pathik module 10 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 11 | if parent_dir not in sys.path: 12 | sys.path.insert(0, parent_dir) 13 | 14 | import pathik 15 | 16 | def run_example(): 17 | # Generate a session ID 18 | session_id = str(uuid.uuid4()) 19 | print(f"Generated session ID: {session_id}") 20 | 21 | # Set the Kafka topic through environment variable 22 | os.environ["KAFKA_TOPIC"] = "pathik_crawl_data" 23 | 24 | # URL to crawl and stream 25 | url = "https://httpbin.org/html" 26 | print(f"Crawling and streaming {url} to Kafka...") 27 | 28 | try: 29 | # Stream the content to Kafka 30 | result = pathik.stream_to_kafka( 31 | urls=url, 32 | content_type="both", 33 | session=session_id 34 | ) 35 | 36 | # Print the result 37 | print("\nResult:") 38 | print("=======") 39 | print(f"URL: {url}") 40 | if result[url]["success"]: 41 | print("Status: Success") 42 | if "details" in result[url]: 43 | details = result[url]["details"] 44 | print(f"Topic: {details.get('topic')}") 45 | print(f"HTML file: {details.get('html_file')}") 46 | print(f"Markdown file: {details.get('markdown_file')}") 47 | else: 48 | print(f"Status: Failed - {result[url].get('error', 'Unknown error')}") 49 | 50 | print("\nTo consume these messages from Kafka:") 51 | print(f" python examples/kafka_consumer.py --session={session_id}") 52 | 53 | return result 54 | 55 | except Exception as e: 56 | print(f"Error: {e}") 57 | import traceback 58 | traceback.print_exc() 59 | 60 | if __name__ == "__main__": 61 | run_example() -------------------------------------------------------------------------------- /examples/test_secure_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Secure Kafka streaming test using our fixed implementation with customizable buffer sizes. 
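A typical invocation, assuming a broker on localhost:9092 and a pathik_bin binary sitting next to this script, might look like:

    python examples/test_secure_kafka.py --urls=https://example.com \
        --max-message-size=20971520 --buffer-memory=209715200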
4 | """ 5 | import os 6 | import sys 7 | import uuid 8 | import argparse 9 | import subprocess 10 | from typing import List, Optional, Dict, Any 11 | import time 12 | 13 | # Force use of the local binary 14 | LOCAL_BINARY = os.path.join(os.path.dirname(os.path.abspath(__file__)), "pathik_bin") 15 | 16 | def stream_to_kafka_direct( 17 | urls: List[str], 18 | kafka_brokers: str = "localhost:9092", 19 | kafka_topic: str = "pathik_crawl_data", 20 | content_type: str = "both", 21 | session_id: Optional[str] = None, 22 | max_message_size: int = 10 * 1024 * 1024, # 10MB 23 | buffer_memory: int = 100 * 1024 * 1024, # 100MB 24 | max_request_size: int = 20 * 1024 * 1024 # 20MB 25 | ) -> Dict[str, Any]: 26 | """ 27 | Stream directly using our secured binary with buffer size customization 28 | """ 29 | if not os.path.exists(LOCAL_BINARY): 30 | raise FileNotFoundError(f"Local binary not found at {LOCAL_BINARY}") 31 | 32 | # Ensure binary is executable 33 | if not os.access(LOCAL_BINARY, os.X_OK): 34 | os.chmod(LOCAL_BINARY, 0o755) 35 | 36 | # Generate a session ID if not provided 37 | if not session_id: 38 | session_id = str(uuid.uuid4()) 39 | 40 | # Set environment variables for Kafka configuration 41 | env = os.environ.copy() 42 | env["KAFKA_BROKERS"] = kafka_brokers 43 | env["KAFKA_TOPIC"] = kafka_topic 44 | 45 | # Add buffer size configurations 46 | env["KAFKA_MAX_MESSAGE_SIZE"] = str(max_message_size) 47 | env["KAFKA_BUFFER_MEMORY"] = str(buffer_memory) 48 | env["KAFKA_MAX_REQUEST_SIZE"] = str(max_request_size) 49 | 50 | # Build command 51 | cmd = [ 52 | LOCAL_BINARY, 53 | "-kafka", 54 | "-content", content_type, 55 | "-session", session_id 56 | ] 57 | 58 | # Add URLs 59 | cmd.extend(urls) 60 | 61 | print(f"Running command: {' '.join(cmd)}") 62 | result = subprocess.run(cmd, env=env, capture_output=True, text=True) 63 | 64 | if result.returncode != 0: 65 | print(f"Error running command: {result.stderr}") 66 | raise RuntimeError(f"Command failed with exit code {result.returncode}") 67 | 68 | print(result.stdout) 69 | 70 | # Format results 71 | results = {} 72 | successful = 0 73 | for url in urls: 74 | results[url] = { 75 | "success": True, 76 | "details": { 77 | "topic": kafka_topic, 78 | "session_id": session_id 79 | } 80 | } 81 | successful += 1 82 | 83 | return { 84 | "results": results, 85 | "success_count": successful, 86 | "failed_count": len(urls) - successful, 87 | "session_id": session_id 88 | } 89 | 90 | def main(): 91 | # Set up argument parser 92 | parser = argparse.ArgumentParser(description="Test secure Kafka streaming with custom buffer sizes") 93 | parser.add_argument("--urls", type=str, required=True, help="Comma-separated list of URLs to stream") 94 | parser.add_argument("--brokers", type=str, default="localhost:9092", help="Kafka broker list (comma-separated)") 95 | parser.add_argument("--topic", type=str, default="pathik_crawl_data", help="Kafka topic to stream to") 96 | parser.add_argument("--content", type=str, choices=["html", "markdown", "both"], default="both", 97 | help="Type of content to stream") 98 | parser.add_argument("--session", type=str, help="Session ID (generated if not provided)") 99 | 100 | # Add buffer size customization 101 | parser.add_argument("--max-message-size", type=int, default=10 * 1024 * 1024, 102 | help="Maximum message size in bytes (default: 10MB)") 103 | parser.add_argument("--buffer-memory", type=int, default=100 * 1024 * 1024, 104 | help="Producer buffer memory in bytes (default: 100MB)") 105 | parser.add_argument("--max-request-size", 
type=int, default=20 * 1024 * 1024, 106 | help="Maximum request size in bytes (default: 20MB)") 107 | 108 | args = parser.parse_args() 109 | 110 | # Parse URLs 111 | url_list = [url.strip() for url in args.urls.split(",")] 112 | 113 | # Generate a session ID if not provided 114 | session_id = args.session 115 | if not session_id: 116 | session_id = str(uuid.uuid4()) 117 | 118 | # Display config information 119 | print(f"Streaming {len(url_list)} URLs to Kafka with SECURE implementation:") 120 | print(f"Kafka Brokers: {args.brokers}") 121 | print(f"Kafka Topic: {args.topic}") 122 | print(f"Content Type: {args.content}") 123 | print(f"Session ID: {session_id}") 124 | print(f"Max Message Size: {args.max_message_size:,} bytes") 125 | print(f"Buffer Memory: {args.buffer_memory:,} bytes") 126 | print(f"Max Request Size: {args.max_request_size:,} bytes") 127 | print("="*50) 128 | 129 | # Stream to Kafka with our secure implementation 130 | try: 131 | start_time = time.time() 132 | response = stream_to_kafka_direct( 133 | urls=url_list, 134 | kafka_brokers=args.brokers, 135 | kafka_topic=args.topic, 136 | content_type=args.content, 137 | session_id=session_id, 138 | max_message_size=args.max_message_size, 139 | buffer_memory=args.buffer_memory, 140 | max_request_size=args.max_request_size 141 | ) 142 | elapsed_time = time.time() - start_time 143 | 144 | results = response["results"] 145 | successful = response["success_count"] 146 | failed = response["failed_count"] 147 | 148 | print("\nStreaming Results:") 149 | print("="*50) 150 | 151 | for url, result in results.items(): 152 | status = "✅ Success" if result.get("success", False) else "❌ Failed" 153 | if result.get("success", False): 154 | print(f"{status} - {url}") 155 | if "details" in result: 156 | details = result["details"] 157 | print(f" Topic: {details.get('topic')}") 158 | else: 159 | print(f"{status} - {url}") 160 | if "error" in result: 161 | print(f" Error: {result['error']}") 162 | 163 | print("\nSummary:") 164 | print(f"Successfully streamed: {successful}/{len(url_list)}") 165 | print(f"Failed to stream: {failed}/{len(url_list)}") 166 | print(f"Total time: {elapsed_time:.2f} seconds") 167 | 168 | print("\nTo consume these messages from Kafka:") 169 | print(f" python examples/kafka_consumer_direct.py --session={session_id}") 170 | 171 | except Exception as e: 172 | print(f"Error: {e}") 173 | sys.exit(1) 174 | 175 | if __name__ == "__main__": 176 | main() -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module pathik 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | github.com/JohannesKaufmann/html-to-markdown v1.6.0 7 | github.com/aws/aws-sdk-go-v2 v1.36.3 8 | github.com/aws/aws-sdk-go-v2/config v1.29.8 9 | github.com/aws/aws-sdk-go-v2/credentials v1.17.61 10 | github.com/aws/aws-sdk-go-v2/service/s3 v1.78.0 11 | github.com/go-rod/rod v0.116.2 12 | github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 13 | github.com/joho/godotenv v1.5.1 14 | github.com/segmentio/kafka-go v0.4.47 15 | ) 16 | 17 | require ( 18 | github.com/klauspost/compress v1.15.9 // indirect 19 | github.com/pierrec/lz4/v4 v4.1.15 // indirect 20 | ) 21 | 22 | require ( 23 | github.com/PuerkitoBio/goquery v1.9.2 // indirect 24 | github.com/andybalholm/cascadia v1.3.3 // indirect 25 | github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect 26 | github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.10 // indirect 27 | 
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 // indirect 28 | github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 // indirect 29 | github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 // indirect 30 | github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect 31 | github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.34 // indirect 32 | github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect 33 | github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.2 // indirect 34 | github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect 35 | github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.15 // indirect 36 | github.com/aws/aws-sdk-go-v2/service/sso v1.25.0 // indirect 37 | github.com/aws/aws-sdk-go-v2/service/ssooidc v1.29.0 // indirect 38 | github.com/aws/aws-sdk-go-v2/service/sts v1.33.16 // indirect 39 | github.com/aws/smithy-go v1.22.2 // indirect 40 | github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect 41 | github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect 42 | github.com/ysmood/fetchup v0.2.3 // indirect 43 | github.com/ysmood/goob v0.4.0 // indirect 44 | github.com/ysmood/got v0.40.0 // indirect 45 | github.com/ysmood/gson v0.7.3 // indirect 46 | github.com/ysmood/leakless v0.9.0 // indirect 47 | golang.org/x/net v0.36.0 // indirect 48 | golang.org/x/text v0.22.0 // indirect 49 | golang.org/x/time v0.11.0 // indirect 50 | ) 51 | -------------------------------------------------------------------------------- /new-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Colors for output 4 | GREEN='\033[0;32m' 5 | YELLOW='\033[1;33m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' # No Color 8 | 9 | # Function to print messages with color 10 | print_message() { 11 | echo -e "${GREEN}$1${NC}" 12 | } 13 | 14 | print_warning() { 15 | echo -e "${YELLOW}$1${NC}" 16 | } 17 | 18 | print_error() { 19 | echo -e "${RED}$1${NC}" 20 | } 21 | 22 | # Get the latest tag 23 | get_latest_tag() { 24 | git fetch --tags 25 | latest_tag=$(git tag -l "v*" --sort=-v:refname | head -n 1) 26 | 27 | if [ -z "$latest_tag" ]; then 28 | print_warning "No existing version tags found. Starting with v0.1.0." 29 | latest_tag="v0.1.0" 30 | else 31 | print_message "Latest version tag: $latest_tag" 32 | fi 33 | 34 | # Strip the 'v' prefix for version calculations 35 | latest_version=${latest_tag#v} 36 | } 37 | 38 | # Increment version 39 | increment_version() { 40 | # Split version into parts 41 | IFS='.' read -r -a version_parts <<< "$latest_version" 42 | 43 | major=${version_parts[0]} 44 | minor=${version_parts[1]} 45 | patch=${version_parts[2]} 46 | 47 | case $1 in 48 | major) 49 | major=$((major + 1)) 50 | minor=0 51 | patch=0 52 | ;; 53 | minor) 54 | minor=$((minor + 1)) 55 | patch=0 56 | ;; 57 | patch|*) 58 | patch=$((patch + 1)) 59 | ;; 60 | esac 61 | 62 | new_version="$major.$minor.$patch" 63 | new_tag="v$new_version" 64 | } 65 | 66 | # Main script 67 | 68 | # Check if we're in a git repository 69 | if ! git rev-parse --is-inside-work-tree > /dev/null 2>&1; then 70 | print_error "Not in a git repository!" 71 | exit 1 72 | fi 73 | 74 | # Get the latest tag 75 | get_latest_tag 76 | 77 | # Process command line arguments 78 | if [ $# -eq 0 ]; then 79 | # No arguments - show interactive menu 80 | echo "Current version: $latest_tag" 81 | echo "What would you like to do?" 
82 | echo "1) Increment patch version (default)" 83 | echo "2) Increment minor version" 84 | echo "3) Increment major version" 85 | echo "4) Specify exact version" 86 | 87 | read -p "Select option [1-4] (default: 1): " option 88 | 89 | case $option in 90 | 2) increment_version minor ;; 91 | 3) increment_version major ;; 92 | 4) 93 | read -p "Enter version (without 'v' prefix): " user_version 94 | new_version=$user_version 95 | new_tag="v$new_version" 96 | ;; 97 | *) increment_version patch ;; 98 | esac 99 | else 100 | # Version provided as command line argument 101 | if [[ $1 == v* ]]; then 102 | new_tag=$1 103 | new_version=${new_tag#v} 104 | else 105 | new_version=$1 106 | new_tag="v$new_version" 107 | fi 108 | fi 109 | 110 | print_message "Creating new version tag: $new_tag" 111 | 112 | # Check if there are uncomitted changes 113 | if [ -n "$(git status --porcelain)" ]; then 114 | print_warning "You have uncommitted changes. It's recommended to commit them before creating a new release." 115 | read -p "Continue anyway? (y/n): " continue_choice 116 | if [[ $continue_choice != "y" && $continue_choice != "Y" ]]; then 117 | print_message "Aborting. Please commit your changes first." 118 | exit 0 119 | fi 120 | fi 121 | 122 | # Create and push the tag 123 | print_message "Creating tag $new_tag with message 'Release $new_tag'..." 124 | git tag -a "$new_tag" -m "Release $new_tag" 125 | 126 | print_message "Pushing tag to remote..." 127 | git push origin "$new_tag" 128 | 129 | print_message "Version $new_tag has been created and pushed!" 130 | print_message "GitHub Actions workflow should start automatically." 131 | print_message "Check the progress at: https://github.com/$(git remote get-url origin | sed 's/.*github.com[:/]\(.*\).git/\1/')/actions" -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pathik", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "": { 7 | "dependencies": { 8 | "pathik": "^0.2.0" 9 | } 10 | }, 11 | "node_modules/commander": { 12 | "version": "11.1.0", 13 | "resolved": "https://registry.npmjs.org/commander/-/commander-11.1.0.tgz", 14 | "integrity": "sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ==", 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">=16" 18 | } 19 | }, 20 | "node_modules/pathik": { 21 | "version": "0.2.0", 22 | "resolved": "https://registry.npmjs.org/pathik/-/pathik-0.2.0.tgz", 23 | "integrity": "sha512-hNZ7XvrphFsRhGPl+hPGEZj6hRmQwSfL6Ox6k8c3rykOuw/canPRoC9FMIc3WvwlxuBBcrAtyzzMrvW5m1BTTg==", 24 | "hasInstallScript": true, 25 | "license": "Apache-2.0", 26 | "dependencies": { 27 | "commander": "^11.1.0", 28 | "uuid": "^9.0.1" 29 | }, 30 | "bin": { 31 | "pathik": "bin/pathik-cli.js" 32 | }, 33 | "engines": { 34 | "node": ">=14.0.0" 35 | } 36 | }, 37 | "node_modules/uuid": { 38 | "version": "9.0.1", 39 | "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", 40 | "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", 41 | "funding": [ 42 | "https://github.com/sponsors/broofa", 43 | "https://github.com/sponsors/ctavan" 44 | ], 45 | "license": "MIT", 46 | "bin": { 47 | "uuid": "dist/bin/uuid" 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /package.json: 
-------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /parallel_test.js: -------------------------------------------------------------------------------- 1 | const pathik = require('pathik'); 2 | const path = require('path'); 3 | const fs = require('fs'); 4 | 5 | async function main() { 6 | // Create output directories 7 | const baseDir = path.resolve('./output_test_js'); 8 | const parallelDir = path.join(baseDir, 'parallel'); 9 | const sequentialDir = path.join(baseDir, 'sequential'); 10 | 11 | fs.mkdirSync(parallelDir, { recursive: true }); 12 | fs.mkdirSync(sequentialDir, { recursive: true }); 13 | 14 | // Test URLs - mix of different sites 15 | const urls = [ 16 | 'https://example.com', 17 | 'https://httpbin.org/html', 18 | 'https://jsonplaceholder.typicode.com', 19 | 'https://books.toscrape.com', 20 | 'https://quotes.toscrape.com' 21 | ]; 22 | 23 | console.log(`Testing with ${urls.length} URLs...`); 24 | 25 | // Test parallel crawling 26 | console.log('\n=== PARALLEL CRAWLING ==='); 27 | const parallelStart = Date.now(); 28 | 29 | try { 30 | const parallelResults = await pathik.crawl(urls, { 31 | outputDir: parallelDir, 32 | parallel: true 33 | }); 34 | 35 | const parallelTime = (Date.now() - parallelStart) / 1000; 36 | console.log(`Parallel crawling completed in ${parallelTime.toFixed(2)} seconds`); 37 | 38 | // Print results summary 39 | for (const [url, info] of Object.entries(parallelResults)) { 40 | const status = info.html && info.markdown ? '✅ Success' : '❌ Failed'; 41 | console.log(` ${url}: ${status}`); 42 | } 43 | 44 | // Test sequential crawling 45 | console.log('\n=== SEQUENTIAL CRAWLING ==='); 46 | const sequentialStart = Date.now(); 47 | 48 | try { 49 | const sequentialResults = await pathik.crawl(urls, { 50 | outputDir: sequentialDir, 51 | parallel: false 52 | }); 53 | 54 | const sequentialTime = (Date.now() - sequentialStart) / 1000; 55 | console.log(`Sequential crawling completed in ${sequentialTime.toFixed(2)} seconds`); 56 | 57 | // Print results summary 58 | for (const [url, info] of Object.entries(sequentialResults)) { 59 | const status = info.html && info.markdown ? 
'✅ Success' : '❌ Failed'; 60 | console.log(` ${url}: ${status}`); 61 | } 62 | 63 | // Compare performance 64 | console.log('\n=== PERFORMANCE COMPARISON ==='); 65 | if (parallelTime < sequentialTime) { 66 | const speedup = sequentialTime / parallelTime; 67 | console.log(`Parallel crawling was ${speedup.toFixed(2)}x faster than sequential crawling`); 68 | } else { 69 | console.log(`Warning: Parallel crawling was not faster in this test`); 70 | } 71 | 72 | console.log(`Parallel: ${parallelTime.toFixed(2)}s vs Sequential: ${sequentialTime.toFixed(2)}s`); 73 | console.log(`\nOutput files are located in: ${baseDir}`); 74 | } catch (error) { 75 | console.error(`Error during sequential crawling: ${error.message}`); 76 | process.exit(1); 77 | } 78 | } catch (error) { 79 | console.error(`Error during parallel crawling: ${error.message}`); 80 | process.exit(1); 81 | } 82 | } 83 | 84 | main().catch(console.error); -------------------------------------------------------------------------------- /parallel_test.py: -------------------------------------------------------------------------------- 1 | import pathik 2 | import os 3 | import time 4 | import sys 5 | 6 | def main(): 7 | # Create output directories 8 | base_dir = os.path.abspath("./output_test") 9 | parallel_dir = os.path.join(base_dir, "parallel") 10 | sequential_dir = os.path.join(base_dir, "sequential") 11 | 12 | os.makedirs(parallel_dir, exist_ok=True) 13 | os.makedirs(sequential_dir, exist_ok=True) 14 | 15 | # Test URLs - mix of different sites 16 | urls = [ 17 | "https://example.com", 18 | "https://httpbin.org/html", 19 | "https://jsonplaceholder.typicode.com", 20 | "https://books.toscrape.com", 21 | "https://quotes.toscrape.com" 22 | ] 23 | 24 | print(f"Testing with {len(urls)} URLs...") 25 | 26 | # Test parallel crawling 27 | print("\n=== PARALLEL CRAWLING ===") 28 | parallel_start = time.time() 29 | 30 | try: 31 | parallel_results = pathik.crawl(urls, output_dir=parallel_dir, parallel=True) 32 | parallel_time = time.time() - parallel_start 33 | 34 | print(f"Parallel crawling completed in {parallel_time:.2f} seconds") 35 | 36 | # Print results summary 37 | for url, info in parallel_results.items(): 38 | status = "✅ Success" if info.get("html") and info.get("markdown") else "❌ Failed" 39 | print(f" {url}: {status}") 40 | except Exception as e: 41 | print(f"Error during parallel crawling: {e}") 42 | sys.exit(1) 43 | 44 | # Test sequential crawling 45 | print("\n=== SEQUENTIAL CRAWLING ===") 46 | sequential_start = time.time() 47 | 48 | try: 49 | sequential_results = pathik.crawl(urls, output_dir=sequential_dir, parallel=False) 50 | sequential_time = time.time() - sequential_start 51 | 52 | print(f"Sequential crawling completed in {sequential_time:.2f} seconds") 53 | 54 | # Print results summary 55 | for url, info in sequential_results.items(): 56 | status = "✅ Success" if info.get("html") and info.get("markdown") else "❌ Failed" 57 | print(f" {url}: {status}") 58 | except Exception as e: 59 | print(f"Error during sequential crawling: {e}") 60 | sys.exit(1) 61 | 62 | # Compare performance 63 | if parallel_time < sequential_time: 64 | speedup = sequential_time / parallel_time 65 | print(f"\n=== PERFORMANCE COMPARISON ===") 66 | print(f"Parallel crawling was {speedup:.2f}x faster than sequential crawling") 67 | print(f"Parallel: {parallel_time:.2f}s vs Sequential: {sequential_time:.2f}s") 68 | else: 69 | print("\n=== PERFORMANCE COMPARISON ===") 70 | print(f"Warning: Parallel crawling was not faster in this test") 71 | print(f"Parallel: 
{parallel_time:.2f}s vs Sequential: {sequential_time:.2f}s") 72 | 73 | print(f"\nOutput files are located in: {base_dir}") 74 | 75 | if __name__ == "__main__": 76 | main() -------------------------------------------------------------------------------- /pathik-js/README.md: -------------------------------------------------------------------------------- 1 | # Pathik for Node.js 2 | 3 | High-performance web crawler implemented in Go with JavaScript bindings. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | npm install pathik 9 | ``` 10 | 11 | ## Prerequisites 12 | 13 | - Node.js 14+ 14 | - Go 1.16+ (for building the binary) 15 | 16 | ## Usage 17 | 18 | ### Basic Crawling 19 | 20 | ```javascript 21 | const pathik = require('pathik'); 22 | const path = require('path'); 23 | const fs = require('fs'); 24 | 25 | // Create output directory 26 | const outputDir = path.resolve('./output_data'); 27 | fs.mkdirSync(outputDir, { recursive: true }); 28 | 29 | // List of URLs to crawl 30 | const urls = [ 31 | 'https://example.com', 32 | 'https://news.ycombinator.com' 33 | ]; 34 | 35 | // Crawl the URLs 36 | pathik.crawl(urls, { outputDir }) 37 | .then(results => { 38 | console.log('Crawling results:'); 39 | 40 | for (const [url, files] of Object.entries(results)) { 41 | console.log(`URL: ${url}`); 42 | console.log(`HTML file: ${files.html}`); 43 | console.log(`Markdown file: ${files.markdown}`); 44 | } 45 | }) 46 | .catch(error => { 47 | console.error(`Error during crawling: ${error.message}`); 48 | }); 49 | ``` 50 | 51 | ### Parallel Crawling 52 | 53 | Pathik supports parallel crawling by default, making it very efficient for batch processing: 54 | 55 | ```javascript 56 | const pathik = require('pathik'); 57 | 58 | // List of many URLs to crawl in parallel 59 | const urls = [ 60 | 'https://example.com', 61 | 'https://news.ycombinator.com', 62 | 'https://github.com', 63 | 'https://developer.mozilla.org', 64 | 'https://wikipedia.org' 65 | ]; 66 | 67 | // Crawl multiple URLs in parallel (default behavior) 68 | pathik.crawl(urls, { outputDir: './output' }) 69 | .then(results => { 70 | console.log(`Successfully crawled ${Object.keys(results).length} URLs`); 71 | }); 72 | 73 | // Disable parallel crawling if needed 74 | pathik.crawl(urls, { 75 | outputDir: './output', 76 | parallel: false // Process sequentially 77 | }) 78 | .then(results => { 79 | console.log(`Successfully crawled ${Object.keys(results).length} URLs sequentially`); 80 | }); 81 | ``` 82 | 83 | ### R2 Upload 84 | 85 | ```javascript 86 | const pathik = require('pathik'); 87 | 88 | // Crawl and upload to R2 89 | pathik.crawlToR2(['https://example.com'], { uuid: 'my-unique-id' }) 90 | .then(results => { 91 | console.log('R2 Upload results:'); 92 | 93 | for (const [url, info] of Object.entries(results)) { 94 | console.log(`URL: ${url}`); 95 | console.log(`R2 HTML key: ${info.r2_html_key}`); 96 | console.log(`R2 Markdown key: ${info.r2_markdown_key}`); 97 | } 98 | }) 99 | .catch(error => { 100 | console.error(`Error during R2 upload: ${error.message}`); 101 | }); 102 | ``` 103 | 104 | ### Kafka Streaming 105 | 106 | Stream crawled content directly to Kafka: 107 | 108 | ```javascript 109 | const pathik = require('pathik'); 110 | const crypto = require('crypto'); 111 | 112 | // Generate a session ID to identify this batch 113 | const sessionId = crypto.randomUUID(); 114 | 115 | // Crawl and stream to Kafka 116 | pathik.streamToKafka(['https://example.com'], { 117 | contentType: 'both', // 'html', 'markdown', or 'both' 118 | session: sessionId, // Session 
ID for batch identification 119 | topic: 'custom-topic' // Optional custom topic (uses KAFKA_TOPIC env var by default) 120 | }) 121 | .then(results => { 122 | console.log('Kafka streaming results:'); 123 | 124 | for (const [url, result] of Object.entries(results)) { 125 | if (result.success) { 126 | console.log(`✓ ${url}: Successfully streamed to Kafka`); 127 | } else { 128 | console.log(`✗ ${url}: Failed - ${result.error}`); 129 | } 130 | } 131 | }) 132 | .catch(error => { 133 | console.error(`Error during Kafka streaming: ${error.message}`); 134 | }); 135 | ``` 136 | 137 | ### Command-line Interface 138 | 139 | ```bash 140 | # Install globally 141 | npm install -g pathik 142 | 143 | # Crawl URLs 144 | pathik crawl https://example.com https://news.ycombinator.com -o ./output 145 | 146 | # Crawl multiple URLs in parallel (default) 147 | pathik crawl https://example.com https://news.ycombinator.com https://github.com -o ./output 148 | 149 | # Disable parallel crawling 150 | pathik crawl https://example.com https://news.ycombinator.com --no-parallel -o ./output 151 | 152 | # Crawl and upload to R2 153 | pathik r2 https://example.com -u my-unique-id 154 | 155 | # Stream to Kafka 156 | pathik kafka https://example.com --session my-session-id --content both 157 | ``` 158 | 159 | ## API 160 | 161 | ### pathik.crawl(urls, options) 162 | 163 | Crawl URLs and save content locally. 164 | 165 | - `urls`: String or array of URLs to crawl 166 | - `options`: Object with crawl options 167 | - `outputDir`: Directory to save output (uses temp dir if null) 168 | - `parallel`: Boolean to enable/disable parallel crawling (default: true) 169 | - Returns: Promise resolving to an object mapping URLs to file paths 170 | 171 | ### pathik.crawlToR2(urls, options) 172 | 173 | Crawl URLs and upload content to R2. 174 | 175 | - `urls`: String or array of URLs to crawl 176 | - `options`: Object with R2 options 177 | - `uuid`: UUID to prefix filenames (generates random UUID if null) 178 | - `parallel`: Boolean to enable/disable parallel crawling (default: true) 179 | - Returns: Promise resolving to an object mapping URLs to R2 keys 180 | 181 | ### pathik.streamToKafka(urls, options) 182 | 183 | Crawl URLs and stream content to Kafka. 
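A minimal call sketch for reference (the session value below is illustrative; the broker and topic fall back to the environment variables listed later in this README):

```javascript
const pathik = require('pathik');

// Stream markdown for a single URL, tagging messages with an illustrative session ID.
pathik.streamToKafka('https://example.com', { contentType: 'markdown', session: 'example-session' })
  .then(results => {
    for (const [url, result] of Object.entries(results)) {
      console.log(url, result.success ? 'streamed' : `failed: ${result.error}`);
    }
  })
  .catch(err => console.error(err.message));
```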
184 | 185 | - `urls`: String or array of URLs to crawl 186 | - `options`: Object with Kafka options 187 | - `contentType`: Type of content to stream: 'html', 'markdown', or 'both' (default: 'both') 188 | - `topic`: Custom Kafka topic (uses KAFKA_TOPIC env var if not specified) 189 | - `session`: Session ID for identifying this batch of messages 190 | - `parallel`: Boolean to enable/disable parallel crawling (default: true) 191 | - Returns: Promise resolving to an object mapping URLs to streaming status 192 | 193 | ## Environment Variables 194 | 195 | ### For R2 Storage 196 | 197 | - `R2_ACCOUNT_ID`: Cloudflare account ID 198 | - `R2_ACCESS_KEY_ID`: Cloudflare R2 access key ID 199 | - `R2_ACCESS_KEY_SECRET`: Cloudflare R2 access key secret 200 | - `R2_BUCKET_NAME`: Cloudflare R2 bucket name 201 | 202 | ### For Kafka Streaming 203 | 204 | - `KAFKA_BROKERS`: Comma-separated list of Kafka brokers (default: 'localhost:9092') 205 | - `KAFKA_TOPIC`: Topic to publish messages to (default: 'pathik_crawl_data') 206 | - `KAFKA_USERNAME`: Username for SASL authentication (optional) 207 | - `KAFKA_PASSWORD`: Password for SASL authentication (optional) 208 | - `KAFKA_CLIENT_ID`: Client ID for Kafka producer (optional) 209 | - `KAFKA_USE_TLS`: Whether to use TLS connection (true/false, optional) 210 | 211 | ## Building the Binary 212 | 213 | If the Go binary isn't built automatically during installation: 214 | 215 | ```bash 216 | npm run build-binary 217 | ``` 218 | 219 | ## Troubleshooting 220 | 221 | ### Missing Binary 222 | 223 | ```bash 224 | npm run build-binary 225 | ``` 226 | 227 | ### Import Errors 228 | 229 | ```bash 230 | npm uninstall -y pathik 231 | cd pathik-js && npm install 232 | ``` 233 | 234 | ## Performance 235 | 236 | Pathik's concurrent crawling is powered by Go's goroutines, making it significantly more memory-efficient than browser automation tools: 237 | 238 | - Uses ~10x less memory than Playwright 239 | - Efficiently processes large batches of URLs 240 | - Parallelism controlled by the Go binary (default: 5 concurrent crawls) 241 | 242 | ## License 243 | 244 | Apache 2.0 -------------------------------------------------------------------------------- /pathik-js/benchmark/create-comparison-chart.js: -------------------------------------------------------------------------------- 1 | const { ChartJSNodeCanvas } = require('chartjs-node-canvas'); 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | 5 | async function createMemoryComparisonChart() { 6 | // Configuration for the chart 7 | const width = 800; 8 | const height = 400; 9 | const chartCallback = (ChartJS) => { 10 | // Optional callback to customize Chart.js instance 11 | ChartJS.defaults.font.family = 'Arial'; 12 | ChartJS.defaults.font.size = 16; 13 | }; 14 | 15 | // Create canvas 16 | const chartJSNodeCanvas = new ChartJSNodeCanvas({ width, height, chartCallback }); 17 | 18 | // Actual memory usage for example.com from the benchmark data 19 | const pathikMemory = 0.45; // MB for example.com 20 | const playwrightMemory = 4.64; // MB for example.com 21 | 22 | // Calculate the ratio 23 | const ratio = Math.round(playwrightMemory / pathikMemory); 24 | 25 | // Chart configuration 26 | const configuration = { 27 | type: 'bar', 28 | data: { 29 | labels: ['Memory Usage for example.com'], 30 | datasets: [ 31 | { 32 | label: 'Pathik', 33 | data: [pathikMemory.toFixed(2)], 34 | backgroundColor: 'rgba(54, 162, 235, 0.8)', 35 | borderColor: 'rgba(54, 162, 235, 1)', 36 | borderWidth: 1 37 | }, 38 | { 39 | label: 
'Playwright', 40 | data: [playwrightMemory.toFixed(2)], 41 | backgroundColor: 'rgba(255, 99, 132, 0.8)', 42 | borderColor: 'rgba(255, 99, 132, 1)', 43 | borderWidth: 1 44 | } 45 | ] 46 | }, 47 | options: { 48 | plugins: { 49 | title: { 50 | display: true, 51 | text: `Memory usage - ${ratio}x less`, 52 | font: { 53 | size: 24 54 | }, 55 | padding: 20 56 | }, 57 | legend: { 58 | position: 'bottom' 59 | } 60 | }, 61 | scales: { 62 | y: { 63 | beginAtZero: true, 64 | title: { 65 | display: true, 66 | text: 'Memory Usage (MB)' 67 | } 68 | } 69 | } 70 | } 71 | }; 72 | 73 | // Render the chart 74 | const imageBuffer = await chartJSNodeCanvas.renderToBuffer(configuration); 75 | 76 | // Save the image 77 | const outputDir = path.join(__dirname, 'results'); 78 | if (!fs.existsSync(outputDir)) { 79 | fs.mkdirSync(outputDir, { recursive: true }); 80 | } 81 | 82 | const outputPath = path.join(outputDir, 'memory-comparison.png'); 83 | fs.writeFileSync(outputPath, imageBuffer); 84 | 85 | console.log(`Chart saved to: ${outputPath}`); 86 | console.log(`Memory usage for example.com: Pathik ${pathikMemory.toFixed(2)}MB vs Playwright ${playwrightMemory.toFixed(2)}MB`); 87 | console.log(`For example.com, Playwright uses ${ratio}x more memory than Pathik`); 88 | } 89 | 90 | createMemoryComparisonChart().catch(console.error); -------------------------------------------------------------------------------- /pathik-js/benchmark/results/benchmark_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "platform": "darwin", 4 | "release": "24.1.0", 5 | "node": "v22.6.0", 6 | "cpus": 14, 7 | "memory": 48, 8 | "date": "2025-03-04T15:32:03.711Z" 9 | }, 10 | "singleResults": [ 11 | { 12 | "crawler": "pathik", 13 | "url": "https://example.com", 14 | "time": 4.722, 15 | "memory": 0.453125, 16 | "contentSize": 0.2080078125, 17 | "success": true 18 | }, 19 | { 20 | "crawler": "playwright", 21 | "url": "https://example.com", 22 | "time": 1.889, 23 | "memory": 4.640625, 24 | "contentSize": 0.287109375, 25 | "success": true 26 | }, 27 | { 28 | "crawler": "pathik", 29 | "url": "https://httpbin.org/html", 30 | "time": 5.68, 31 | "memory": 0.5, 32 | "contentSize": 3.515625, 33 | "success": true 34 | }, 35 | { 36 | "crawler": "playwright", 37 | "url": "https://httpbin.org/html", 38 | "time": 2.198, 39 | "memory": 1.453125, 40 | "contentSize": 3.5908203125, 41 | "success": true 42 | }, 43 | { 44 | "crawler": "pathik", 45 | "url": "https://jsonplaceholder.typicode.com", 46 | "time": 4.415, 47 | "memory": 0.421875, 48 | "contentSize": 2.6396484375, 49 | "success": true 50 | }, 51 | { 52 | "crawler": "playwright", 53 | "url": "https://jsonplaceholder.typicode.com", 54 | "time": 2.275, 55 | "memory": 1.875, 56 | "contentSize": 1.88671875, 57 | "success": true 58 | } 59 | ], 60 | "batchResults": [ 61 | { 62 | "crawler": "pathik", 63 | "totalTime": 30.001, 64 | "avgTimePerUrl": 6.0002, 65 | "memory": 1, 66 | "memoryPerUrl": 0.2, 67 | "successRate": 80, 68 | "throughput": 0.1333288890370321, 69 | "urlCount": 5, 70 | "successCount": 4 71 | }, 72 | { 73 | "crawler": "playwright", 74 | "totalTime": 5.257, 75 | "avgTimePerUrl": 1.0514, 76 | "memory": 4.984375, 77 | "memoryPerUrl": 0.996875, 78 | "successRate": 100, 79 | "throughput": 0.9511128019783147, 80 | "urlCount": 5, 81 | "successCount": 5 82 | } 83 | ] 84 | } -------------------------------------------------------------------------------- /pathik-js/benchmark/results/memory-comparison.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik-js/benchmark/results/memory-comparison.png -------------------------------------------------------------------------------- /pathik-js/bin/pathik-cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * Command-line interface for Pathik 4 | */ 5 | 6 | const { program } = require('commander'); 7 | const path = require('path'); 8 | const fs = require('fs'); 9 | const pathik = require('../src/index'); 10 | 11 | program 12 | .name('pathik') 13 | .description('High-performance web crawler implemented in Go') 14 | .version('0.3.0'); 15 | 16 | program 17 | .command('crawl') 18 | .description('Crawl URLs and save content locally') 19 | .argument('<urls...>', 'URLs to crawl') 20 | .option('-o, --output <dir>', 'Output directory', './output') 21 | .option('-p, --parallel', 'Use parallel crawling (default: true)') 22 | .option('--no-parallel', 'Disable parallel crawling') 23 | .action(async (urls, options) => { 24 | try { 25 | console.log(`Crawling ${urls.length} URLs to ${options.output}...`); 26 | 27 | // Ensure output directory exists 28 | fs.mkdirSync(options.output, { recursive: true }); 29 | 30 | // Crawl URLs 31 | const results = await pathik.crawl(urls, { 32 | outputDir: options.output, 33 | parallel: options.parallel !== false 34 | }); 35 | 36 | // Print results 37 | console.log('\nCrawling results:'); 38 | for (const [url, files] of Object.entries(results)) { 39 | console.log(`\nURL: ${url}`); 40 | 41 | if (files.error) { 42 | console.log(`Error: ${files.error}`); 43 | continue; 44 | } 45 | 46 | console.log(`HTML file: ${files.html || 'Not found'}`); 47 | console.log(`Markdown file: ${files.markdown || 'Not found'}`); 48 | 49 | // Print sample markdown content 50 | if (files.markdown && fs.existsSync(files.markdown)) { 51 | const content = fs.readFileSync(files.markdown, 'utf-8').slice(0, 500); 52 | console.log('\nSample markdown content:'); 53 | console.log(`${content}...`); 54 | } else { 55 | console.log('WARNING: Markdown file not found or empty!'); 56 | } 57 | } 58 | } catch (error) { 59 | console.error(`Error: ${error.message}`); 60 | process.exit(1); 61 | } 62 | }); 63 | 64 | program 65 | .command('r2') 66 | .description('Crawl URLs and upload content to R2') 67 | .argument('<urls...>', 'URLs to crawl') 68 | .option('-u, --uuid <id>', 'UUID for R2 upload') 69 | .option('-p, --parallel', 'Use parallel crawling (default: true)') 70 | .option('--no-parallel', 'Disable parallel crawling') 71 | .action(async (urls, options) => { 72 | try { 73 | console.log(`Crawling and uploading ${urls.length} URLs to R2...`); 74 | 75 | // Crawl and upload 76 | const results = await pathik.crawlToR2(urls, { 77 | uuid: options.uuid, 78 | parallel: options.parallel !== false 79 | }); 80 | 81 | // Print results 82 | console.log('\nR2 Upload results:'); 83 | for (const [url, info] of Object.entries(results)) { 84 | console.log(`\nURL: ${url}`); 85 | 86 | if (info.error) { 87 | console.log(`Error: ${info.error}`); 88 | continue; 89 | } 90 | 91 | console.log(`UUID: ${info.uuid}`); 92 | console.log(`R2 HTML key: ${info.r2_html_key}`); 93 | console.log(`R2 Markdown key: ${info.r2_markdown_key}`); 94 | console.log(`Local HTML file: ${info.local_html_file || 'Not found'}`); 95 | console.log(`Local Markdown file: ${info.local_markdown_file || 'Not found'}`); 96 | } 97 | } catch (error) { 98 |
console.error(`Error: ${error.message}`); 99 | process.exit(1); 100 | } 101 | }); 102 | 103 | program 104 | .command('kafka') 105 | .description('Crawl URLs and stream content to Kafka') 106 | .argument('<urls...>', 'URLs to crawl and stream') 107 | .option('-c, --content <type>', 'Content type to stream: html, markdown, or both', 'both') 108 | .option('-t, --topic <topic>', 'Kafka topic to stream to') 109 | .option('-s, --session <id>', 'Session ID for multi-user environments') 110 | .option('-p, --parallel', 'Use parallel crawling (default: true)') 111 | .option('--no-parallel', 'Disable parallel crawling') 112 | .action(async (urls, options) => { 113 | try { 114 | // Validate content type 115 | if (!['html', 'markdown', 'both'].includes(options.content)) { 116 | console.error('Error: Content type must be "html", "markdown", or "both"'); 117 | process.exit(1); 118 | } 119 | 120 | console.log(`Streaming ${urls.length} URLs to Kafka...`); 121 | console.log(`Content type: ${options.content}`); 122 | 123 | if (options.topic) { 124 | console.log(`Topic: ${options.topic}`); 125 | } 126 | 127 | if (options.session) { 128 | console.log(`Session ID: ${options.session}`); 129 | } 130 | 131 | // Stream to Kafka 132 | const results = await pathik.streamToKafka(urls, { 133 | contentType: options.content, 134 | topic: options.topic, 135 | session: options.session, 136 | parallel: options.parallel !== false 137 | }); 138 | 139 | // Print results 140 | console.log('\nKafka streaming results:'); 141 | let successCount = 0; 142 | let failCount = 0; 143 | 144 | for (const [url, result] of Object.entries(results)) { 145 | if (result.success) { 146 | console.log(`✓ ${url}: Successfully streamed to Kafka`); 147 | successCount++; 148 | } else { 149 | console.log(`✗ ${url}: Failed to stream to Kafka - ${result.error}`); 150 | failCount++; 151 | } 152 | } 153 | 154 | console.log(`\nSummary: ${successCount} succeeded, ${failCount} failed`); 155 | 156 | if (options.session) { 157 | console.log(`\nTo consume these messages, filter by session ID: ${options.session}`); 158 | } 159 | } catch (error) { 160 | console.error(`Error: ${error.message}`); 161 | process.exit(1); 162 | } 163 | }); 164 | 165 | program.parse(); -------------------------------------------------------------------------------- /pathik-js/bin/pathik_bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik-js/bin/pathik_bin -------------------------------------------------------------------------------- /pathik-js/examples/basic.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Basic example of using Pathik 3 | */ 4 | 5 | const pathik = require('../src/index'); 6 | const path = require('path'); 7 | const fs = require('fs'); 8 | 9 | // Create output directory 10 | const outputDir = path.resolve('./output_data'); 11 | if (!fs.existsSync(outputDir)) { 12 | fs.mkdirSync(outputDir, { recursive: true }); 13 | } 14 | console.log(`Output directory: ${outputDir}`); 15 | 16 | // List of URLs to crawl 17 | const urls = [ 18 | 'https://example.com', 19 | 'https://news.ycombinator.com' 20 | ]; 21 | 22 | // Crawl the URLs 23 | console.log(`Crawling ${urls.length} URLs...`); 24 | 25 | pathik.crawl(urls, { outputDir }) 26 | .then(results => { 27 | console.log('\nCrawling results:'); 28 | 29 | for (const [url, files] of Object.entries(results)) { 30 | console.log(`\nURL: ${url}`); 31 | console.log(`HTML file:
${files.html}`); 32 | console.log(`Markdown file: ${files.markdown}`); 33 | 34 | // Print sample content from markdown file 35 | if (files.markdown && fs.existsSync(files.markdown)) { 36 | const content = fs.readFileSync(files.markdown, 'utf-8').slice(0, 500); 37 | console.log('\nSample markdown content:'); 38 | console.log(`${content}...`); 39 | } else { 40 | console.log('WARNING: Markdown file not found or empty!'); 41 | } 42 | } 43 | }) 44 | .catch(error => { 45 | console.error(`Error during crawling: ${error.message}`); 46 | }); -------------------------------------------------------------------------------- /pathik-js/examples/kafka.js: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pathik-js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pathik", 3 | "version": "0.3.0", 4 | "description": "High-performance web crawler implemented in Go with JavaScript bindings", 5 | "main": "src/index.js", 6 | "types": "types/index.d.ts", 7 | "scripts": { 8 | "postinstall": "node scripts/install.js", 9 | "test": "node examples/basic.js", 10 | "build-binary": "node scripts/build.js" 11 | }, 12 | "keywords": [ 13 | "crawler", 14 | "web-scraping", 15 | "go", 16 | "high-performance", 17 | "markdown", 18 | "kafka", 19 | "streaming" 20 | ], 21 | "author": "Rach Pradhan", 22 | "license": "Apache-2.0", 23 | "engines": { 24 | "node": ">=14.0.0" 25 | }, 26 | "dependencies": { 27 | "commander": "^11.1.0", 28 | "uuid": "^9.0.1" 29 | }, 30 | "devDependencies": { 31 | "chai": "^4.3.10", 32 | "mocha": "^10.2.0", 33 | "@types/node": "^18.0.0" 34 | }, 35 | "files": [ 36 | "src", 37 | "bin", 38 | "scripts", 39 | "types", 40 | "README.md" 41 | ] 42 | } -------------------------------------------------------------------------------- /pathik-js/scripts/build.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * Script to build the Go binary for pathik 4 | */ 5 | 6 | const { execSync } = require('child_process'); 7 | const path = require('path'); 8 | const fs = require('fs'); 9 | const os = require('os'); 10 | 11 | // Determine binary name based on platform 12 | const binaryName = os.platform() === 'win32' ? 
'pathik_bin.exe' : 'pathik_bin'; 13 | const binDir = path.join(__dirname, '..', 'bin'); 14 | const binaryPath = path.join(binDir, binaryName); 15 | 16 | // Create bin directory if it doesn't exist 17 | if (!fs.existsSync(binDir)) { 18 | fs.mkdirSync(binDir, { recursive: true }); 19 | } 20 | 21 | console.log('Building Go binary for Pathik...'); 22 | 23 | try { 24 | // Look for the Go binary in parent directory 25 | const mainGoPath = path.join(__dirname, '..', '..', 'main.go'); 26 | 27 | if (!fs.existsSync(mainGoPath)) { 28 | console.error(`Error: main.go not found at expected location: ${mainGoPath}`); 29 | console.log('Please ensure you have the Go source files in the parent directory.'); 30 | process.exit(1); 31 | } 32 | 33 | // Build the Go binary 34 | const buildCommand = `go build -o "${binaryPath}" "${mainGoPath}"`; 35 | console.log(`Running: ${buildCommand}`); 36 | 37 | execSync(buildCommand, { stdio: 'inherit' }); 38 | 39 | console.log(`Go binary built successfully: ${binaryPath}`); 40 | console.log('You can now use the package:'); 41 | console.log(' const pathik = require("pathik");'); 42 | } catch (error) { 43 | console.error('Failed to build Go binary:', error.message); 44 | process.exit(1); 45 | } -------------------------------------------------------------------------------- /pathik-js/scripts/install.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * Post-installation script for pathik 4 | */ 5 | 6 | const { execSync } = require('child_process'); 7 | const path = require('path'); 8 | const fs = require('fs'); 9 | const os = require('os'); 10 | 11 | // Determine binary name based on platform 12 | const binaryName = os.platform() === 'win32' ? 'pathik_bin.exe' : 'pathik_bin'; 13 | const binDir = path.join(__dirname, '..', 'bin'); 14 | const binaryPath = path.join(binDir, binaryName); 15 | 16 | // Create bin directory if it doesn't exist 17 | if (!fs.existsSync(binDir)) { 18 | fs.mkdirSync(binDir, { recursive: true }); 19 | } 20 | 21 | console.log('Pathik post-installation setup...'); 22 | 23 | // Check if binary already exists 24 | if (fs.existsSync(binaryPath)) { 25 | console.log(`Binary already exists at: ${binaryPath}`); 26 | process.exit(0); 27 | } 28 | 29 | try { 30 | // Look for the Go binary in parent directory 31 | const mainGoPath = path.join(__dirname, '..', '..', 'main.go'); 32 | 33 | if (!fs.existsSync(mainGoPath)) { 34 | console.log(`Info: main.go not found at expected location: ${mainGoPath}`); 35 | console.log('You may need to build the binary manually with "npm run build-binary"'); 36 | process.exit(0); 37 | } 38 | 39 | // Build the Go binary 40 | console.log('Building Go binary...'); 41 | const buildCommand = `go build -o "${binaryPath}" "${mainGoPath}"`; 42 | execSync(buildCommand, { stdio: 'inherit' }); 43 | 44 | console.log(`Go binary built successfully: ${binaryPath}`); 45 | } catch (error) { 46 | console.log('Note: Could not automatically build the Go binary.'); 47 | console.log('This is normal if Go is not installed or the source is not available.'); 48 | console.log('You can build it manually by running "npm run build-binary"'); 49 | } -------------------------------------------------------------------------------- /pathik-js/src/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Pathik - High-Performance Web Crawler 3 | * JavaScript bindings for the Pathik Go crawler 4 | */ 5 | 6 | const crawler = require('./crawler'); 7 | 8 | 
/** 9 | * @module pathik 10 | */ 11 | 12 | /** 13 | * Crawl a URL or list of URLs and save the content locally 14 | * 15 | * @param {string|string[]} urls - URL or array of URLs to crawl 16 | * @param {Object} options - Crawling options 17 | * @param {string} [options.output] - Output directory 18 | * @param {boolean} [options.parallel=true] - Whether to use parallel crawling 19 | * @returns {Promise} - Object mapping URLs to file paths 20 | */ 21 | function crawl(urls, options = {}) { 22 | return crawler.crawl(urls, options); 23 | } 24 | 25 | /** 26 | * Crawl a URL or list of URLs and upload content to R2 27 | * 28 | * @param {string|string[]} urls - URL or array of URLs to crawl 29 | * @param {Object} options - Upload options 30 | * @param {string} [options.uuid] - UUID for the upload 31 | * @param {string} [options.output] - Output directory 32 | * @param {boolean} [options.parallel=true] - Whether to use parallel crawling 33 | * @returns {Promise} - Object mapping URLs to R2 keys 34 | */ 35 | function crawlToR2(urls, options = {}) { 36 | return crawler.crawlToR2(urls, options); 37 | } 38 | 39 | /** 40 | * Stream crawled content from a URL or list of URLs to Kafka 41 | * 42 | * @param {string|string[]} urls - URL or array of URLs to crawl and stream 43 | * @param {Object} options - Kafka streaming options 44 | * @param {boolean} [options.parallel=true] - Whether to use parallel crawling 45 | * @param {string} [options.contentType='both'] - Content type to stream: 'html', 'markdown', or 'both' 46 | * @param {string} [options.topic=null] - Kafka topic to stream to (uses KAFKA_TOPIC env var if null) 47 | * @param {string} [options.session=null] - Session ID for multi-user environments 48 | * @returns {Promise} Result mapping URLs to streaming status 49 | */ 50 | function streamToKafka(urls, options = {}) { 51 | return crawler.streamToKafka(urls, options); 52 | } 53 | 54 | module.exports = { 55 | crawl, 56 | crawlToR2, 57 | streamToKafka 58 | }; -------------------------------------------------------------------------------- /pathik-js/src/utils.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Pathik utility functions 3 | */ 4 | 5 | const fs = require('fs'); 6 | const path = require('path'); 7 | const { exec } = require('child_process'); 8 | const os = require('os'); 9 | 10 | /** 11 | * Get the path to the pathik binary 12 | * 13 | * @returns {Promise} Path to the binary 14 | */ 15 | async function getBinaryPath() { 16 | // Determine binary name based on platform 17 | const binaryName = os.platform() === 'win32' ? 
'pathik_bin.exe' : 'pathik_bin'; 18 | 19 | // Check in the package's bin directory first 20 | const packageBinPath = path.join(__dirname, '..', 'bin', binaryName); 21 | if (fs.existsSync(packageBinPath)) { 22 | return packageBinPath; 23 | } 24 | 25 | // Check in node_modules/.bin 26 | const nodeModulesBinPath = path.join(__dirname, '..', '..', '.bin', binaryName); 27 | if (fs.existsSync(nodeModulesBinPath)) { 28 | return nodeModulesBinPath; 29 | } 30 | 31 | // Check if it's available in PATH 32 | try { 33 | const { error, stdout } = await execPromise(`which ${binaryName}`); 34 | if (!error && stdout) { 35 | return stdout.trim(); 36 | } 37 | } catch (err) { 38 | // Ignore and try next location 39 | } 40 | 41 | // If binary not found, try to build it 42 | console.log('Binary not found, attempting to build it...'); 43 | try { 44 | await buildBinary(); 45 | 46 | // Check if build was successful 47 | if (fs.existsSync(packageBinPath)) { 48 | return packageBinPath; 49 | } 50 | } catch (err) { 51 | console.error('Failed to build binary:', err); 52 | } 53 | 54 | throw new Error(`Pathik binary not found. Please run 'npm run build-binary' first.`); 55 | } 56 | 57 | /** 58 | * Build the Go binary 59 | * 60 | * @returns {Promise} 61 | */ 62 | async function buildBinary() { 63 | return new Promise((resolve, reject) => { 64 | exec('node scripts/build.js', (error, stdout, stderr) => { 65 | if (error) { 66 | console.error(`Build error: ${stderr}`); 67 | return reject(error); 68 | } 69 | console.log(stdout); 70 | resolve(); 71 | }); 72 | }); 73 | } 74 | 75 | /** 76 | * Promisified exec function 77 | * 78 | * @param {string} command - Command to execute 79 | * @returns {Promise} Object with stdout and stderr 80 | */ 81 | function execPromise(command) { 82 | return new Promise((resolve, reject) => { 83 | exec(command, (error, stdout, stderr) => { 84 | resolve({ error, stdout, stderr }); 85 | }); 86 | }); 87 | } 88 | 89 | module.exports = { 90 | getBinaryPath, 91 | buildBinary 92 | }; -------------------------------------------------------------------------------- /pathik-js/types/index.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Type definitions for pathik package 3 | */ 4 | 5 | declare module 'pathik' { 6 | /** 7 | * Result object for crawled URL 8 | */ 9 | export interface CrawlResult { 10 | html: string; 11 | markdown: string; 12 | error?: string; 13 | } 14 | 15 | /** 16 | * Result object for R2 upload 17 | */ 18 | export interface R2UploadResult { 19 | uuid: string; 20 | r2_html_key: string; 21 | r2_markdown_key: string; 22 | local_html_file: string; 23 | local_markdown_file: string; 24 | error?: string; 25 | } 26 | 27 | /** 28 | * Result object for Kafka streaming operation 29 | */ 30 | export interface KafkaStreamResult { 31 | success: boolean; 32 | error?: string; 33 | } 34 | 35 | /** 36 | * Options for crawl function 37 | */ 38 | export interface CrawlOptions { 39 | outputDir?: string; 40 | parallel?: boolean; 41 | } 42 | 43 | /** 44 | * Options for crawlToR2 function 45 | */ 46 | export interface R2Options { 47 | uuid?: string; 48 | outputDir?: string; 49 | parallel?: boolean; 50 | } 51 | 52 | /** 53 | * Options for streamToKafka function 54 | */ 55 | export interface KafkaOptions { 56 | parallel?: boolean; 57 | contentType?: 'html' | 'markdown' | 'both'; 58 | topic?: string; 59 | session?: string; 60 | } 61 | 62 | /** 63 | * Crawl a URL or list of URLs and save the content locally 64 | * 65 | * @param urls - A single URL or array of URLs to 
crawl 66 | * @param options - Crawl options 67 | * @returns Promise resolving to an object mapping URLs to file paths 68 | */ 69 | export function crawl( 70 | urls: string | string[], 71 | options?: CrawlOptions 72 | ): Promise<Record<string, CrawlResult>>; 73 | 74 | /** 75 | * Crawl a URL or list of URLs and upload the content to R2 76 | * 77 | * @param urls - A single URL or array of URLs to crawl 78 | * @param options - R2 crawl options 79 | * @returns Promise resolving to an object mapping URLs to R2 keys 80 | */ 81 | export function crawlToR2( 82 | urls: string | string[], 83 | options?: R2Options 84 | ): Promise<Record<string, R2UploadResult>>; 85 | 86 | /** 87 | * Stream crawled content from a URL or list of URLs to Kafka 88 | * 89 | * @param urls - A single URL or array of URLs to crawl and stream 90 | * @param options - Kafka streaming options 91 | * @returns Promise resolving to an object mapping URLs to streaming status 92 | */ 93 | export function streamToKafka( 94 | urls: string | string[], 95 | options?: KafkaOptions 96 | ): Promise<Record<string, KafkaStreamResult>>; 97 | } -------------------------------------------------------------------------------- /pathik.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | MANIFEST.in 3 | README.md 4 | build_binary.py 5 | go.mod 6 | go.sum 7 | main.go 8 | setup.py 9 | pathik/__init__.py 10 | pathik/cli.py 11 | pathik/crawler.py 12 | pathik/safe_api.py 13 | pathik/schema.py 14 | pathik/simple.py 15 | pathik.egg-info/PKG-INFO 16 | pathik.egg-info/SOURCES.txt 17 | pathik.egg-info/dependency_links.txt 18 | pathik.egg-info/entry_points.txt 19 | pathik.egg-info/requires.txt 20 | pathik.egg-info/top_level.txt 21 | pathik/bin/.gitkeep 22 | pathik/bin/pathik_crawler 23 | pathik/bin/darwin_amd64/pathik_bin 24 | pathik/bin/darwin_arm64/pathik_bin 25 | pathik/bin/linux_amd64/pathik_bin 26 | pathik/bin/linux_arm64/pathik_bin 27 | pathik/bin/windows_amd64/pathik_bin.exe -------------------------------------------------------------------------------- /pathik.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pathik.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | pathik = pathik.cli:main 3 | -------------------------------------------------------------------------------- /pathik.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | requests>=2.25.0 2 | tqdm>=4.50.0 3 | satya>=0.2.6 4 | 5 | [kafka] 6 | kafka-python>=2.0.0 7 | -------------------------------------------------------------------------------- /pathik.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pathik 2 | -------------------------------------------------------------------------------- /pathik/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /pathik/__pycache__/cli.cpython-312.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/__pycache__/cli.cpython-312.pyc -------------------------------------------------------------------------------- /pathik/__pycache__/crawler.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/__pycache__/crawler.cpython-312.pyc -------------------------------------------------------------------------------- /pathik/__pycache__/safe_api.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/__pycache__/safe_api.cpython-312.pyc -------------------------------------------------------------------------------- /pathik/__pycache__/schema.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/__pycache__/schema.cpython-312.pyc -------------------------------------------------------------------------------- /pathik/bin/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/bin/.gitkeep -------------------------------------------------------------------------------- /pathik/bin/darwin_amd64/pathik_bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/bin/darwin_amd64/pathik_bin -------------------------------------------------------------------------------- /pathik/bin/darwin_arm64/pathik_bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/bin/darwin_arm64/pathik_bin -------------------------------------------------------------------------------- /pathik/bin/linux_amd64/pathik_bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/bin/linux_amd64/pathik_bin -------------------------------------------------------------------------------- /pathik/bin/linux_arm64/pathik_bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/bin/linux_arm64/pathik_bin -------------------------------------------------------------------------------- /pathik/bin/pathik_crawler: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/bin/pathik_crawler -------------------------------------------------------------------------------- /pathik/bin/windows_amd64/pathik_bin.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik/bin/windows_amd64/pathik_bin.exe -------------------------------------------------------------------------------- /pathik/simple.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Simple implementation of the pathik functions that doesn't rely on Go. 3 | """ 4 | import os 5 | import requests 6 | import tempfile 7 | import uuid 8 | from typing import List, Dict, Optional 9 | from bs4 import BeautifulSoup 10 | import markdownify 11 | 12 | def crawl(urls: List[str], output_dir: Optional[str] = None) -> Dict[str, Dict[str, str]]: 13 | """Simple Python-based crawler that doesn't use Go""" 14 | if not urls: 15 | raise ValueError("No URLs provided") 16 | 17 | # Create output directory if it doesn't exist 18 | if output_dir is None: 19 | output_dir = tempfile.mkdtemp(prefix="pathik_") 20 | else: 21 | os.makedirs(output_dir, exist_ok=True) 22 | 23 | results = {} 24 | for url in urls: 25 | print(f"Crawling {url}...") 26 | try: 27 | # Fetch HTML 28 | response = requests.get(url, headers={ 29 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0" 30 | }) 31 | response.raise_for_status() 32 | html = response.text 33 | 34 | # Save HTML 35 | domain = url.replace("https://", "").replace("http://", "").replace("/", "_") 36 | html_file = os.path.join(output_dir, f"{domain}.html") 37 | with open(html_file, "w", encoding="utf-8") as f: 38 | f.write(html) 39 | 40 | # Extract content and convert to markdown 41 | soup = BeautifulSoup(html, "html.parser") 42 | content = soup.find("body") 43 | markdown = markdownify.markdownify(str(content)) if content else "" 44 | 45 | # Save markdown 46 | md_file = os.path.join(output_dir, f"{domain}.md") 47 | with open(md_file, "w", encoding="utf-8") as f: 48 | f.write(markdown) 49 | 50 | results[url] = {"html": html_file, "markdown": md_file} 51 | except Exception as e: 52 | print(f"Error crawling {url}: {e}") 53 | results[url] = {"html": "", "markdown": ""} 54 | 55 | return results 56 | 57 | def crawl_to_r2(urls: List[str], uuid_str: Optional[str] = None) -> Dict[str, Dict[str, str]]: 58 | """Simple version that just uses local storage""" 59 | if uuid_str is None: 60 | uuid_str = str(uuid.uuid4()) 61 | 62 | results = crawl(urls) 63 | # Just return local paths since we're not using R2 64 | return {url: { 65 | "uuid": uuid_str, 66 | "r2_html_key": "", 67 | "r2_markdown_key": "", 68 | "local_html_file": files["html"], 69 | "local_markdown_file": files["markdown"] 70 | } for url, files in results.items()} -------------------------------------------------------------------------------- /pathik_bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justrach/pathik/ce3af3455925e1af57e2408cb124c6d8ea181325/pathik_bin -------------------------------------------------------------------------------- /publish_to_pypi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Script to build and publish the pathik package to PyPI 4 | """ 5 | import os 6 | import sys 7 | import subprocess 8 | import shutil 9 | import argparse 10 | 11 | def run_command(cmd, description=None): 12 | """Run a command and exit on failure""" 13 | if description: 14 | print(f"\n=== {description} ===") 15 | 16 | print(f"Running: {' '.join(cmd)}") 17 | result = subprocess.run(cmd) 18 | 19 | if result.returncode != 0: 20 | print(f"Error: Command failed with code {result.returncode}") 21 | sys.exit(1) 22 | 23 | return result 24 | 25 | def main(): 26 | """Main function to build and publish the package""" 27 | parser = 
argparse.ArgumentParser(description="Build and publish pathik to PyPI") 28 | parser.add_argument("--test", action="store_true", help="Upload to TestPyPI instead of PyPI") 29 | parser.add_argument("--skip-build-binary", action="store_true", help="Skip building Go binaries") 30 | parser.add_argument("--skip-clean", action="store_true", help="Skip cleaning build directories") 31 | args = parser.parse_args() 32 | 33 | # Clean build directories if needed 34 | if not args.skip_clean: 35 | print("\n=== Cleaning build directories ===") 36 | dirs_to_clean = ["build", "dist", "pathik.egg-info"] 37 | for dir_name in dirs_to_clean: 38 | if os.path.exists(dir_name): 39 | print(f"Removing {dir_name}") 40 | shutil.rmtree(dir_name) 41 | 42 | # Build Go binaries if needed 43 | if not args.skip_build_binary: 44 | run_command( 45 | ["python", "build_binary.py", "--all"], 46 | "Building Go binaries for all platforms" 47 | ) 48 | 49 | # Build Python package 50 | run_command( 51 | ["python", "setup.py", "sdist", "bdist_wheel"], 52 | "Building source distribution and wheel" 53 | ) 54 | 55 | # Check package with twine 56 | run_command( 57 | ["twine", "check", "dist/*"], 58 | "Checking package with twine" 59 | ) 60 | 61 | # Upload to PyPI or TestPyPI 62 | if args.test: 63 | run_command( 64 | ["twine", "upload", "--repository-url", "https://test.pypi.org/legacy/", "dist/*"], 65 | "Uploading to TestPyPI" 66 | ) 67 | print("\nPackage uploaded to TestPyPI. Install with:") 68 | print("pip install --index-url https://test.pypi.org/simple/ pathik") 69 | else: 70 | run_command( 71 | ["twine", "upload", "dist/*"], 72 | "Uploading to PyPI" 73 | ) 74 | print("\nPackage uploaded to PyPI. Install with:") 75 | print("pip install pathik") 76 | 77 | print("\nDone!") 78 | 79 | if __name__ == "__main__": 80 | main() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import shutil 5 | import platform 6 | from setuptools import setup, find_packages 7 | from setuptools.command.install import install 8 | from setuptools.command.develop import develop 9 | from setuptools.command.sdist import sdist 10 | from setuptools.command.bdist_wheel import bdist_wheel 11 | 12 | VERSION = '0.3.11' 13 | 14 | def detect_platform(): 15 | """Detect the current platform more reliably""" 16 | # Determine OS 17 | os_name = platform.system().lower() 18 | if os_name.startswith('win'): 19 | os_name = 'windows' 20 | elif os_name.startswith('lin'): 21 | os_name = 'linux' 22 | elif os_name == 'darwin': 23 | os_name = 'darwin' 24 | 25 | # Docker/container detection for Linux 26 | if os.path.exists("/proc/1/cgroup") or os.path.exists("/.dockerenv"): 27 | print("Container environment detected, forcing OS to Linux") 28 | os_name = 'linux' 29 | 30 | # Determine architecture 31 | arch = platform.machine().lower() 32 | if arch in ('x86_64', 'amd64'): 33 | arch = 'amd64' 34 | elif arch in ('arm64', 'aarch64'): 35 | arch = 'arm64' 36 | elif arch in ('i386', 'i686', 'x86'): 37 | arch = '386' 38 | 39 | print(f"Detected platform: {os_name}_{arch}") 40 | return os_name, arch 41 | 42 | def build_go_binary(target_os=None, target_arch=None): 43 | """Build the Go binary for the specified platform or current platform if not specified""" 44 | # If no platform specified, use current 45 | if target_os is None or target_arch is None: 46 | current_os, current_arch = detect_platform() 47 | target_os = target_os 
or current_os 48 | target_arch = target_arch or current_arch 49 | 50 | print(f"Building Go binary for {target_os}_{target_arch}...") 51 | 52 | # Check if we're in the original source directory or in a temporary build directory 53 | if os.path.exists("go.mod"): 54 | # We're in the original directory with Go module files 55 | binary_name = "pathik_bin" 56 | if target_os == "windows": 57 | binary_name += ".exe" 58 | 59 | # Setup environment for cross-compilation if needed 60 | env = os.environ.copy() 61 | env["GOOS"] = target_os 62 | env["GOARCH"] = target_arch 63 | 64 | # For GitHub Actions releases, output to releases directory 65 | release_dir = os.path.join("releases", f"{target_os}_{target_arch}") 66 | os.makedirs(release_dir, exist_ok=True) 67 | output_path = os.path.join(release_dir, binary_name) 68 | 69 | # Run go build 70 | build_cmd = ["go", "build", "-o", output_path, "./main.go"] 71 | result = subprocess.run(build_cmd, capture_output=True, env=env) 72 | 73 | if result.returncode != 0: 74 | print(f"Error building Go binary: {result.stderr.decode()}") 75 | raise RuntimeError("Failed to build Go binary") 76 | 77 | print(f"Go binary built successfully: {output_path}") 78 | 79 | # Make binary executable 80 | if target_os != "windows": 81 | os.chmod(output_path, 0o755) 82 | 83 | return output_path 84 | else: 85 | # We're in a temporary build directory, can't build Go binary here 86 | print("Not in source directory, skipping Go binary build") 87 | return None 88 | 89 | def build_all_binaries(): 90 | """Build binaries for all supported platforms""" 91 | platforms = [ 92 | ("darwin", "amd64"), # Intel Mac 93 | ("darwin", "arm64"), # Apple Silicon Mac 94 | ("linux", "amd64"), # Linux x86_64 95 | ("linux", "arm64"), # Linux ARM64 96 | ("windows", "amd64"), # Windows x86_64 97 | ] 98 | 99 | built_binaries = [] 100 | for target_os, target_arch in platforms: 101 | try: 102 | binary = build_go_binary(target_os, target_arch) 103 | if binary: 104 | built_binaries.append(binary) 105 | except Exception as e: 106 | print(f"Warning: Failed to build for {target_os}_{target_arch}: {e}") 107 | 108 | return built_binaries 109 | 110 | class BuildGoCommand: 111 | """Mixin to build Go binary before installation""" 112 | def run(self): 113 | # For local development only, build the binary for the current platform 114 | try: 115 | current_os, current_arch = detect_platform() 116 | # Only build if we're in development mode and have go installed 117 | if os.path.exists("go.mod") and shutil.which("go"): 118 | build_go_binary(current_os, current_arch) 119 | except Exception as e: 120 | print(f"Warning: Failed to build Go binary: {e}") 121 | print("Binary will be downloaded during first use.") 122 | 123 | # Run the original command 124 | super().run() 125 | 126 | class BuildSdistWithoutBinary(sdist): 127 | """Custom sdist command that excludes binaries to keep package size small""" 128 | def run(self): 129 | # Create empty bin directories to maintain structure 130 | os.makedirs("pathik/bin", exist_ok=True) 131 | 132 | # Run the original sdist 133 | super().run() 134 | 135 | class BuildWheel(bdist_wheel): 136 | """Custom wheel command that excludes binaries to keep package size small""" 137 | def run(self): 138 | # Create empty bin directories to maintain structure 139 | os.makedirs("pathik/bin", exist_ok=True) 140 | 141 | # Run the original wheel build 142 | super().run() 143 | 144 | def finalize_options(self): 145 | # Mark the wheel as platform independent since we include 146 | # binaries for all platforms via 
dynamic download 147 | super().finalize_options() 148 | self.root_is_pure = True 149 | 150 | class InstallWithGoBuild(BuildGoCommand, install): 151 | """Custom install command that builds Go binary first""" 152 | pass 153 | 154 | class DevelopWithGoBuild(BuildGoCommand, develop): 155 | """Custom develop command that builds Go binary first""" 156 | pass 157 | 158 | # Read the README.md file 159 | with open("README.md", "r", encoding="utf-8") as fh: 160 | long_description = fh.read() 161 | 162 | setup( 163 | name="pathik", 164 | version=VERSION, 165 | description="A web crawler implemented in Go with Python bindings", 166 | long_description=long_description, 167 | long_description_content_type="text/markdown", 168 | author="Rach Pradhan", 169 | author_email="me@rachit.ai", 170 | url="https://github.com/justrach/pathik", 171 | packages=find_packages(), 172 | package_data={ 173 | "pathik": ["bin/.gitkeep"], # Just include a placeholder 174 | }, 175 | cmdclass={ 176 | 'install': InstallWithGoBuild, 177 | 'develop': DevelopWithGoBuild, 178 | 'sdist': BuildSdistWithoutBinary, 179 | 'bdist_wheel': BuildWheel, 180 | }, 181 | install_requires=[ 182 | "requests>=2.25.0", # For downloading binaries 183 | "tqdm>=4.50.0", # For progress bar 184 | "satya>=0.2.6", # For type validation and schema definitions 185 | ], 186 | extras_require={ 187 | "kafka": ["kafka-python>=2.0.0"], 188 | }, 189 | python_requires='>=3.6', 190 | classifiers=[ 191 | "Programming Language :: Python :: 3", 192 | "License :: OSI Approved :: MIT License", 193 | "Operating System :: OS Independent", 194 | ], 195 | # Add entry points for command-line usage 196 | entry_points={ 197 | 'console_scripts': [ 198 | 'pathik=pathik.cli:main', 199 | ], 200 | }, 201 | ) -------------------------------------------------------------------------------- /storage/kafka.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "errors" 7 | "os" 8 | "strconv" 9 | "strings" 10 | "time" 11 | 12 | "github.com/segmentio/kafka-go" 13 | "github.com/segmentio/kafka-go/sasl/plain" 14 | ) 15 | 16 | // KafkaConfig holds configuration for Kafka 17 | type KafkaConfig struct { 18 | Brokers []string 19 | Topic string 20 | Username string 21 | Password string 22 | ClientID string 23 | UseTLS bool 24 | MaxRetry int 25 | CompressionType string 26 | MaxMessageSize int 27 | BufferMemory int 28 | } 29 | 30 | // LoadKafkaConfig loads Kafka configuration from environment variables 31 | func LoadKafkaConfig() (KafkaConfig, error) { 32 | brokersStr := os.Getenv("KAFKA_BROKERS") 33 | if brokersStr == "" { 34 | brokersStr = "localhost:9092" // Default broker 35 | } 36 | 37 | topic := os.Getenv("KAFKA_TOPIC") 38 | if topic == "" { 39 | topic = "pathik_crawl_data" // Default topic 40 | } 41 | 42 | maxRetryStr := os.Getenv("KAFKA_MAX_RETRY") 43 | maxRetry := 3 // Default max retry 44 | if maxRetryStr != "" { 45 | var err error 46 | maxRetry, err = strconv.Atoi(maxRetryStr) 47 | if err != nil { 48 | return KafkaConfig{}, err 49 | } 50 | } 51 | 52 | useTLSStr := os.Getenv("KAFKA_USE_TLS") 53 | useTLS := false 54 | if useTLSStr != "" { 55 | var err error 56 | useTLS, err = strconv.ParseBool(useTLSStr) 57 | if err != nil { 58 | return KafkaConfig{}, err 59 | } 60 | } 61 | 62 | config := KafkaConfig{ 63 | Brokers: strings.Split(brokersStr, ","), 64 | Topic: topic, 65 | Username: os.Getenv("KAFKA_USERNAME"), 66 | Password: os.Getenv("KAFKA_PASSWORD"), 67 | ClientID: 
os.Getenv("KAFKA_CLIENT_ID"), 68 | UseTLS: useTLS, 69 | MaxRetry: maxRetry, 70 | CompressionType: os.Getenv("KAFKA_COMPRESSION"), 71 | MaxMessageSize: 0, // Default to 0 (uses Kafka default) 72 | BufferMemory: 0, // Default to 0 (uses Kafka default) 73 | } 74 | 75 | // Try to parse MaxMessageSize if provided in env 76 | maxMsgSizeStr := os.Getenv("KAFKA_MAX_MESSAGE_SIZE") 77 | if maxMsgSizeStr != "" { 78 | if val, err := strconv.Atoi(maxMsgSizeStr); err == nil { 79 | config.MaxMessageSize = val 80 | } 81 | } 82 | 83 | // Try to parse BufferMemory if provided in env 84 | bufferMemoryStr := os.Getenv("KAFKA_BUFFER_MEMORY") 85 | if bufferMemoryStr != "" { 86 | if val, err := strconv.Atoi(bufferMemoryStr); err == nil { 87 | config.BufferMemory = val 88 | } 89 | } 90 | 91 | return config, nil 92 | } 93 | 94 | // CreateKafkaWriter creates a Kafka writer from the provided configuration 95 | func CreateKafkaWriter(config KafkaConfig) (*kafka.Writer, error) { 96 | if len(config.Brokers) == 0 { 97 | return nil, errors.New("no Kafka brokers specified") 98 | } 99 | 100 | if config.Topic == "" { 101 | return nil, errors.New("no Kafka topic specified") 102 | } 103 | 104 | dialer := &kafka.Dialer{ 105 | Timeout: 10 * time.Second, 106 | DualStack: true, 107 | } 108 | 109 | // Setup SASL authentication if username and password are provided 110 | if config.Username != "" && config.Password != "" { 111 | mechanism := plain.Mechanism{ 112 | Username: config.Username, 113 | Password: config.Password, 114 | } 115 | dialer.SASLMechanism = mechanism 116 | } 117 | 118 | // Setup TLS if enabled 119 | if config.UseTLS { 120 | dialer.TLS = &tls.Config{ 121 | MinVersion: tls.VersionTLS12, 122 | } 123 | } 124 | 125 | // Create the writer with custom buffer configurations 126 | writerConfig := kafka.WriterConfig{ 127 | Brokers: config.Brokers, 128 | Topic: config.Topic, 129 | Balancer: &kafka.LeastBytes{}, 130 | MaxAttempts: config.MaxRetry, 131 | BatchSize: 1, // Default to sending immediately 132 | BatchTimeout: 1 * time.Millisecond, // Almost no delay 133 | RequiredAcks: -1, // RequireAll = -1, wait for all replicas 134 | Dialer: dialer, 135 | } 136 | 137 | // Set message size limit if provided 138 | if config.MaxMessageSize > 0 { 139 | writerConfig.BatchBytes = config.MaxMessageSize 140 | } 141 | 142 | // Set buffer memory if provided 143 | if config.BufferMemory > 0 { 144 | writerConfig.Async = true 145 | writerConfig.BatchSize = 10 // Adjust batch size when using async 146 | writerConfig.BatchBytes = config.BufferMemory 147 | } 148 | 149 | writer := kafka.NewWriter(writerConfig) 150 | 151 | // Set compression codec 152 | compressionCodec := kafka.Compression(kafka.Gzip) // Default to Gzip 153 | if config.CompressionType != "" { 154 | switch strings.ToLower(config.CompressionType) { 155 | case "gzip": 156 | compressionCodec = kafka.Compression(kafka.Gzip) 157 | case "snappy": 158 | compressionCodec = kafka.Compression(kafka.Snappy) 159 | case "lz4": 160 | compressionCodec = kafka.Compression(kafka.Lz4) 161 | case "zstd": 162 | compressionCodec = kafka.Compression(kafka.Zstd) 163 | } 164 | } 165 | writer.Compression = compressionCodec 166 | 167 | // Set client ID if provided 168 | if config.ClientID != "" { 169 | dialer.ClientID = config.ClientID 170 | } 171 | 172 | return writer, nil 173 | } 174 | 175 | // SendToKafka sends content to Kafka 176 | func SendToKafka(writer *kafka.Writer, key string, value []byte, headers ...kafka.Header) error { 177 | // Add timestamp to headers 178 | timestamp := 
time.Now().UTC().Format(time.RFC3339) 179 | headers = append(headers, kafka.Header{ 180 | Key: "timestamp", 181 | Value: []byte(timestamp), 182 | }) 183 | 184 | // Create message 185 | message := kafka.Message{ 186 | Key: []byte(key), 187 | Value: value, 188 | Headers: headers, 189 | Time: time.Now(), 190 | } 191 | 192 | // Write message with retry logic 193 | ctx := context.Background() 194 | err := writer.WriteMessages(ctx, message) 195 | if err != nil { 196 | return err 197 | } 198 | 199 | return nil 200 | } 201 | 202 | // ContentType represents the type of content to stream 203 | type ContentType string 204 | 205 | const ( 206 | // HTMLContent is the HTML content type 207 | HTMLContent ContentType = "html" 208 | // MarkdownContent is the Markdown content type 209 | MarkdownContent ContentType = "markdown" 210 | ) 211 | 212 | // StreamToKafka streams content to Kafka based on the specified content types 213 | // If contentTypes is empty, both HTML and Markdown will be streamed 214 | // If sessionID is provided, it will be included in message headers 215 | func StreamToKafka(writer interface{}, url string, htmlContent string, markdownContent string, sessionID string, contentTypes ...ContentType) error { 216 | kafkaWriter, ok := writer.(*kafka.Writer) 217 | if !ok { 218 | return errors.New("invalid Kafka writer provided") 219 | } 220 | 221 | // If no content types specified, stream both 222 | if len(contentTypes) == 0 { 223 | contentTypes = []ContentType{HTMLContent, MarkdownContent} 224 | } 225 | 226 | // Create common headers 227 | headers := []kafka.Header{ 228 | {Key: "url", Value: []byte(url)}, 229 | } 230 | 231 | // Add session ID if provided 232 | if sessionID != "" { 233 | headers = append(headers, kafka.Header{ 234 | Key: "sessionID", 235 | Value: []byte(sessionID), 236 | }) 237 | } 238 | 239 | // Check if HTML should be streamed 240 | if containsContentType(contentTypes, HTMLContent) { 241 | htmlHeaders := append(headers, kafka.Header{ 242 | Key: "contentType", 243 | Value: []byte("text/html"), 244 | }) 245 | 246 | err := SendToKafka( 247 | kafkaWriter, 248 | url, 249 | []byte(htmlContent), 250 | htmlHeaders..., 251 | ) 252 | if err != nil { 253 | return err 254 | } 255 | } 256 | 257 | // Check if Markdown should be streamed 258 | if containsContentType(contentTypes, MarkdownContent) { 259 | markdownHeaders := append(headers, kafka.Header{ 260 | Key: "contentType", 261 | Value: []byte("text/markdown"), 262 | }) 263 | 264 | err := SendToKafka( 265 | kafkaWriter, 266 | url, 267 | []byte(markdownContent), 268 | markdownHeaders..., 269 | ) 270 | if err != nil { 271 | return err 272 | } 273 | } 274 | 275 | return nil 276 | } 277 | 278 | // Helper function to check if a content type is in the list 279 | func containsContentType(types []ContentType, target ContentType) bool { 280 | for _, t := range types { 281 | if t == target { 282 | return true 283 | } 284 | } 285 | return false 286 | } 287 | 288 | // CloseKafkaWriter safely closes the Kafka writer 289 | func CloseKafkaWriter(writer interface{}) { 290 | if kafkaWriter, ok := writer.(*kafka.Writer); ok { 291 | kafkaWriter.Close() 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /storage/storage.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io/ioutil" 7 | "log" 8 | "net/url" 9 | "os" 10 | "path/filepath" 11 | "strings" 12 | "time" 13 | 14 | "github.com/aws/aws-sdk-go-v2/aws" 15 
| "github.com/aws/aws-sdk-go-v2/config" 16 | "github.com/aws/aws-sdk-go-v2/credentials" 17 | "github.com/aws/aws-sdk-go-v2/service/s3" 18 | "github.com/joho/godotenv" 19 | ) 20 | 21 | // R2Config holds the configuration for Cloudflare R2 22 | type R2Config struct { 23 | AccountID string 24 | AccessKeyID string 25 | AccessKeySecret string 26 | BucketName string 27 | Region string 28 | } 29 | 30 | // LoadR2Config loads R2 configuration from environment variables 31 | func LoadR2Config() (R2Config, error) { 32 | config := R2Config{ 33 | AccountID: os.Getenv("R2_ACCOUNT_ID"), 34 | AccessKeyID: os.Getenv("R2_ACCESS_KEY_ID"), 35 | AccessKeySecret: os.Getenv("R2_ACCESS_KEY_SECRET"), 36 | BucketName: os.Getenv("R2_BUCKET_NAME"), 37 | Region: os.Getenv("R2_REGION"), 38 | } 39 | 40 | // Check if required values are set 41 | if config.AccountID == "" || config.AccessKeyID == "" || 42 | config.AccessKeySecret == "" || config.BucketName == "" { 43 | return config, fmt.Errorf("missing required R2 configuration in environment variables") 44 | } 45 | 46 | // Set default region if not specified 47 | if config.Region == "" { 48 | config.Region = "auto" // R2 typically uses "auto" as region 49 | } 50 | 51 | return config, nil 52 | } 53 | 54 | // CreateS3Client creates an S3 client configured for Cloudflare R2 55 | func CreateS3Client(cfg R2Config) (*s3.Client, error) { 56 | r2Resolver := aws.EndpointResolverWithOptionsFunc(func(service, region string, options ...interface{}) (aws.Endpoint, error) { 57 | return aws.Endpoint{ 58 | URL: fmt.Sprintf("https://%s.r2.cloudflarestorage.com", cfg.AccountID), 59 | }, nil 60 | }) 61 | 62 | awsCfg, err := config.LoadDefaultConfig(context.TODO(), 63 | config.WithEndpointResolverWithOptions(r2Resolver), 64 | config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( 65 | cfg.AccessKeyID, 66 | cfg.AccessKeySecret, 67 | "", 68 | )), 69 | config.WithRegion(cfg.Region), 70 | ) 71 | if err != nil { 72 | return nil, fmt.Errorf("failed to load AWS config: %v", err) 73 | } 74 | 75 | return s3.NewFromConfig(awsCfg), nil 76 | } 77 | 78 | // SanitizeURL converts a URL to a safe filename component 79 | func SanitizeURL(urlStr string) string { 80 | // Parse the URL 81 | parsedURL, err := url.Parse(urlStr) 82 | if err != nil { 83 | // If parsing fails, sanitize the string more aggressively 84 | sanitized := strings.ReplaceAll(urlStr, "/", "_") 85 | sanitized = strings.ReplaceAll(sanitized, "\\", "_") 86 | sanitized = strings.ReplaceAll(sanitized, ":", "_") 87 | sanitized = strings.ReplaceAll(sanitized, "*", "_") 88 | sanitized = strings.ReplaceAll(sanitized, "?", "_") 89 | sanitized = strings.ReplaceAll(sanitized, "\"", "_") 90 | sanitized = strings.ReplaceAll(sanitized, "<", "_") 91 | sanitized = strings.ReplaceAll(sanitized, ">", "_") 92 | sanitized = strings.ReplaceAll(sanitized, "|", "_") 93 | return sanitized 94 | } 95 | 96 | // Combine host and path 97 | result := parsedURL.Host 98 | if parsedURL.Path != "" && parsedURL.Path != "/" { 99 | // Add path but remove leading/trailing slashes 100 | path := strings.Trim(parsedURL.Path, "/") 101 | result += "_" + path 102 | } 103 | 104 | // Remove unsafe characters 105 | unsafe := []string{":", "/", "\\", "?", "*", "\"", "<", ">", "|", " ", "\t", "\n", "\r", "&", "=", "+", "$", ",", ";", "^", "`", "{", "}", "[", "]", "(", ")", "#", "%"} 106 | for _, char := range unsafe { 107 | result = strings.ReplaceAll(result, char, "_") 108 | } 109 | 110 | // Ensure no directory traversal is possible 111 | result = 
strings.ReplaceAll(result, "..", "_") 112 | 113 | // Truncate if too long (max 200 chars for filename safety) 114 | if len(result) > 200 { 115 | result = result[:200] 116 | } 117 | 118 | return result 119 | } 120 | 121 | // UploadFileToR2 uploads a file to R2 bucket 122 | func UploadFileToR2(client *s3.Client, bucketName, filePath, uuid, originalURL, fileType string) error { 123 | // Read file content 124 | content, err := ioutil.ReadFile(filePath) 125 | if err != nil { 126 | return fmt.Errorf("failed to read file %s: %v", filePath, err) 127 | } 128 | 129 | // Create key in format UUID+sanitizedURL.extension 130 | sanitizedURL := SanitizeURL(originalURL) 131 | key := fmt.Sprintf("%s+%s.%s", uuid, sanitizedURL, fileType) 132 | 133 | // Upload to R2 134 | _, err = client.PutObject(context.TODO(), &s3.PutObjectInput{ 135 | Bucket: aws.String(bucketName), 136 | Key: aws.String(key), 137 | Body: strings.NewReader(string(content)), 138 | ContentType: aws.String(getContentType(fileType)), 139 | }) 140 | 141 | if err != nil { 142 | return fmt.Errorf("failed to upload %s to R2: %v", filePath, err) 143 | } 144 | 145 | fmt.Printf("Successfully uploaded %s to R2 as %s\n", filePath, key) 146 | return nil 147 | } 148 | 149 | // getContentType returns the MIME type based on file extension 150 | func getContentType(fileType string) string { 151 | switch fileType { 152 | case "html": 153 | return "text/html" 154 | case "md": 155 | return "text/markdown" 156 | default: 157 | return "application/octet-stream" 158 | } 159 | } 160 | 161 | // FindFilesForURL finds HTML and MD files for a given URL 162 | func FindFilesForURL(directory, urlStr string) (htmlFile, mdFile string, err error) { 163 | domain := GetDomainNameForFile(urlStr) 164 | 165 | files, err := ioutil.ReadDir(directory) 166 | if err != nil { 167 | return "", "", fmt.Errorf("failed to read directory %s: %v", directory, err) 168 | } 169 | 170 | for _, file := range files { 171 | if strings.HasPrefix(file.Name(), domain) { 172 | ext := filepath.Ext(file.Name()) 173 | if ext == ".html" { 174 | htmlFile = filepath.Join(directory, file.Name()) 175 | } else if ext == ".md" { 176 | mdFile = filepath.Join(directory, file.Name()) 177 | } 178 | } 179 | } 180 | 181 | if htmlFile == "" && mdFile == "" { 182 | return "", "", fmt.Errorf("no files found for URL %s", urlStr) 183 | } 184 | 185 | return htmlFile, mdFile, nil 186 | } 187 | 188 | // GetDomainNameForFile generates a unique filename prefix from the URL 189 | func GetDomainNameForFile(pageURL string) string { 190 | parsedURL, err := url.Parse(pageURL) 191 | if err != nil { 192 | log.Printf("Error parsing URL %s: %v", pageURL, err) 193 | return "unknown" 194 | } 195 | domain := strings.ReplaceAll(parsedURL.Hostname(), ".", "_") 196 | path := strings.Trim(parsedURL.Path, "/") 197 | if path == "" { 198 | return domain 199 | } 200 | path = strings.ReplaceAll(path, "/", "_") 201 | return fmt.Sprintf("%s_%s", domain, path) 202 | } 203 | 204 | // SaveToLocalFile saves content to a file with the appropriate extension 205 | func SaveToLocalFile(content, url, fileType, outputDir string) (string, error) { 206 | // Check for directory traversal attempts 207 | if strings.Contains(outputDir, "..") { 208 | return "", fmt.Errorf("directory traversal attempt detected") 209 | } 210 | 211 | // Limit content size to prevent denial of service 212 | maxContentSize := 10 * 1024 * 1024 // 10 MB 213 | if len(content) > maxContentSize { 214 | content = content[:maxContentSize] 215 | log.Printf("Warning: Content for URL %s 
truncated to %d bytes", url, maxContentSize) 216 | } 217 | 218 | domain := GetDomainNameForFile(url) 219 | date := time.Now().Format("2006-01-02") 220 | 221 | // Ensure safe file type 222 | safeFileType := fileType 223 | if fileType != "html" && fileType != "md" { 224 | safeFileType = "txt" // Default to txt if type is unexpected 225 | } 226 | 227 | filename := fmt.Sprintf("%s_%s.%s", domain, date, safeFileType) 228 | 229 | // Use the specified output directory or current directory 230 | if outputDir != "" && outputDir != "." { 231 | // Validate output directory path 232 | absOutputDir, err := filepath.Abs(outputDir) 233 | if err != nil { 234 | return "", fmt.Errorf("invalid output directory path: %v", err) 235 | } 236 | 237 | // Create the directory if it doesn't exist 238 | if err := os.MkdirAll(absOutputDir, 0755); err != nil { 239 | return "", fmt.Errorf("failed to create directory %s: %v", absOutputDir, err) 240 | } 241 | 242 | filename = filepath.Join(absOutputDir, filename) 243 | } 244 | 245 | // Ensure the final path doesn't go outside the intended directory 246 | absFilename, err := filepath.Abs(filename) 247 | if err != nil { 248 | return "", fmt.Errorf("invalid file path: %v", err) 249 | } 250 | 251 | absOutputDir, err := filepath.Abs(outputDir) 252 | if err != nil { 253 | return "", fmt.Errorf("invalid output directory path: %v", err) 254 | } 255 | 256 | if !strings.HasPrefix(absFilename, absOutputDir) { 257 | return "", fmt.Errorf("path traversal attempt detected") 258 | } 259 | 260 | err = ioutil.WriteFile(filename, []byte(content), 0644) 261 | if err != nil { 262 | return "", fmt.Errorf("failed to save file %s: %v", filename, err) 263 | } 264 | fmt.Printf("✅ Saved to %s\n", filename) 265 | return filename, nil 266 | } 267 | 268 | // InitEnv loads environment variables from .env file 269 | func InitEnv() error { 270 | return godotenv.Load() 271 | } 272 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import pathik 2 | import sys 3 | 4 | # Print diagnostic info 5 | print(f"Python version: {sys.version}") 6 | print(f"pathik version: {pathik.__file__}") 7 | 8 | # Test with a single URL 9 | try: 10 | result = pathik.crawl('https://example.com', output_dir='./output') 11 | print(f"Crawl result: {result}") 12 | except Exception as e: 13 | print(f"Error: {e}") 14 | 15 | # Crawl multiple URLs 16 | results = pathik.crawl(['https://example.com', 'https://news.ycombinator.com'], output_dir='./output') 17 | 18 | # Print the results 19 | print(results) -------------------------------------------------------------------------------- /test/cli_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Test the pathik command line interface to verify command ordering fix 3 | 4 | # Install pathik in development mode if needed 5 | cd "$(dirname "$0")/.." 6 | if ! pip show pathik >/dev/null 2>&1; then 7 | echo "Installing pathik in development mode..." 8 | pip install -e . 
9 | fi 10 | 11 | # Create output directory (relative to the repo root we cd'd into above) 12 | OUTPUT_DIR="$(pwd)/test/cli_test_output" 13 | mkdir -p "$OUTPUT_DIR" 14 | 15 | echo "===== TESTING PATHIK CLI WITH COMMAND ORDERING FIX =====" 16 | echo "Using output directory: $OUTPUT_DIR" 17 | 18 | # Test 1: Basic crawl with output directory 19 | echo -e "\n[TEST 1] Basic crawl with output directory" 20 | echo "Running: pathik crawl https://example.com -o $OUTPUT_DIR" 21 | pathik crawl https://example.com -o "$OUTPUT_DIR" 22 | 23 | # Check if HTML and MD files were created 24 | HTML_COUNT=$(find "$OUTPUT_DIR" -name "*.html" | wc -l) 25 | MD_COUNT=$(find "$OUTPUT_DIR" -name "*.md" | wc -l) 26 | 27 | if [ "$HTML_COUNT" -gt 0 ] && [ "$MD_COUNT" -gt 0 ]; then 28 | echo -e "\n✅ TEST PASSED: Files were created successfully" 29 | echo "HTML files: $HTML_COUNT" 30 | echo "Markdown files: $MD_COUNT" 31 | 32 | # List the files (BSD stat on macOS, GNU stat fallback on Linux) 33 | echo -e "\nFiles created:" 34 | find "$OUTPUT_DIR" -type f | while read -r file; do 35 | SIZE=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file") 36 | echo " - $file ($SIZE bytes)" 37 | done 38 | else 39 | echo -e "\n❌ TEST FAILED: Files were not created" 40 | echo "HTML files: $HTML_COUNT" 41 | echo "Markdown files: $MD_COUNT" 42 | fi 43 | 44 | # Test 2: Multiple URLs with parallel flag 45 | echo -e "\n[TEST 2] Multiple URLs with parallel flag" 46 | echo "Running: pathik crawl https://example.com https://httpbin.org/html -o $OUTPUT_DIR -p" 47 | pathik crawl https://example.com https://httpbin.org/html -o "$OUTPUT_DIR" -p 48 | 49 | # Check if more HTML and MD files were created 50 | NEW_HTML_COUNT=$(find "$OUTPUT_DIR" -name "*.html" | wc -l) 51 | NEW_MD_COUNT=$(find "$OUTPUT_DIR" -name "*.md" | wc -l) 52 | 53 | if [ "$NEW_HTML_COUNT" -gt "$HTML_COUNT" ] && [ "$NEW_MD_COUNT" -gt "$MD_COUNT" ]; then 54 | echo -e "\n✅ TEST PASSED: Additional files were created successfully" 55 | echo "Total HTML files: $NEW_HTML_COUNT (added $(($NEW_HTML_COUNT - $HTML_COUNT)))" 56 | echo "Total Markdown files: $NEW_MD_COUNT (added $(($NEW_MD_COUNT - $MD_COUNT)))" 57 | else 58 | echo -e "\n❌ TEST FAILED: Additional files were not created" 59 | echo "HTML files: $NEW_HTML_COUNT (previously $HTML_COUNT)" 60 | echo "Markdown files: $NEW_MD_COUNT (previously $MD_COUNT)" 61 | fi 62 | 63 | echo -e "\n===== ALL TESTS COMPLETED =====" -------------------------------------------------------------------------------- /test/debug_command_order.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Debug script to examine command construction in pathik. 4 | 5 | This script imports the necessary functions from pathik directly 6 | and examines the command construction to ensure flags come before 7 | the -crawl flag and URLs come after.
8 | """ 9 | import os 10 | import sys 11 | import subprocess 12 | from pprint import pprint 13 | 14 | # Add the parent directory to the path 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | # Import directly from pathik modules 18 | from pathik.cli import crawl as cli_crawl 19 | from pathik.crawler import get_binary_path 20 | 21 | def debug_command_construction(): 22 | """Debug the command construction in pathik""" 23 | # Replace subprocess.run to capture the command 24 | original_run = subprocess.run 25 | last_command = None 26 | 27 | def mock_run(command, *args, **kwargs): 28 | nonlocal last_command 29 | last_command = command 30 | 31 | # Create a mock result 32 | class MockResult: 33 | returncode = 0 34 | stdout = "{}" 35 | stderr = "" 36 | return MockResult() 37 | 38 | # Replace the run function 39 | subprocess.run = mock_run 40 | 41 | try: 42 | # Get binary path 43 | binary_path = get_binary_path() 44 | print(f"Binary path: {binary_path}") 45 | 46 | # Set up test parameters 47 | test_url = "https://example.com" 48 | output_dir = "/tmp/test_output" 49 | 50 | print("\n=== Testing command construction ===") 51 | print(f"URL: {test_url}") 52 | print(f"Output dir: {output_dir}") 53 | 54 | # Call the function that constructs the command 55 | cli_crawl( 56 | urls=test_url, 57 | output_dir=output_dir, 58 | parallel=True 59 | ) 60 | 61 | # Analyze the command 62 | if last_command: 63 | print("\nCommand constructed:") 64 | print(f"{' '.join(last_command)}") 65 | 66 | # Check ordering 67 | binary_idx = last_command.index(binary_path) 68 | crawl_idx = last_command.index("-crawl") if "-crawl" in last_command else -1 69 | url_idx = last_command.index(test_url) if test_url in last_command else -1 70 | outdir_idx = last_command.index("-outdir") if "-outdir" in last_command else -1 71 | 72 | print("\nElement positions:") 73 | print(f"- Binary: {binary_idx}") 74 | print(f"- -crawl flag: {crawl_idx}") 75 | print(f"- URL: {url_idx}") 76 | print(f"- -outdir flag: {outdir_idx}") 77 | 78 | if crawl_idx > 0 and url_idx > 0 and outdir_idx > 0: 79 | if binary_idx < outdir_idx < crawl_idx < url_idx: 80 | print("\n✅ CORRECT ORDER: binary -> flags -> -crawl -> URLs") 81 | else: 82 | print("\n❌ INCORRECT ORDER!") 83 | if outdir_idx > crawl_idx: 84 | print("Problem: -outdir flag comes after -crawl") 85 | if url_idx < crawl_idx: 86 | print("Problem: URL comes before -crawl") 87 | else: 88 | print("\n❌ Missing critical elements in command!") 89 | else: 90 | print("\n❌ No command was constructed!") 91 | 92 | finally: 93 | # Restore original subprocess.run 94 | subprocess.run = original_run 95 | 96 | if __name__ == "__main__": 97 | debug_command_construction() -------------------------------------------------------------------------------- /test/direct_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Direct test for pathik binary to verify correct command ordering. 4 | 5 | This test script runs the pathik Go binary directly with different 6 | command ordering patterns to determine which one works correctly. 
7 | """ 8 | import os 9 | import sys 10 | import subprocess 11 | import json 12 | import tempfile 13 | import shutil 14 | 15 | def run_command(cmd, show_output=True): 16 | """Run a command and return the result with exit code""" 17 | if show_output: 18 | print(f"Running: {' '.join(cmd)}") 19 | 20 | try: 21 | result = subprocess.run( 22 | cmd, 23 | stdout=subprocess.PIPE, 24 | stderr=subprocess.PIPE, 25 | text=True, 26 | check=False 27 | ) 28 | 29 | if show_output: 30 | print(f"Exit code: {result.returncode}") 31 | if result.stdout: 32 | print("STDOUT:") 33 | print(result.stdout[:500] + ("..." if len(result.stdout) > 500 else "")) 34 | if result.stderr: 35 | print("STDERR:") 36 | print(result.stderr[:500] + ("..." if len(result.stderr) > 500 else "")) 37 | 38 | return result 39 | except Exception as e: 40 | if show_output: 41 | print(f"Error executing command: {e}") 42 | return None 43 | 44 | def find_pathik_binary(): 45 | """Find the pathik binary location""" 46 | # Try to find the binary in the Python package 47 | try: 48 | import pathik 49 | path_parts = os.path.abspath(pathik.__file__).split(os.sep) 50 | 51 | # Look for the bin directory 52 | package_dir = os.sep.join(path_parts[:-1]) # Directory containing the package 53 | 54 | # Try to determine the platform 55 | import platform 56 | system = platform.system().lower() 57 | machine = platform.machine().lower() 58 | 59 | # Map to expected directory names 60 | if system == 'darwin': 61 | system = 'darwin' 62 | elif system.startswith('linux'): 63 | system = 'linux' 64 | elif system.startswith('win'): 65 | system = 'windows' 66 | 67 | if machine in ('x86_64', 'amd64'): 68 | machine = 'amd64' 69 | elif machine in ('arm64', 'aarch64'): 70 | machine = 'arm64' 71 | 72 | platform_dir = f"{system}_{machine}" 73 | 74 | # Check if bin directory exists with platform subdirectory 75 | bin_dir = os.path.join(package_dir, 'bin', platform_dir) 76 | if os.path.exists(bin_dir): 77 | binary_name = 'pathik_bin.exe' if system == 'windows' else 'pathik_bin' 78 | binary_path = os.path.join(bin_dir, binary_name) 79 | if os.path.exists(binary_path): 80 | return binary_path 81 | except: 82 | pass 83 | 84 | # Try to locate it in the project structure 85 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 86 | 87 | # Check common locations 88 | possible_locations = [ 89 | os.path.join(project_root, 'pathik', 'bin', 'darwin_arm64', 'pathik_bin'), 90 | os.path.join(project_root, 'pathik', 'bin', 'darwin_amd64', 'pathik_bin'), 91 | os.path.join(project_root, 'pathik', 'bin', 'linux_amd64', 'pathik_bin'), 92 | os.path.join(project_root, 'pathik_bin'), 93 | os.path.join(project_root, 'bin', 'pathik_bin'), 94 | ] 95 | 96 | for location in possible_locations: 97 | if os.path.exists(location) and os.access(location, os.X_OK): 98 | return location 99 | 100 | # If not found, try using 'which' 101 | try: 102 | result = subprocess.run(['which', 'pathik_bin'], stdout=subprocess.PIPE, text=True, check=False) 103 | if result.returncode == 0 and result.stdout.strip(): 104 | return result.stdout.strip() 105 | except: 106 | pass 107 | 108 | return None 109 | 110 | def test_binary_command_orders(): 111 | """Test different command orderings with the pathik binary""" 112 | binary_path = find_pathik_binary() 113 | if not binary_path: 114 | print("❌ ERROR: Could not find pathik binary") 115 | return False 116 | 117 | print(f"Found pathik binary at: {binary_path}") 118 | 119 | # Check binary version 120 | version_result = run_command([binary_path, '-version']) 
121 | if version_result.returncode != 0: 122 | print("❌ ERROR: Failed to get binary version") 123 | return False 124 | 125 | # Create output directory 126 | output_dir = tempfile.mkdtemp(prefix="pathik_direct_test_") 127 | print(f"Using output directory: {output_dir}") 128 | 129 | try: 130 | test_url = "https://example.com" 131 | 132 | print("\n=== Testing different command orderings ===") 133 | 134 | # Test 1: flags before -crawl (CORRECT ORDER) 135 | print("\n[TEST 1] flags before -crawl (CORRECT ORDER)") 136 | cmd1 = [binary_path, "-outdir", output_dir, "-crawl", test_url] 137 | result1 = run_command(cmd1) 138 | passed1 = result1.returncode == 0 139 | 140 | if passed1: 141 | print("✅ PASSED: Flags before -crawl works correctly") 142 | # Check if files were created 143 | html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')] 144 | if html_files: 145 | print(f"HTML file created: {html_files[0]}") 146 | else: 147 | print("❌ FAILED: Flags before -crawl failed") 148 | 149 | # Clean output directory 150 | for file in os.listdir(output_dir): 151 | os.remove(os.path.join(output_dir, file)) 152 | 153 | # Test 2: -crawl before URLs before flags (INCORRECT ORDER) 154 | print("\n[TEST 2] -crawl before URLs before flags (INCORRECT ORDER)") 155 | cmd2 = [binary_path, "-crawl", test_url, "-outdir", output_dir] 156 | result2 = run_command(cmd2) 157 | passed2 = result2.returncode == 0 158 | 159 | if passed2: 160 | print("⚠️ UNEXPECTED PASS: -crawl before URLs before flags works (should fail)") 161 | # Check if files were created 162 | html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')] 163 | if html_files: 164 | print(f"HTML file created: {html_files[0]}") 165 | else: 166 | print("✅ EXPECTED FAILURE: -crawl before URLs before flags failed (correct behavior)") 167 | print(f"Error message: {result2.stderr.strip()}") 168 | 169 | # Clean output directory 170 | for file in os.listdir(output_dir): 171 | os.remove(os.path.join(output_dir, file)) 172 | 173 | # Test 3: -crawl before flags before URLs (MIXED ORDER) 174 | print("\n[TEST 3] -crawl before flags before URLs (MIXED ORDER)") 175 | cmd3 = [binary_path, "-crawl", "-outdir", output_dir, test_url] 176 | result3 = run_command(cmd3) 177 | passed3 = result3.returncode == 0 178 | 179 | if passed3: 180 | print("⚠️ UNEXPECTED PASS: -crawl before flags before URLs works (unexpected)") 181 | # Check if files were created 182 | html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')] 183 | if html_files: 184 | print(f"HTML file created: {html_files[0]}") 185 | else: 186 | print("✅ EXPECTED FAILURE: -crawl before flags before URLs failed (expected)") 187 | print(f"Error message: {result3.stderr.strip()}") 188 | 189 | print("\n=== Test Results Summary ===") 190 | print(f"Test 1 (Correct Order): {'✅ PASSED' if passed1 else '❌ FAILED'}") 191 | print(f"Test 2 (Incorrect Order): {'❌ FAILED' if not passed2 else '⚠️ UNEXPECTED PASS'}") 192 | print(f"Test 3 (Mixed Order): {'❌ FAILED' if not passed3 else '⚠️ UNEXPECTED PASS'}") 193 | 194 | if passed1 and not passed2: 195 | print("\n✅ OVERALL: Command ordering behavior is correct!") 196 | print("The binary requires flags to come before -crawl, and URLs to come after -crawl") 197 | return True 198 | else: 199 | print("\n⚠️ OVERALL: Command ordering behavior is not as expected!") 200 | return False 201 | 202 | finally: 203 | # Clean up 204 | if os.path.exists(output_dir): 205 | shutil.rmtree(output_dir) 206 | 207 | if __name__ == "__main__": 208 | test_binary_command_orders() 
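The ordering these tests enforce can be condensed into a minimal sketch (paths below are placeholders; only the -outdir and -crawl flags exercised by direct_test.py are shown):

# Sketch of the ordering direct_test.py's Test 1 expects: binary, then flags, then -crawl, then URLs.
import subprocess

binary_path = "./pathik_bin"     # placeholder; locate the real one with find_pathik_binary() above
output_dir = "/tmp/pathik_out"   # placeholder output directory

cmd = [binary_path, "-outdir", output_dir, "-crawl", "https://example.com"]
result = subprocess.run(cmd, capture_output=True, text=True, check=False)
print(result.returncode)  # 0 expected here; per Test 2, flags placed after -crawl are expected to fail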
-------------------------------------------------------------------------------- /test/python_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test script for the pathik Python API with the command ordering fix. 4 | """ 5 | import os 6 | import sys 7 | import json 8 | import subprocess 9 | from pprint import pprint 10 | 11 | # Add parent directory to the path 12 | sys.path.insert(0, os.path.abspath('..')) 13 | 14 | # Import logging to see what's happening 15 | import logging 16 | logging.basicConfig(level=logging.DEBUG, 17 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | 19 | # First test is just to import pathik 20 | print("Importing pathik...") 21 | import pathik 22 | print(f"Successfully imported pathik version {pathik.__version__}") 23 | 24 | # Monkey patch subprocess.run to see the actual commands being run 25 | original_run = subprocess.run 26 | commands_run = [] 27 | 28 | def mock_run(*args, **kwargs): 29 | commands_run.append(args[0] if args else kwargs.get('args', '')) 30 | print(f"Mock subprocess.run called with: {' '.join(args[0] if args else kwargs.get('args', []))}") 31 | # Pass through to the original function 32 | return original_run(*args, **kwargs) 33 | 34 | # Apply the monkey patch 35 | subprocess.run = mock_run 36 | 37 | # Now test the crawl function 38 | print("\n=== Testing pathik.crawl ===") 39 | output_dir = "/tmp/pathik_python_test" 40 | os.makedirs(output_dir, exist_ok=True) 41 | print(f"Output directory: {output_dir}") 42 | 43 | # Try using the crawler directly 44 | print("\n=== Testing direct import from pathik.crawler ===") 45 | from pathik.crawler import crawl as direct_crawl 46 | from pathik.crawler import get_binary_path 47 | 48 | # Get the binary path 49 | binary_path = get_binary_path() 50 | print(f"Binary path: {binary_path}") 51 | 52 | # Test the command directly to verify behavior 53 | def test_direct_command(cmd): 54 | print(f"\nTesting direct command: {' '.join(cmd)}") 55 | result = original_run(cmd, capture_output=True, text=True) 56 | print(f"Exit code: {result.returncode}") 57 | print(f"Output: {result.stdout[:200]}...") 58 | print(f"Error: {result.stderr[:200]}..." 
if result.stderr else "No errors") 59 | return result 60 | 61 | # Test both command orders 62 | test_direct_command([binary_path, "-outdir", output_dir, "-crawl", "https://example.com"]) 63 | test_direct_command([binary_path, "-crawl", "https://example.com", "-outdir", output_dir]) 64 | 65 | # Test crawler directly 66 | print("\n=== Testing crawler module directly ===") 67 | try: 68 | result = direct_crawl("https://example.com", output_dir=output_dir) 69 | print("Command(s) executed:") 70 | for cmd in commands_run: 71 | print(f" {' '.join(cmd)}") 72 | print("\nCrawler direct result:") 73 | pprint(result) 74 | except Exception as e: 75 | print(f"Error: {e}") 76 | 77 | # Clear the commands list 78 | commands_run.clear() 79 | 80 | # Test the CLI crawl 81 | print("\n=== Testing cli.crawl ===") 82 | from pathik.cli import crawl as cli_crawl 83 | try: 84 | result = cli_crawl("https://example.com", output_dir=output_dir) 85 | print("Command(s) executed:") 86 | for cmd in commands_run: 87 | print(f" {' '.join(cmd)}") 88 | print("\nCLI crawl result:") 89 | pprint(result) 90 | except Exception as e: 91 | print(f"Error: {e}") 92 | 93 | # Restore the original subprocess.run 94 | subprocess.run = original_run 95 | 96 | print("\n=== Test complete ===") 97 | print("Commands executed during test:") 98 | for i, cmd in enumerate(commands_run): 99 | print(f"{i+1}: {' '.join(cmd)}") 100 | 101 | # Check files that were created 102 | print("\nFiles created:") 103 | created_files = os.listdir(output_dir) 104 | for file in created_files: 105 | file_path = os.path.join(output_dir, file) 106 | print(f" - {file} ({os.path.getsize(file_path)} bytes)") 107 | 108 | print("\nSummary:") 109 | print(f"Number of commands executed: {len(commands_run)}") 110 | print(f"Number of files created: {len(created_files)}") 111 | if "https://example.com" in result and "html" in result["https://example.com"]: 112 | print("✅ JSON result contains expected URL and HTML file") 113 | else: 114 | print("❌ JSON result does not contain expected structure") 115 | print(f"Keys in result: {list(result.keys())}") 116 | if "raw_output" in result: 117 | print(f"Raw output: {result['raw_output']}") -------------------------------------------------------------------------------- /test/run_all_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run all Pathik command ordering tests 3 | 4 | set -e # Exit on any error 5 | 6 | echo "===== PATHIK COMMAND ORDERING TESTS =====" 7 | echo "Running tests to verify the command ordering fix" 8 | echo 9 | 10 | # Navigate to test directory 11 | cd "$(dirname "$0")" 12 | 13 | # Print current directory and Python environment 14 | echo "Current directory: $(pwd)" 15 | echo "Python interpreter: $(which python)" 16 | echo 17 | 18 | # Run the command construction test first (dry run without actual API calls) 19 | echo "Running command construction tests..." 20 | python debug_command_order.py 21 | echo 22 | 23 | # Run the full implementation test (actual API calls) 24 | echo "Running full implementation tests..." 25 | python python_test.py 26 | echo 27 | 28 | echo "===== ALL TESTS COMPLETED =====" -------------------------------------------------------------------------------- /test/simple_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Simple direct test for pathik with the fixed command ordering.
4 | """ 5 | import os 6 | import sys 7 | import json 8 | 9 | # Add the parent directory to the path 10 | sys.path.insert(0, os.path.abspath('..')) 11 | 12 | # Import pathik directly 13 | import pathik 14 | 15 | def main(): 16 | """Test pathik directly to verify command ordering fix""" 17 | print(f"Testing pathik version: {pathik.__version__}") 18 | 19 | # Create output directory 20 | output_dir = "simple_test_output" 21 | os.makedirs(output_dir, exist_ok=True) 22 | 23 | # Test URL 24 | url = "https://example.com" 25 | 26 | print(f"\nCrawling {url}...") 27 | print(f"Output directory: {output_dir}") 28 | 29 | try: 30 | # This is the key test - this would fail before the fix 31 | # because -outdir would be placed after URLs 32 | results = pathik.crawl(url, output_dir=output_dir) 33 | 34 | if url in results and "html" in results[url]: 35 | print(f"\n✅ SUCCESS: {url} was crawled correctly") 36 | print(f"HTML file: {results[url]['html']}") 37 | print(f"Markdown file: {results[url]['markdown']}") 38 | 39 | # Check if files exist and have content 40 | html_file = results[url]['html'] 41 | md_file = results[url]['markdown'] 42 | 43 | if os.path.exists(html_file) and os.path.getsize(html_file) > 0: 44 | print(f"✅ HTML file has content ({os.path.getsize(html_file)} bytes)") 45 | else: 46 | print(f"❌ HTML file is missing or empty") 47 | 48 | if os.path.exists(md_file) and os.path.getsize(md_file) > 0: 49 | print(f"✅ Markdown file has content ({os.path.getsize(md_file)} bytes)") 50 | else: 51 | print(f"❌ Markdown file is missing or empty") 52 | else: 53 | print(f"❌ ERROR: Failed to crawl {url}") 54 | print(f"Results: {json.dumps(results, indent=2)}") 55 | 56 | except Exception as e: 57 | print(f"❌ ERROR: {str(e)}") 58 | import traceback 59 | traceback.print_exc() 60 | 61 | if __name__ == "__main__": 62 | main() -------------------------------------------------------------------------------- /test_secure_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Secure Kafka streaming test using our fixed implementation with customizable buffer sizes. 
4 | """ 5 | import os 6 | import sys 7 | import uuid 8 | import argparse 9 | import subprocess 10 | from typing import List, Optional, Dict, Any 11 | import time 12 | 13 | # Force use of the local binary 14 | LOCAL_BINARY = os.path.join(os.path.dirname(os.path.abspath(__file__)), "pathik_bin") 15 | 16 | def stream_to_kafka_direct( 17 | urls: List[str], 18 | kafka_brokers: str = "localhost:9092", 19 | kafka_topic: str = "pathik_crawl_data", 20 | content_type: str = "both", 21 | session_id: Optional[str] = None, 22 | max_message_size: int = 10 * 1024 * 1024, # 10MB 23 | buffer_memory: int = 100 * 1024 * 1024, # 100MB 24 | max_request_size: int = 20 * 1024 * 1024 # 20MB 25 | ) -> Dict[str, Any]: 26 | """ 27 | Stream directly using our secured binary with buffer size customization 28 | """ 29 | if not os.path.exists(LOCAL_BINARY): 30 | raise FileNotFoundError(f"Local binary not found at {LOCAL_BINARY}") 31 | 32 | # Ensure binary is executable 33 | if not os.access(LOCAL_BINARY, os.X_OK): 34 | os.chmod(LOCAL_BINARY, 0o755) 35 | 36 | # Generate a session ID if not provided 37 | if not session_id: 38 | session_id = str(uuid.uuid4()) 39 | 40 | # Set environment variables for Kafka configuration 41 | env = os.environ.copy() 42 | env["KAFKA_BROKERS"] = kafka_brokers 43 | env["KAFKA_TOPIC"] = kafka_topic 44 | 45 | # Add buffer size configurations 46 | env["KAFKA_MAX_MESSAGE_SIZE"] = str(max_message_size) 47 | env["KAFKA_BUFFER_MEMORY"] = str(buffer_memory) 48 | env["KAFKA_MAX_REQUEST_SIZE"] = str(max_request_size) 49 | 50 | # Build command 51 | cmd = [ 52 | LOCAL_BINARY, 53 | "-kafka", 54 | "-content", content_type, 55 | "-session", session_id 56 | ] 57 | 58 | # Add URLs 59 | cmd.extend(urls) 60 | 61 | print(f"Running command: {' '.join(cmd)}") 62 | result = subprocess.run(cmd, env=env, capture_output=True, text=True) 63 | 64 | if result.returncode != 0: 65 | print(f"Error running command: {result.stderr}") 66 | raise RuntimeError(f"Command failed with exit code {result.returncode}") 67 | 68 | print(result.stdout) 69 | 70 | # Format results 71 | results = {} 72 | successful = 0 73 | for url in urls: 74 | results[url] = { 75 | "success": True, 76 | "details": { 77 | "topic": kafka_topic, 78 | "session_id": session_id 79 | } 80 | } 81 | successful += 1 82 | 83 | return { 84 | "results": results, 85 | "success_count": successful, 86 | "failed_count": len(urls) - successful, 87 | "session_id": session_id 88 | } 89 | 90 | def main(): 91 | # Set up argument parser 92 | parser = argparse.ArgumentParser(description="Test secure Kafka streaming with custom buffer sizes") 93 | parser.add_argument("--urls", type=str, required=True, help="Comma-separated list of URLs to stream") 94 | parser.add_argument("--brokers", type=str, default="localhost:9092", help="Kafka broker list (comma-separated)") 95 | parser.add_argument("--topic", type=str, default="pathik_crawl_data", help="Kafka topic to stream to") 96 | parser.add_argument("--content", type=str, choices=["html", "markdown", "both"], default="both", 97 | help="Type of content to stream") 98 | parser.add_argument("--session", type=str, help="Session ID (generated if not provided)") 99 | 100 | # Add buffer size customization 101 | parser.add_argument("--max-message-size", type=int, default=10 * 1024 * 1024, 102 | help="Maximum message size in bytes (default: 10MB)") 103 | parser.add_argument("--buffer-memory", type=int, default=100 * 1024 * 1024, 104 | help="Producer buffer memory in bytes (default: 100MB)") 105 | parser.add_argument("--max-request-size", 
type=int, default=20 * 1024 * 1024, 106 | help="Maximum request size in bytes (default: 20MB)") 107 | 108 | args = parser.parse_args() 109 | 110 | # Parse URLs 111 | url_list = [url.strip() for url in args.urls.split(",")] 112 | 113 | # Generate a session ID if not provided 114 | session_id = args.session 115 | if not session_id: 116 | session_id = str(uuid.uuid4()) 117 | 118 | # Display config information 119 | print(f"Streaming {len(url_list)} URLs to Kafka with SECURE implementation:") 120 | print(f"Kafka Brokers: {args.brokers}") 121 | print(f"Kafka Topic: {args.topic}") 122 | print(f"Content Type: {args.content}") 123 | print(f"Session ID: {session_id}") 124 | print(f"Max Message Size: {args.max_message_size:,} bytes") 125 | print(f"Buffer Memory: {args.buffer_memory:,} bytes") 126 | print(f"Max Request Size: {args.max_request_size:,} bytes") 127 | print("="*50) 128 | 129 | # Stream to Kafka with our secure implementation 130 | try: 131 | start_time = time.time() 132 | response = stream_to_kafka_direct( 133 | urls=url_list, 134 | kafka_brokers=args.brokers, 135 | kafka_topic=args.topic, 136 | content_type=args.content, 137 | session_id=session_id, 138 | max_message_size=args.max_message_size, 139 | buffer_memory=args.buffer_memory, 140 | max_request_size=args.max_request_size 141 | ) 142 | elapsed_time = time.time() - start_time 143 | 144 | results = response["results"] 145 | successful = response["success_count"] 146 | failed = response["failed_count"] 147 | 148 | print("\nStreaming Results:") 149 | print("="*50) 150 | 151 | for url, result in results.items(): 152 | status = "✅ Success" if result.get("success", False) else "❌ Failed" 153 | if result.get("success", False): 154 | print(f"{status} - {url}") 155 | if "details" in result: 156 | details = result["details"] 157 | print(f" Topic: {details.get('topic')}") 158 | else: 159 | print(f"{status} - {url}") 160 | if "error" in result: 161 | print(f" Error: {result['error']}") 162 | 163 | print("\nSummary:") 164 | print(f"Successfully streamed: {successful}/{len(url_list)}") 165 | print(f"Failed to stream: {failed}/{len(url_list)}") 166 | print(f"Total time: {elapsed_time:.2f} seconds") 167 | 168 | print("\nTo consume these messages from Kafka:") 169 | print(f" python examples/kafka_consumer_direct.py --session={session_id}") 170 | 171 | except Exception as e: 172 | print(f"Error: {e}") 173 | sys.exit(1) 174 | 175 | if __name__ == "__main__": 176 | main() --------------------------------------------------------------------------------
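To read back what test_secure_kafka.py produced, a minimal consumer sketch (an illustration only, assuming kafka-python from the package's "kafka" extra; this is not the bundled examples/kafka_consumer_direct.py) that filters on the sessionID header written by StreamToKafka in storage/kafka.go:

# Minimal sketch: consume pathik messages for a single crawl session.
from kafka import KafkaConsumer  # installed via: pip install pathik[kafka]

SESSION_ID = "replace-with-the-session-id-printed-above"  # placeholder

consumer = KafkaConsumer(
    "pathik_crawl_data",                 # default topic in storage/kafka.go
    bootstrap_servers="localhost:9092",  # default broker in storage/kafka.go
    auto_offset_reset="earliest",
)

for message in consumer:
    # Headers attached by StreamToKafka/SendToKafka: url, contentType, sessionID, timestamp
    headers = {k: v.decode("utf-8", "replace") for k, v in (message.headers or [])}
    if headers.get("sessionID") != SESSION_ID:
        continue
    print(f"{headers.get('url')} [{headers.get('contentType')}] "
          f"{len(message.value)} bytes at {headers.get('timestamp')}")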