├── .github └── workflows │ └── CI.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── pyproject.toml ├── python ├── html2text_rs │ ├── __init__.py │ ├── html2text_rs.pyi │ └── py.typed └── tests │ └── test_html2text_rs.py └── src └── lib.rs /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.8.3 2 | # To update, run 3 | # 4 | # maturin generate-ci github -o .github/workflows/CI.yml --pytest 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | - '*' 13 | tags: 14 | - '*' 15 | pull_request: 16 | workflow_dispatch: 17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | linux: 23 | runs-on: ${{ matrix.platform.runner }} 24 | strategy: 25 | matrix: 26 | platform: 27 | - runner: ubuntu-22.04 28 | target: x86_64 29 | - runner: ubuntu-22.04 30 | target: x86 31 | - runner: ubuntu-22.04 32 | target: aarch64 33 | - runner: ubuntu-22.04 34 | target: armv7 35 | - runner: ubuntu-22.04 36 | target: s390x 37 | - runner: ubuntu-22.04 38 | target: ppc64le 39 | steps: 40 | - uses: actions/checkout@v4 41 | - uses: actions/setup-python@v5 42 | with: 43 | python-version: 3.x 44 | - name: Build wheels 45 | uses: PyO3/maturin-action@v1 46 | with: 47 | target: ${{ matrix.platform.target }} 48 | args: --release --out dist 49 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 50 | manylinux: auto 51 | - name: Build free-threaded wheels 52 | uses: PyO3/maturin-action@v1 53 | with: 54 | target: ${{ matrix.platform.target }} 55 | args: --release --out dist -i python3.13t 56 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 57 | manylinux: auto 58 | - name: Upload wheels 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: wheels-linux-${{ matrix.platform.target }} 62 | path: dist 63 | - name: pytest 64 | if: ${{ startsWith(matrix.platform.target, 'x86_64') }} 65 | shell: bash 66 | run: | 67 | set -e 68 | python3 -m venv 
.venv 69 | source .venv/bin/activate 70 | pip install html2text_rs --find-links dist --force-reinstall 71 | pip install pytest 72 | pytest 73 | - name: pytest 74 | if: ${{ !startsWith(matrix.platform.target, 'x86') && matrix.platform.target != 'ppc64' }} 75 | uses: uraimo/run-on-arch-action@v2 76 | with: 77 | arch: ${{ matrix.platform.target }} 78 | distro: ubuntu22.04 79 | githubToken: ${{ github.token }} 80 | install: | 81 | apt-get update 82 | apt-get install -y --no-install-recommends python3 python3-pip 83 | pip3 install -U pip pytest 84 | run: | 85 | set -e 86 | pip3 install html2text_rs --find-links dist --force-reinstall 87 | pytest 88 | 89 | musllinux: 90 | runs-on: ${{ matrix.platform.runner }} 91 | strategy: 92 | matrix: 93 | platform: 94 | - runner: ubuntu-22.04 95 | target: x86_64 96 | - runner: ubuntu-22.04 97 | target: x86 98 | - runner: ubuntu-22.04 99 | target: aarch64 100 | - runner: ubuntu-22.04 101 | target: armv7 102 | steps: 103 | - uses: actions/checkout@v4 104 | - uses: actions/setup-python@v5 105 | with: 106 | python-version: 3.x 107 | - name: Build wheels 108 | uses: PyO3/maturin-action@v1 109 | with: 110 | target: ${{ matrix.platform.target }} 111 | args: --release --out dist 112 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 113 | manylinux: musllinux_1_2 114 | - name: Build free-threaded wheels 115 | uses: PyO3/maturin-action@v1 116 | with: 117 | target: ${{ matrix.platform.target }} 118 | args: --release --out dist -i python3.13t 119 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 120 | manylinux: musllinux_1_2 121 | - name: Upload wheels 122 | uses: actions/upload-artifact@v4 123 | with: 124 | name: wheels-musllinux-${{ matrix.platform.target }} 125 | path: dist 126 | - name: pytest 127 | if: ${{ startsWith(matrix.platform.target, 'x86_64') }} 128 | uses: addnab/docker-run-action@v3 129 | with: 130 | image: alpine:latest 131 | options: -v ${{ github.workspace }}:/io -w /io 132 | run: | 133 | set -e 134 | apk add 
py3-pip py3-virtualenv 135 | python3 -m virtualenv .venv 136 | source .venv/bin/activate 137 | pip install html2text_rs --no-index --find-links dist --force-reinstall 138 | pip install pytest 139 | pytest 140 | - name: pytest 141 | if: ${{ !startsWith(matrix.platform.target, 'x86') }} 142 | uses: uraimo/run-on-arch-action@v2 143 | with: 144 | arch: ${{ matrix.platform.target }} 145 | distro: alpine_latest 146 | githubToken: ${{ github.token }} 147 | install: | 148 | apk add py3-virtualenv 149 | run: | 150 | set -e 151 | python3 -m virtualenv .venv 152 | source .venv/bin/activate 153 | pip install pytest 154 | pip install html2text_rs --find-links dist --force-reinstall 155 | pytest 156 | 157 | windows: 158 | runs-on: ${{ matrix.platform.runner }} 159 | strategy: 160 | matrix: 161 | platform: 162 | - runner: windows-latest 163 | target: x64 164 | - runner: windows-latest 165 | target: x86 166 | steps: 167 | - uses: actions/checkout@v4 168 | - uses: actions/setup-python@v5 169 | with: 170 | python-version: 3.x 171 | architecture: ${{ matrix.platform.target }} 172 | - name: Build wheels 173 | uses: PyO3/maturin-action@v1 174 | with: 175 | target: ${{ matrix.platform.target }} 176 | args: --release --out dist 177 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 178 | - name: Build free-threaded wheels 179 | uses: PyO3/maturin-action@v1 180 | with: 181 | target: ${{ matrix.platform.target }} 182 | args: --release --out dist -i python3.13t 183 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 184 | - name: Upload wheels 185 | uses: actions/upload-artifact@v4 186 | with: 187 | name: wheels-windows-${{ matrix.platform.target }} 188 | path: dist 189 | - name: pytest 190 | if: ${{ !startsWith(matrix.platform.target, 'aarch64') }} 191 | shell: bash 192 | run: | 193 | set -e 194 | python3 -m venv .venv 195 | source .venv/Scripts/activate 196 | pip install html2text_rs --find-links dist --force-reinstall 197 | pip install pytest 198 | pytest 199 | 200 | macos: 201 
| runs-on: ${{ matrix.platform.runner }} 202 | strategy: 203 | matrix: 204 | platform: 205 | - runner: macos-13 206 | target: x86_64 207 | - runner: macos-14 208 | target: aarch64 209 | steps: 210 | - uses: actions/checkout@v4 211 | - uses: actions/setup-python@v5 212 | with: 213 | python-version: 3.x 214 | - name: Build wheels 215 | uses: PyO3/maturin-action@v1 216 | with: 217 | target: ${{ matrix.platform.target }} 218 | args: --release --out dist 219 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 220 | - name: Build free-threaded wheels 221 | uses: PyO3/maturin-action@v1 222 | with: 223 | target: ${{ matrix.platform.target }} 224 | args: --release --out dist -i python3.13t 225 | sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} 226 | - name: Upload wheels 227 | uses: actions/upload-artifact@v4 228 | with: 229 | name: wheels-macos-${{ matrix.platform.target }} 230 | path: dist 231 | - name: pytest 232 | run: | 233 | set -e 234 | python3 -m venv .venv 235 | source .venv/bin/activate 236 | pip install html2text_rs --find-links dist --force-reinstall 237 | pip install pytest 238 | pytest 239 | 240 | sdist: 241 | runs-on: ubuntu-latest 242 | steps: 243 | - uses: actions/checkout@v4 244 | - name: Build sdist 245 | uses: PyO3/maturin-action@v1 246 | with: 247 | command: sdist 248 | args: --out dist 249 | - name: Upload sdist 250 | uses: actions/upload-artifact@v4 251 | with: 252 | name: wheels-sdist 253 | path: dist 254 | 255 | release: 256 | name: Release 257 | runs-on: ubuntu-latest 258 | if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} 259 | needs: [linux, musllinux, windows, macos, sdist] 260 | permissions: 261 | # Use to sign the release artifacts 262 | id-token: write 263 | # Used to upload release artifacts 264 | contents: write 265 | # Used to generate artifact attestation 266 | attestations: write 267 | steps: 268 | - uses: actions/download-artifact@v4 269 | - name: Generate artifact attestation 270 | 
uses: actions/attest-build-provenance@v2 271 | with: 272 | subject-path: 'wheels-*/*' 273 | - name: Publish to PyPI 274 | if: ${{ startsWith(github.ref, 'refs/tags/') }} 275 | uses: PyO3/maturin-action@v1 276 | env: 277 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 278 | with: 279 | command: upload 280 | args: --non-interactive --skip-existing wheels-*/* 281 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | .venv/ 14 | env/ 15 | bin/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | include/ 26 | man/ 27 | venv/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | pip-selfcheck.json 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | 48 | # Mr Developer 49 | .mr.developer.cfg 50 | .project 51 | .pydevproject 52 | 53 | # Rope 54 | .ropeproject 55 | 56 | # Django stuff: 57 | *.log 58 | *.pot 59 | 60 | .DS_Store 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyCharm 66 | .idea/ 67 | 68 | # VSCode 69 | .vscode/ 70 | 71 | # Pyenv 72 | .python-version 73 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "autocfg" 7 | version = "1.4.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 10 | 11 | [[package]] 12 | name = "bitflags" 13 | version = "2.9.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" 16 | 17 | [[package]] 18 | name = "cc" 19 | version = "1.2.19" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" 22 | dependencies = [ 23 | "shlex", 24 | ] 25 | 26 | [[package]] 27 | name = "cfg-if" 28 | version = "1.0.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 31 | 32 | [[package]] 33 | name = "futf" 34 | version = "0.1.5" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" 37 | dependencies = [ 38 | "mac", 39 | "new_debug_unreachable", 40 | ] 41 | 42 | [[package]] 43 | name = "heck" 44 | version = "0.5.0" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 47 | 48 | [[package]] 49 | name = "html2text" 50 | version = "0.14.3" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | checksum = "393aaeda74fd1ee299520131edd11dbbeda69dd0a88965cc4a71945b78439fe9" 53 | dependencies = [ 54 | "html5ever", 55 | "tendril", 56 | "thiserror", 57 | "unicode-width", 58 | ] 59 | 60 | [[package]] 61 | name = "html2text_rs" 62 | version = "0.2.4" 63 | dependencies = [ 64 | "html2text", 65 | "pyo3", 66 | ] 67 | 68 | [[package]] 69 | name = "html5ever" 70 | version = "0.31.0" 71 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 72 | checksum = "953cbbe631aae7fc0a112702ad5d3aaf09da38beaf45ea84610d6e1c358f569c" 73 | dependencies = [ 74 | "log", 75 | "mac", 76 | "markup5ever", 77 | "match_token", 78 | ] 79 | 80 | [[package]] 81 | name = "indoc" 82 | version = "2.0.6" 83 | source = "registry+https://github.com/rust-lang/crates.io-index" 84 | checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" 85 | 86 | [[package]] 87 | name = "libc" 88 | version = "0.2.172" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" 91 | 92 | [[package]] 93 | name = "lock_api" 94 | version = "0.4.12" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" 97 | dependencies = [ 98 | "autocfg", 99 | "scopeguard", 100 | ] 101 | 102 | [[package]] 103 | name = "log" 104 | version = "0.4.27" 105 | source = "registry+https://github.com/rust-lang/crates.io-index" 106 | checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" 107 | 108 | [[package]] 109 | name = "mac" 110 | version = "0.1.1" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 113 | 114 | [[package]] 115 | name = "markup5ever" 116 | version = "0.16.0" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = "0ba2225413ed418d540a2c8247d794f4b0527a021da36f69c05344d716dc44c1" 119 | dependencies = [ 120 | "log", 121 | "phf", 122 | "phf_codegen", 123 | "string_cache", 124 | "string_cache_codegen", 125 | "tendril", 126 | ] 127 | 128 | [[package]] 129 | name = "match_token" 130 | version = "0.1.0" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = 
"88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" 133 | dependencies = [ 134 | "proc-macro2", 135 | "quote", 136 | "syn", 137 | ] 138 | 139 | [[package]] 140 | name = "memoffset" 141 | version = "0.9.1" 142 | source = "registry+https://github.com/rust-lang/crates.io-index" 143 | checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" 144 | dependencies = [ 145 | "autocfg", 146 | ] 147 | 148 | [[package]] 149 | name = "new_debug_unreachable" 150 | version = "1.0.6" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" 153 | 154 | [[package]] 155 | name = "once_cell" 156 | version = "1.21.3" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" 159 | 160 | [[package]] 161 | name = "parking_lot" 162 | version = "0.12.3" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" 165 | dependencies = [ 166 | "lock_api", 167 | "parking_lot_core", 168 | ] 169 | 170 | [[package]] 171 | name = "parking_lot_core" 172 | version = "0.9.10" 173 | source = "registry+https://github.com/rust-lang/crates.io-index" 174 | checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" 175 | dependencies = [ 176 | "cfg-if", 177 | "libc", 178 | "redox_syscall", 179 | "smallvec", 180 | "windows-targets", 181 | ] 182 | 183 | [[package]] 184 | name = "phf" 185 | version = "0.11.3" 186 | source = "registry+https://github.com/rust-lang/crates.io-index" 187 | checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" 188 | dependencies = [ 189 | "phf_shared", 190 | ] 191 | 192 | [[package]] 193 | name = "phf_codegen" 194 | version = "0.11.3" 195 | source = "registry+https://github.com/rust-lang/crates.io-index" 196 | checksum = 
"aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" 197 | dependencies = [ 198 | "phf_generator", 199 | "phf_shared", 200 | ] 201 | 202 | [[package]] 203 | name = "phf_generator" 204 | version = "0.11.3" 205 | source = "registry+https://github.com/rust-lang/crates.io-index" 206 | checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 207 | dependencies = [ 208 | "phf_shared", 209 | "rand", 210 | ] 211 | 212 | [[package]] 213 | name = "phf_shared" 214 | version = "0.11.3" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" 217 | dependencies = [ 218 | "siphasher", 219 | ] 220 | 221 | [[package]] 222 | name = "portable-atomic" 223 | version = "1.11.0" 224 | source = "registry+https://github.com/rust-lang/crates.io-index" 225 | checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" 226 | 227 | [[package]] 228 | name = "precomputed-hash" 229 | version = "0.1.1" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 232 | 233 | [[package]] 234 | name = "proc-macro2" 235 | version = "1.0.95" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" 238 | dependencies = [ 239 | "unicode-ident", 240 | ] 241 | 242 | [[package]] 243 | name = "pyo3" 244 | version = "0.24.1" 245 | source = "registry+https://github.com/rust-lang/crates.io-index" 246 | checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229" 247 | dependencies = [ 248 | "cfg-if", 249 | "indoc", 250 | "libc", 251 | "memoffset", 252 | "once_cell", 253 | "portable-atomic", 254 | "pyo3-build-config", 255 | "pyo3-ffi", 256 | "pyo3-macros", 257 | "unindent", 258 | ] 259 | 260 | [[package]] 261 | name = "pyo3-build-config" 262 | version = 
"0.24.1" 263 | source = "registry+https://github.com/rust-lang/crates.io-index" 264 | checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1" 265 | dependencies = [ 266 | "once_cell", 267 | "python3-dll-a", 268 | "target-lexicon", 269 | ] 270 | 271 | [[package]] 272 | name = "pyo3-ffi" 273 | version = "0.24.1" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc" 276 | dependencies = [ 277 | "libc", 278 | "pyo3-build-config", 279 | ] 280 | 281 | [[package]] 282 | name = "pyo3-macros" 283 | version = "0.24.1" 284 | source = "registry+https://github.com/rust-lang/crates.io-index" 285 | checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44" 286 | dependencies = [ 287 | "proc-macro2", 288 | "pyo3-macros-backend", 289 | "quote", 290 | "syn", 291 | ] 292 | 293 | [[package]] 294 | name = "pyo3-macros-backend" 295 | version = "0.24.1" 296 | source = "registry+https://github.com/rust-lang/crates.io-index" 297 | checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855" 298 | dependencies = [ 299 | "heck", 300 | "proc-macro2", 301 | "pyo3-build-config", 302 | "quote", 303 | "syn", 304 | ] 305 | 306 | [[package]] 307 | name = "python3-dll-a" 308 | version = "0.2.13" 309 | source = "registry+https://github.com/rust-lang/crates.io-index" 310 | checksum = "49fe4227a288cf9493942ad0220ea3f185f4d1f2a14f197f7344d6d02f4ed4ed" 311 | dependencies = [ 312 | "cc", 313 | ] 314 | 315 | [[package]] 316 | name = "quote" 317 | version = "1.0.40" 318 | source = "registry+https://github.com/rust-lang/crates.io-index" 319 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 320 | dependencies = [ 321 | "proc-macro2", 322 | ] 323 | 324 | [[package]] 325 | name = "rand" 326 | version = "0.8.5" 327 | source = "registry+https://github.com/rust-lang/crates.io-index" 328 | checksum = 
"34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 329 | dependencies = [ 330 | "rand_core", 331 | ] 332 | 333 | [[package]] 334 | name = "rand_core" 335 | version = "0.6.4" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 338 | 339 | [[package]] 340 | name = "redox_syscall" 341 | version = "0.5.11" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "d2f103c6d277498fbceb16e84d317e2a400f160f46904d5f5410848c829511a3" 344 | dependencies = [ 345 | "bitflags", 346 | ] 347 | 348 | [[package]] 349 | name = "scopeguard" 350 | version = "1.2.0" 351 | source = "registry+https://github.com/rust-lang/crates.io-index" 352 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 353 | 354 | [[package]] 355 | name = "serde" 356 | version = "1.0.219" 357 | source = "registry+https://github.com/rust-lang/crates.io-index" 358 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" 359 | dependencies = [ 360 | "serde_derive", 361 | ] 362 | 363 | [[package]] 364 | name = "serde_derive" 365 | version = "1.0.219" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 368 | dependencies = [ 369 | "proc-macro2", 370 | "quote", 371 | "syn", 372 | ] 373 | 374 | [[package]] 375 | name = "shlex" 376 | version = "1.3.0" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 379 | 380 | [[package]] 381 | name = "siphasher" 382 | version = "1.0.1" 383 | source = "registry+https://github.com/rust-lang/crates.io-index" 384 | checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" 385 | 386 | [[package]] 387 | name = "smallvec" 388 | version = "1.15.0" 389 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 390 | checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" 391 | 392 | [[package]] 393 | name = "string_cache" 394 | version = "0.8.9" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" 397 | dependencies = [ 398 | "new_debug_unreachable", 399 | "parking_lot", 400 | "phf_shared", 401 | "precomputed-hash", 402 | "serde", 403 | ] 404 | 405 | [[package]] 406 | name = "string_cache_codegen" 407 | version = "0.5.4" 408 | source = "registry+https://github.com/rust-lang/crates.io-index" 409 | checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" 410 | dependencies = [ 411 | "phf_generator", 412 | "phf_shared", 413 | "proc-macro2", 414 | "quote", 415 | ] 416 | 417 | [[package]] 418 | name = "syn" 419 | version = "2.0.100" 420 | source = "registry+https://github.com/rust-lang/crates.io-index" 421 | checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" 422 | dependencies = [ 423 | "proc-macro2", 424 | "quote", 425 | "unicode-ident", 426 | ] 427 | 428 | [[package]] 429 | name = "target-lexicon" 430 | version = "0.13.2" 431 | source = "registry+https://github.com/rust-lang/crates.io-index" 432 | checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" 433 | 434 | [[package]] 435 | name = "tendril" 436 | version = "0.4.3" 437 | source = "registry+https://github.com/rust-lang/crates.io-index" 438 | checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" 439 | dependencies = [ 440 | "futf", 441 | "mac", 442 | "utf-8", 443 | ] 444 | 445 | [[package]] 446 | name = "thiserror" 447 | version = "2.0.12" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" 450 | dependencies = [ 451 | "thiserror-impl", 452 | ] 453 
| 454 | [[package]] 455 | name = "thiserror-impl" 456 | version = "2.0.12" 457 | source = "registry+https://github.com/rust-lang/crates.io-index" 458 | checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" 459 | dependencies = [ 460 | "proc-macro2", 461 | "quote", 462 | "syn", 463 | ] 464 | 465 | [[package]] 466 | name = "unicode-ident" 467 | version = "1.0.18" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 470 | 471 | [[package]] 472 | name = "unicode-width" 473 | version = "0.2.0" 474 | source = "registry+https://github.com/rust-lang/crates.io-index" 475 | checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" 476 | 477 | [[package]] 478 | name = "unindent" 479 | version = "0.2.4" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" 482 | 483 | [[package]] 484 | name = "utf-8" 485 | version = "0.7.6" 486 | source = "registry+https://github.com/rust-lang/crates.io-index" 487 | checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 488 | 489 | [[package]] 490 | name = "windows-targets" 491 | version = "0.52.6" 492 | source = "registry+https://github.com/rust-lang/crates.io-index" 493 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 494 | dependencies = [ 495 | "windows_aarch64_gnullvm", 496 | "windows_aarch64_msvc", 497 | "windows_i686_gnu", 498 | "windows_i686_gnullvm", 499 | "windows_i686_msvc", 500 | "windows_x86_64_gnu", 501 | "windows_x86_64_gnullvm", 502 | "windows_x86_64_msvc", 503 | ] 504 | 505 | [[package]] 506 | name = "windows_aarch64_gnullvm" 507 | version = "0.52.6" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 510 | 511 | [[package]] 512 | 
name = "windows_aarch64_msvc" 513 | version = "0.52.6" 514 | source = "registry+https://github.com/rust-lang/crates.io-index" 515 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 516 | 517 | [[package]] 518 | name = "windows_i686_gnu" 519 | version = "0.52.6" 520 | source = "registry+https://github.com/rust-lang/crates.io-index" 521 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 522 | 523 | [[package]] 524 | name = "windows_i686_gnullvm" 525 | version = "0.52.6" 526 | source = "registry+https://github.com/rust-lang/crates.io-index" 527 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 528 | 529 | [[package]] 530 | name = "windows_i686_msvc" 531 | version = "0.52.6" 532 | source = "registry+https://github.com/rust-lang/crates.io-index" 533 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 534 | 535 | [[package]] 536 | name = "windows_x86_64_gnu" 537 | version = "0.52.6" 538 | source = "registry+https://github.com/rust-lang/crates.io-index" 539 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 540 | 541 | [[package]] 542 | name = "windows_x86_64_gnullvm" 543 | version = "0.52.6" 544 | source = "registry+https://github.com/rust-lang/crates.io-index" 545 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 546 | 547 | [[package]] 548 | name = "windows_x86_64_msvc" 549 | version = "0.52.6" 550 | source = "registry+https://github.com/rust-lang/crates.io-index" 551 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 552 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "html2text_rs" 3 | version = "0.2.4" 4 | edition = "2021" 5 | description = "Convert HTML to markdown or plain text" 6 | authors = ["deedy5"] 7 | 8 | # 
See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | [lib] 10 | name = "html2text_rs" 11 | crate-type = ["cdylib"] 12 | 13 | [dependencies] 14 | pyo3 = { version = "0.24.1", features = ["extension-module", "abi3-py38", "generate-import-lib"] } 15 | html2text = "0.14.3" 16 | 17 | [profile.release] 18 | codegen-units = 1 19 | lto = "fat" 20 | opt-level = 3 21 | panic = "abort" 22 | strip = "symbols" 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 deedy5 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  [](https://github.com/deedy5/html2text_rs/releases) [](https://pypi.org/project/html2text_rs) [](https://pepy.tech/project/html2text_rs) [](https://github.com/deedy5/html2text_rs/actions/workflows/CI.yml) 2 | 3 | # html2text_rs 4 | Convert HTML to markdown or plain text. 5 | Python binding to the rust [rust-html2text](https://github.com/jugglerchris/rust-html2text) library. 6 | 7 | ## Table of Contents 8 | 9 | - [Installation](#installation) 10 | - [Usage](#usage) 11 | - [text_markdown()](#1-text_markdown) 12 | - [text_plain()](#2-text_plain) 13 | - [text_rich()](#3-text_rich) 14 | 15 | ## Installation 16 | 17 | ```python 18 | pip install -U html2text_rs 19 | ``` 20 | 21 | ## Usage 22 | ### 1. text_markdown() 23 | ```python 24 | def text_markdown(html: str, width: int = 100): 25 | """Convert HTML to markdown text. 26 | 27 | Args: 28 | html (str): input html text. 29 | width (int): wrap text to width columns. Default is 100. 30 | 31 | """ 32 | ``` 33 | example: 34 | ```python 35 | import html2text_rs 36 | import requests 37 | 38 | resp = requests.get("https://en.wikipedia.org/wiki/AGM-88_HARM") 39 | 40 | text_markdown = html2text_rs.text_markdown(resp.text) 41 | print(text_markdown) 42 | ``` 43 | ### 2. text_plain() 44 | ```python 45 | def text_plain(html: str, width: int = 100): 46 | """Convert HTML to plain text. 47 | 48 | Args: 49 | html (str): input html text. 50 | width (int): wrap text to width columns. Default is 100. 51 | 52 | """ 53 | ``` 54 | example: 55 | ```python 56 | import html2text_rs 57 | import requests 58 | 59 | resp = requests.get("https://en.wikipedia.org/wiki/AGM-88_HARM") 60 | 61 | text_plain = html2text_rs.text_plain(resp.text) 62 | print(text_plain) 63 | ``` 64 | ### 3. 
text_rich() 65 | ```python 66 | def text_rich(html: str, width: int = 100): 67 | """Convert HTML to rich text. 68 | 69 | Args: 70 | html (str): input html text. 71 | width (int): wrap text to width columns. Default is 100. 72 | 73 | """ 74 | ``` 75 | example: 76 | ```python 77 | import html2text_rs 78 | import requests 79 | 80 | resp = requests.get("https://en.wikipedia.org/wiki/AGM-88_HARM") 81 | 82 | text_rich = html2text_rs.text_rich(resp.text) 83 | print(text_rich) 84 | ``` 85 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.8,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "html2text_rs" 7 | description = "Convert HTML to markdown or plain text" 8 | keywords = [ 9 | "python", "html2text", "html-to-text", "html-to-markdown", "html2md", "markdown" 10 | ] 11 | authors = [ 12 | {name = "deedy5"} 13 | ] 14 | requires-python = ">=3.8" 15 | license = {text = "MIT License"} 16 | classifiers = [ 17 | "Programming Language :: Rust", 18 | "Programming Language :: Python :: Implementation :: CPython", 19 | "Programming Language :: Python :: Implementation :: PyPy", 20 | "Topic :: Text Editors :: Text Processing", 21 | "Topic :: Text Processing :: Markup :: HTML", 22 | "Topic :: Text Processing :: Markup :: Markdown", 23 | ] 24 | dynamic = ["version"] 25 | 26 | dependencies = [] 27 | 28 | [project.optional-dependencies] 29 | dev = [ 30 | "pytest>=8.3.2", 31 | ] 32 | 33 | [tool.maturin] 34 | python-source = "python" 35 | features = ["pyo3/extension-module"] 36 | -------------------------------------------------------------------------------- /python/html2text_rs/__init__.py: -------------------------------------------------------------------------------- 1 | from .html2text_rs import * 2 | -------------------------------------------------------------------------------- 
/python/html2text_rs/html2text_rs.pyi: -------------------------------------------------------------------------------- 1 | def text_markdown(html: str, width: int = 100) -> str: ... 2 | def text_plain(html: str, width: int = 100) -> str: ... 3 | def text_rich(html: str, width: int = 100) -> str: ... 4 | -------------------------------------------------------------------------------- /python/html2text_rs/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deedy5/html2text_rs/9dfc339420da1c67f7fbb2c3c7cf30e4fc443a64/python/html2text_rs/py.typed -------------------------------------------------------------------------------- /python/tests/test_html2text_rs.py: -------------------------------------------------------------------------------- 1 | import html2text_rs # type: ignore 2 | 3 | 4 | def test_text_markdown(): 5 | html = "
<h1>Hello World</h1><p>This is a test.</p>
" 6 | expected_output = "# Hello World\n\nThis is a test.\n" 7 | result = html2text_rs.text_markdown(html, width=80) 8 | assert result == expected_output, ( 9 | f"\nExpected:\n {expected_output} \nGot:\n {result}" 10 | ) 11 | 12 | 13 | def test_text_plain(): 14 | html = "<h1>Hello World</h1><p>This is a test.</p>
" 15 | expected_output = "Hello World\n\nThis is a test.\n" 16 | result = html2text_rs.text_plain(html, width=80) 17 | assert result == expected_output, ( 18 | f"\nExpected:\n {expected_output} \nGot:\n {result}" 19 | ) 20 | 21 | 22 | def test_text_rich(): 23 | html = "<h1>Hello World</h1><p>This is a test.</p>
" 24 | expected_output = "# Hello World\n\nThis is a test.\n" 25 | result = html2text_rs.text_rich(html, width=80) 26 | assert result == expected_output, ( 27 | f"\nExpected:\n {expected_output} \nGot:\n {result}" 28 | ) 29 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use html2text::{ 2 | from_read, from_read_with_decorator, 3 | render::{RichDecorator, TrivialDecorator}, 4 | }; 5 | use pyo3::{prelude::*, types::PyString}; 6 | 7 | /// Convert HTML to markdown text 8 | #[inline(always)] 9 | #[pyfunction] 10 | #[pyo3(signature=(html, width=100))] 11 | fn text_markdown<'py>(html: &Bound<'py, PyString>, width: usize) -> PyResult