├── .formatter.exs ├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ ├── release.yml │ └── rust-ci.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── RELEASE_CHECKLIST.md ├── lib ├── html5ever.ex └── html5ever │ └── native.ex ├── mix.exs ├── mix.lock ├── native └── html5ever_nif │ ├── .cargo │ └── config.toml │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml │ └── src │ ├── common.rs │ ├── flat_dom.rs │ └── lib.rs ├── priv └── test_data │ ├── drudgereport.html │ └── example.html └── test ├── html5ever_test.exs └── test_helper.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | [ 2 | inputs: ["*.{ex,exs}", "{config,lib,test}/**/*.{ex,exs}"] 3 | ] 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: mix 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 5 8 | 9 | - package-ecosystem: cargo 10 | directory: "/native/html5ever_nif" 11 | schedule: 12 | interval: daily 13 | open-pull-requests-limit: 5 14 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - master 8 | 9 | jobs: 10 | mix_test: 11 | runs-on: ubuntu-22.04 12 | env: 13 | MIX_ENV: test 14 | HTML5EVER_BUILD: "true" 15 | 16 | name: Elixir ${{ matrix.pair.elixir }} / OTP ${{ matrix.pair.otp }} 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | include: 22 | - pair: 23 | elixir: 1.13.4 24 | otp: "24.3" 25 | - pair: 26 | elixir: 1.16.1 27 | otp: "26.2" 28 | lint: lint 29 | steps: 30 | - uses: actions/checkout@v4 31 | 32 | - uses: erlef/setup-beam@v1 33 | with: 34 | otp-version: 
${{ matrix.pair.otp }} 35 | elixir-version: ${{ matrix.pair.elixir }} 36 | 37 | - name: Install minimal stable Rust toolchain 38 | uses: dtolnay/rust-toolchain@stable 39 | 40 | - name: Install Dependencies 41 | run: mix deps.get 42 | 43 | - run: mix format --check-formatted 44 | if: ${{ matrix.lint }} 45 | 46 | - run: mix deps.unlock --check-unused 47 | if: ${{ matrix.lint }} 48 | 49 | - run: mix deps.compile 50 | 51 | - run: mix compile --warnings-as-errors 52 | if: ${{ matrix.lint }} 53 | 54 | - run: mix test 55 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build precompiled NIFs 2 | 3 | permissions: 4 | id-token: write 5 | attestations: write 6 | contents: write 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | - master 13 | paths: 14 | # Just run on main branch if "native" path changed. 15 | - "native/**" 16 | # Also run if this file changes. 17 | - ".github/workflows/release.yml" 18 | tags: 19 | # Tags will always run. 20 | - "*" 21 | pull_request: 22 | paths: 23 | # In PRs we only run if this file changes. 
24 | - ".github/workflows/release.yml" 25 | workflow_dispatch: 26 | 27 | jobs: 28 | build_release: 29 | name: NIF ${{ matrix.nif }} - ${{ matrix.job.target }} (${{ matrix.job.os }}) 30 | runs-on: ${{ matrix.job.os }} 31 | strategy: 32 | fail-fast: false 33 | matrix: 34 | nif: ["2.15"] 35 | job: 36 | - { target: arm-unknown-linux-gnueabihf , os: ubuntu-20.04 , use-cross: true } 37 | - { target: aarch64-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true } 38 | - { target: aarch64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true } 39 | - { target: aarch64-apple-darwin , os: macos-13 } 40 | - { target: riscv64gc-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true } 41 | - { target: x86_64-apple-darwin , os: macos-13 } 42 | - { target: x86_64-unknown-linux-gnu , os: ubuntu-20.04 } 43 | - { target: x86_64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true } 44 | - { target: x86_64-pc-windows-gnu , os: windows-2019 } 45 | - { target: x86_64-pc-windows-msvc , os: windows-2019 } 46 | 47 | steps: 48 | - name: Checkout source code 49 | uses: actions/checkout@v4 50 | 51 | - name: Extract project version 52 | shell: bash 53 | run: | 54 | # Get the project version from mix.exs 55 | echo "PROJECT_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1)" >> $GITHUB_ENV 56 | 57 | - name: Install Rust toolchain 58 | uses: dtolnay/rust-toolchain@stable 59 | with: 60 | toolchain: stable 61 | target: ${{ matrix.job.target }} 62 | 63 | - name: Build the project 64 | id: build-crate 65 | uses: philss/rustler-precompiled-action@v1.1.4 66 | with: 67 | project-name: html5ever_nif 68 | project-version: ${{ env.PROJECT_VERSION }} 69 | target: ${{ matrix.job.target }} 70 | nif-version: ${{ matrix.nif }} 71 | use-cross: ${{ matrix.job.use-cross }} 72 | project-dir: "native/html5ever_nif" 73 | 74 | - name: Artifact attestation 75 | uses: actions/attest-build-provenance@v1 76 | with: 77 | subject-path: ${{ steps.build-crate.outputs.file-path }} 78 | 79 | - name: 
Artifact upload 80 | uses: actions/upload-artifact@v4 81 | with: 82 | name: ${{ steps.build-crate.outputs.file-name }} 83 | path: ${{ steps.build-crate.outputs.file-path }} 84 | 85 | - name: Write SHA256 to the summary 86 | run: | 87 | echo "SHA256 for this artifact:" >> $GITHUB_STEP_SUMMARY 88 | echo "${{ steps.build-crate.outputs.file-sha256 }} ${{ steps.build-crate.outputs.file-name }}" >> $GITHUB_STEP_SUMMARY 89 | 90 | - name: Publish archives and packages 91 | uses: softprops/action-gh-release@v2 92 | with: 93 | files: | 94 | ${{ steps.build-crate.outputs.file-path }} 95 | if: startsWith(github.ref, 'refs/tags/') 96 | -------------------------------------------------------------------------------- /.github/workflows/rust-ci.yml: -------------------------------------------------------------------------------- 1 | name: Rust CI 2 | on: 3 | push: 4 | branches: 5 | - master 6 | paths: 7 | - "native/**" 8 | pull_request: 9 | paths: 10 | - "native/**" 11 | workflow_dispatch: 12 | 13 | jobs: 14 | lint-rust: 15 | name: Lint Rust 16 | runs-on: ubuntu-22.04 17 | strategy: 18 | matrix: 19 | manifest: 20 | - native/html5ever_nif/Cargo.toml 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - uses: dtolnay/rust-toolchain@stable 26 | with: 27 | components: rustfmt, clippy 28 | 29 | - uses: Swatinem/rust-cache@v2 30 | with: 31 | workspaces: | 32 | native/html5ever_nif 33 | 34 | - name: run rustfmt 35 | run: cargo fmt --manifest-path=${{ matrix.manifest }} --all -- --check 36 | 37 | - name: run clippy 38 | run: cargo clippy --manifest-path=${{ matrix.manifest }} -- -Dwarnings 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover 6 | 7 | # The directory Mix downloads your dependencies sources to. 
8 | /deps 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | /priv/native 23 | 24 | /native/*/target 25 | 26 | # The checksum files for precompiled NIFs 27 | checksum-*.exs 28 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [0.16.1] - 2024-05-15 10 | 11 | ### Fixed 12 | 13 | - Fix parsing of HTML documents that may start with a comment or an XML doc tag. 14 | 15 | ## [0.16.0] - 2024-03-25 16 | 17 | ### Fixed 18 | 19 | - Fix parsing of comments in `parse/1`. 20 | - Avoid panic when parsing content with the "template" tag. 21 | 22 | ### Removed 23 | 24 | - Drop support for Elixir 1.12 25 | 26 | ## [0.15.0] - 2023-06-16 27 | 28 | ### Added 29 | 30 | - Add two new functions to parse documents: 31 | 32 | * `parse_with_attributes_as_maps/1` 33 | * `flat_parse_with_attributes_as_maps/1` 34 | 35 | And as the names suggest, they return a document tree with attributes as maps, 36 | instead of lists of pairs. These functions are useful to match node attributes, 37 | since the order of attributes does not matter most of the time. 38 | 39 | ### Fixed 40 | 41 | - Use dirty CPU scheduler for all functions. For some reason we were using a 42 | normal scheduler, but this could cause instability. 
43 | 44 | With a dirty scheduler we can parse medium to big files without worrying about 45 | lengthy work. Please read https://www.erlang.org/doc/man/erl_nif.html#lengthy_work 46 | for further information. 47 | 48 | ### Removed 49 | 50 | - Remove support for Elixir 1.11. 51 | 52 | ## [0.14.3] - 2023-05-26 53 | 54 | ### Added 55 | 56 | - Add precompilation target for Linux running on RISC-V 64-bit machines. 57 | This is useful for projects using Nerves. 58 | 59 | Note that this is going to require `rustler_precompiled` v0.6 or above, since 60 | that version includes RISC-V on Linux by default. 61 | 62 | - Add support for OTP 26 by updating the `rustler-sys` package. 63 | 64 | ## [0.14.2] - 2023-05-20 65 | 66 | ### Added 67 | 68 | - Add precompilation target for Linux running on ARM64 machines (both musl and gnu ABI). 69 | This is useful for projects using Nerves. 70 | 71 | Note that this is going to require `rustler_precompiled` v0.6 or above, since 72 | that version includes ARM64 on Linux by default. 73 | 74 | ## [0.14.1] - 2023-05-20 75 | 76 | ### Added 77 | 78 | - Add support for `rustler_precompiled` v0.6. 79 | 80 | ### Changed 81 | 82 | - Update Rustler version in the crate from `v0.26` to `v0.28`. 83 | This shouldn't break anything, but would require the installation of rustler `v0.28` 84 | if needed on the Elixir side. 85 | 86 | - Change the Rust edition to 2021 (it was 2018). This shouldn't change any behaviour. 87 | 88 | ## [0.14.0] - 2022-11-04 89 | 90 | ### Changed 91 | 92 | - Require `rustler_precompiled` equal or above `v0.5.2` - thanks [@Benjamin-Philip](https://github.com/Benjamin-Philip). 93 | - Use `Application.compile_env/3` instead of `Application.get_env/3` in the native module. 94 | 95 | ## [0.13.1] - 2022-06-24 96 | 97 | ### Fixed 98 | 99 | - Fix the precompilation build for targets using `cross` by adding a `Cross.toml` 100 | file with a setting that tells it to read the `RUSTLER_NIF_VERSION` env var from the host machine. 
101 | 102 | ## [0.13.0] - 2022-04-28 103 | 104 | ### Changed 105 | 106 | - Bump requirement for `rustler_precompiled` to `~> v0.4`. This is needed to avoid installing Rustler by default. 107 | - Bump `html5ever` (Rust crate) to `v0.26.0`. 108 | 109 | ## [0.12.0] - 2022-03-14 110 | 111 | ### Changed 112 | 113 | - Start using [`rustler_precompiled`](https://hex.pm/packages/rustler_precompiled) as 114 | dependency. 115 | 116 | ## [0.11.0] - 2021-12-15 117 | 118 | ### Security 119 | 120 | - Add checksum verification of precompiled NIF files before extracting 121 | them to the correct location. This is to avoid supply chain attacks. 122 | With this change we added a new mix task to download all the files 123 | and generate the checksum before publishing the package. Additionally 124 | the user can download only the local NIF file with the checksum. 125 | See the `RELEASE_CHECKLIST.md` file for details on how we ensure this 126 | works correctly. 127 | 128 | ### Removed 129 | 130 | - Remove support for Elixir 1.10 and below. This is to keep a policy of 131 | supporting the latest three Elixir versions. 132 | 133 | ### Changed 134 | 135 | - Switch from thread pool to being a dirty NIF. This prevents the 136 | resulting term from having to be sent between processes, and therefore 137 | prevents an extra copy from having to be performed. 138 | - In the FlatSink implementation for the NIF, track children in a pool 139 | instead of allocating new vectors for every node. This significantly 140 | reduces allocator pressure while parsing, and improves performance. 141 | - When converting a parsed FlatSink into its term representation, 142 | use a common child node stack instead of allocating a new one for every 143 | node. This significantly reduces allocator pressure while creating terms, 144 | and improves performance. 145 | - Start using LTO for the NIF compilation. This reduces the build size 146 | and improves performance. 
147 | 148 | ### Fixed 149 | 150 | - Fix the target selection when using `TARGET_*` env vars on macOS. 151 | 152 | ## [0.10.1] - 2021-11-24 153 | 154 | ### Fixed 155 | 156 | - It provides a precompiled NIF for ARM 64 bits running on Linux. This 157 | is needed for Raspberry Pi 4. 158 | 159 | ## [0.10.0] - 2021-11-24 160 | 161 | ### Added 162 | 163 | - Add the ability to download precompiled NIFs. We provide compiled 164 | NIF files on our GitHub releases page (from GitHub Actions) and the 165 | lib will try to download the correct NIF respecting the OS, NIF version 166 | and architecture of your build machine. This also works for Nerves 167 | projects that compile to different targets. This way the Rust toolchain 168 | is not needed for most people using this project. 169 | 170 | ### Fixed 171 | 172 | - Fix compilation on macOS. 173 | 174 | ## [0.9.0] - 2021-10-02 175 | 176 | ### Added 177 | 178 | - Add support for OTP 24. This was achieved by updating Rustler to v0.22. 179 | 180 | [Unreleased]: https://github.com/rusterlium/html5ever_elixir/compare/v0.16.1...HEAD 181 | [0.16.1]: https://github.com/rusterlium/html5ever_elixir/compare/v0.16.0...v0.16.1 182 | [0.16.0]: https://github.com/rusterlium/html5ever_elixir/compare/v0.15.0...v0.16.0 183 | [0.15.0]: https://github.com/rusterlium/html5ever_elixir/compare/v0.14.3...v0.15.0 184 | [0.14.3]: https://github.com/rusterlium/html5ever_elixir/compare/v0.14.2...v0.14.3 185 | [0.14.2]: https://github.com/rusterlium/html5ever_elixir/compare/v0.14.1...v0.14.2 186 | [0.14.1]: https://github.com/rusterlium/html5ever_elixir/compare/v0.14.0...v0.14.1 187 | [0.14.0]: https://github.com/rusterlium/html5ever_elixir/compare/v0.13.1...v0.14.0 188 | [0.13.1]: https://github.com/rusterlium/html5ever_elixir/compare/v0.13.0...v0.13.1 189 | [0.13.0]: https://github.com/rusterlium/html5ever_elixir/compare/v0.12.0...v0.13.0 190 | [0.12.0]: https://github.com/rusterlium/html5ever_elixir/compare/v0.11.0...v0.12.0 191 | [0.11.0]: 
https://github.com/rusterlium/html5ever_elixir/compare/v0.10.1...v0.11.0 192 | [0.10.1]: https://github.com/rusterlium/html5ever_elixir/compare/v0.10.0...v0.10.1 193 | [0.10.0]: https://github.com/rusterlium/html5ever_elixir/compare/v0.9.0...v0.10.0 194 | [0.9.0]: https://github.com/rusterlium/html5ever_elixir/releases/tag/v0.9.0 195 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 hansihe 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Html5ever binding for Elixir 2 | 3 | [![CI](https://github.com/rusterlium/html5ever_elixir/actions/workflows/ci.yml/badge.svg)](https://github.com/rusterlium/html5ever_elixir/actions/workflows/ci.yml) 4 | 5 | NIF binding of [html5ever](https://github.com/servo/html5ever) using [Rustler](https://github.com/rusterlium/rustler). 6 | 7 | It is currently functional with basic features. 8 | 9 | ## Installation 10 | 11 | The package can be installed by adding `html5ever` to your list of dependencies in `mix.exs`: 12 | 13 | ```elixir 14 | def deps do 15 | [{:html5ever, "~> 0.16.0"}] 16 | end 17 | ``` 18 | 19 | Or with [`Mix.install/2`](https://hexdocs.pm/mix/Mix.html#install/2): 20 | 21 | ```elixir 22 | Mix.install([:html5ever]) 23 | ``` 24 | 25 | ## Forcing compilation 26 | 27 | By default **you don't need Rust installed** because the lib will try to download 28 | a precompiled NIF file. In case you want to force compilation set the 29 | `HTML5EVER_BUILD` environment variable to `true` or `1`. 
Alternatively you can also set the 30 | application env `:build_from_source` to `true` in order to force the build: 31 | 32 | ```elixir 33 | config :html5ever, Html5ever, build_from_source: true 34 | ``` 35 | 36 | You also need to add Rustler to your dependencies when you want to force 37 | the compilation: 38 | 39 | ```elixir 40 | def deps do 41 | [ 42 | {:html5ever, "~> 0.16.0"}, 43 | {:rustler, ">= 0.0.0", optional: true} 44 | ] 45 | end 46 | ``` 47 | 48 | ## License 49 | 50 | Licensed under either of 51 | 52 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 53 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 54 | 55 | at your option. 56 | -------------------------------------------------------------------------------- /RELEASE_CHECKLIST.md: -------------------------------------------------------------------------------- 1 | # Release checklist 2 | 3 | In order to release a new version to Hex.pm we first need to: 4 | 5 | 1. write the changes in the `CHANGELOG.md` file 6 | 2. update the `README.md`, `CHANGELOG.md` and `mix.exs` with the new version 7 | 3. commit and create a tag for that version 8 | 4. push the changes to the repository with: `git push origin master --tags` 9 | 5. wait for the CI to build all release files 10 | 6. run `HTML5EVER_BUILD=1 mix rustler_precompiled.download Html5ever.Native --all --print` 11 | 7. copy the output of the mix task and add it to the release notes 12 | 8. run `mix hex.publish` and **make sure the checksum file is present** 13 | in the list of files to be published. Also make sure that the `target` 14 | directory of `native/html5ever_nif` is **NOT** present. 15 | 16 | It's important to ensure that we publish the checksum file with the 17 | package because otherwise the users won't be able to use the lib 18 | with precompiled files. They will need to always enforce compilation. 
 19 | -------------------------------------------------------------------------------- /lib/html5ever.ex: -------------------------------------------------------------------------------- 1 | defmodule Html5ever do 2 | @moduledoc """ 3 | This is an HTML parser written in Rust. 4 | 5 | The project provides a NIF - Native Implemented Function. 6 | It works on top of [a parser of the same name](https://github.com/servo/html5ever) 7 | from the Servo project. 8 | 9 | By default this lib will try to use a precompiled NIF 10 | from the GitHub releases page. This way you don't need 11 | to have the Rust toolchain installed. 12 | In case no precompiled file is found and the Mix env is 13 | production then an error is raised. 14 | 15 | You can force the compilation to occur by setting the 16 | value of the `HTML5EVER_BUILD` environment variable to 17 | "true" or "1". Alternatively you can also set the application 18 | env `:build_from_source` to `true` in order to force the build: 19 | 20 | config :html5ever, Html5ever, build_from_source: true 21 | 22 | This project is possible thanks to [Rustler](https://hexdocs.pm/rustler). 23 | """ 24 | 25 | @doc """ 26 | Parses an HTML document from a string. 27 | 28 | This returns a list of tuples representing the HTML tree. 29 | 30 | ## Example 31 | 32 | iex> Html5ever.parse("<!doctype html><html><body><h1>Hello world</h1></body></html>") 33 | {:ok, 34 | [ 35 | {:doctype, "html", "", ""}, 36 | {"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]} 37 | ]} 38 | 39 | """ 40 | def parse(html) when is_binary(html) do 41 | Html5ever.Native.parse(html, false) 42 | end 43 | 44 | @doc """ 45 | Same as `parse/1`, but with attributes as maps. 46 | 47 | This is going to remove duplicated attributes, keeping the ones 48 | that appear first. 49 | 50 | ## Example 51 | 52 | iex> Html5ever.parse_with_attributes_as_maps( 53 | ...> "<!doctype html><html><body><h1 class=title>Hello world</h1></body></html>" 54 | ...> ) 55 | {:ok, 56 | [ 57 | {:doctype, "html", "", ""}, 58 | {"html", %{}, [{"head", %{}, []}, {"body", %{}, [{"h1", %{"class" => "title"}, ["Hello world"]}]}]} 59 | ]} 60 | 61 | """ 62 | def parse_with_attributes_as_maps(html) when is_binary(html) do 63 | Html5ever.Native.parse(html, true) 64 | end 65 | 66 | @doc """ 67 | Parses an HTML document from a string and returns a map. 68 | 69 | The map contains the document structure. 70 | 71 | ## Example 72 | 73 | iex> Html5ever.flat_parse("<!doctype html><html><body><h1>Hello world</h1></body></html>") 74 | {:ok, 75 | %{ 76 | nodes: %{ 77 | 0 => %{id: 0, parent: nil, type: :document}, 78 | 1 => %{id: 1, parent: 0, type: :doctype}, 79 | 2 => %{ 80 | attrs: [], 81 | children: [3, 4], 82 | id: 2, 83 | name: "html", 84 | parent: 0, 85 | type: :element 86 | }, 87 | 3 => %{ 88 | attrs: [], 89 | children: [], 90 | id: 3, 91 | name: "head", 92 | parent: 2, 93 | type: :element 94 | }, 95 | 4 => %{ 96 | attrs: [], 97 | children: [5], 98 | id: 4, 99 | name: "body", 100 | parent: 2, 101 | type: :element 102 | }, 103 | 5 => %{ 104 | attrs: [], 105 | children: [6], 106 | id: 5, 107 | name: "h1", 108 | parent: 4, 109 | type: :element 110 | }, 111 | 6 => %{contents: "Hello world", id: 6, parent: 5, type: :text} 112 | }, 113 | root: 0 114 | }} 115 | 116 | """ 117 | def flat_parse(html) when is_binary(html) do 118 | Html5ever.Native.flat_parse(html, false) 119 | end 120 | 121 | @doc """ 122 | Same as `flat_parse/1`, but with attributes as maps. 123 | 124 | This is going to remove duplicated attributes, keeping the ones 125 | that appear first. 126 | """ 127 | def flat_parse_with_attributes_as_maps(html) when is_binary(html) do 128 | Html5ever.Native.flat_parse(html, true) 129 | end 130 | end 131 | -------------------------------------------------------------------------------- /lib/html5ever/native.ex: -------------------------------------------------------------------------------- 1 | defmodule Html5ever.Native do 2 | @moduledoc false 3 | require Logger 4 | 5 | mix_config = Mix.Project.config() 6 | version = mix_config[:version] 7 | github_url = mix_config[:package][:links]["GitHub"] 8 | 9 | env_config = Application.compile_env(:html5ever, Html5ever, []) 10 | 11 | # This module will be replaced by the NIF module after 12 | # it is loaded. It throws an error in case the NIF can't be loaded.
13 | use RustlerPrecompiled, 14 | otp_app: :html5ever, 15 | crate: "html5ever_nif", 16 | mode: :release, 17 | base_url: "#{github_url}/releases/download/v#{version}", 18 | force_build: 19 | System.get_env("HTML5EVER_BUILD") in ["1", "true"] or env_config[:build_from_source], 20 | version: version 21 | 22 | def parse(_binary, _attrs_as_maps), do: err() 23 | def flat_parse(_binary, _attrs_as_maps), do: err() 24 | 25 | defp err, do: :erlang.nif_error(:nif_not_loaded) 26 | end 27 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Html5ever.Mixfile do 2 | use Mix.Project 3 | 4 | @version "0.16.1-dev" 5 | @repo_url "https://github.com/rusterlium/html5ever_elixir" 6 | 7 | def project do 8 | [ 9 | app: :html5ever, 10 | version: @version, 11 | elixir: "~> 1.13", 12 | build_embedded: Mix.env() == :prod, 13 | start_permanent: Mix.env() == :prod, 14 | deps: deps(), 15 | docs: docs(), 16 | description: "NIF binding of html5ever using Rustler", 17 | package: package() 18 | ] 19 | end 20 | 21 | def application do 22 | [extra_applications: [:logger, :inets, :public_key]] 23 | end 24 | 25 | defp deps do 26 | [ 27 | {:rustler_precompiled, "~> 0.8.0"}, 28 | {:rustler, "~> 0.36.0", optional: true}, 29 | {:ex_doc, ">= 0.0.0", only: :dev} 30 | ] 31 | end 32 | 33 | defp docs do 34 | [ 35 | main: "Html5ever", 36 | extras: ["CHANGELOG.md"], 37 | skip_undefined_reference_warnings_on: ["CHANGELOG.md"], 38 | source_ref: "v#{@version}", 39 | source_url: @repo_url 40 | ] 41 | end 42 | 43 | defp package do 44 | [ 45 | files: [ 46 | "lib", 47 | "native", 48 | "checksum-*.exs", 49 | "mix.exs", 50 | "README.md", 51 | "CHANGELOG.md", 52 | "LICENSE-APACHE", 53 | "LICENSE-MIT" 54 | ], 55 | maintainers: ["hansihe", "philip"], 56 | licenses: ["MIT", "Apache-2.0"], 57 | links: %{"GitHub" => @repo_url} 58 | ] 59 | end 60 | end 61 | 
-------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "castore": {:hex, :castore, "1.0.11", "4bbd584741601eb658007339ea730b082cc61f3554cf2e8f39bf693a11b49073", [:mix], [], "hexpm", "e03990b4db988df56262852f20de0f659871c35154691427a5047f4967a16a62"}, 3 | "earmark_parser": {:hex, :earmark_parser, "1.4.43", "34b2f401fe473080e39ff2b90feb8ddfeef7639f8ee0bbf71bb41911831d77c5", [:mix], [], "hexpm", "970a3cd19503f5e8e527a190662be2cee5d98eed1ff72ed9b3d1a3d466692de8"}, 4 | "ex_doc": {:hex, :ex_doc, "0.37.1", "65ca30d242082b95aa852b3b73c9d9914279fff56db5dc7b3859be5504417980", [:mix], [{:earmark_parser, "~> 1.4.42", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "6774f75477733ea88ce861476db031f9399c110640752ca2b400dbbb50491224"}, 5 | "finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"}, 6 | "hpax": {:hex, :hpax, "1.0.2", "762df951b0c399ff67cc57c3995ec3cf46d696e41f0bba17da0518d94acd4aac", [:mix], [], "hexpm", 
"2f09b4c1074e0abd846747329eaa26d535be0eb3d189fa69d812bfb8bfefd32f"}, 7 | "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, 8 | "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, 9 | "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, 10 | "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"}, 11 | "mime": {:hex, :mime, "2.0.6", "8f18486773d9b15f95f4f4f1e39b710045fa1de891fada4516559967276e4dc2", [:mix], [], "hexpm", "c9945363a6b26d747389aac3643f8e0e09d30499a138ad64fe8fd1d13d9b153e"}, 12 | "mint": {:hex, :mint, "1.6.2", "af6d97a4051eee4f05b5500671d47c3a67dac7386045d87a904126fd4bbcea2e", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "5ee441dffc1892f1ae59127f74afe8fd82fda6587794278d924e4d90ea3d63f9"}, 13 | "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", 
"821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, 14 | "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, 15 | "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, 16 | "req": {:hex, :req, "0.5.8", "50d8d65279d6e343a5e46980ac2a70e97136182950833a1968b371e753f6a662", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "d7fc5898a566477e174f26887821a3c5082b243885520ee4b45555f5d53f40ef"}, 17 | "rustler": {:hex, :rustler, "0.36.0", "1decf059c60ec75911241325517c391717a9ad07d43e9a5ffda9d5c9ddd12936", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:req, "~> 0.5", [hex: :req, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "03808c7d289da01da29d8d2fe19d07cae9f3d2f05ebaed87f0820a4dcfabe9d5"}, 18 | "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.2", "5f25cbe220a8fac3e7ad62e6f950fcdca5a5a5f8501835d2823e8c74bf4268d5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "63d1bd5f8e23096d1ff851839923162096364bac8656a4a3c00d1fff8e83ee0a"}, 19 | "telemetry": {:hex, :telemetry, "1.3.0", 
"fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"}, 20 | "toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"}, 21 | } 22 | -------------------------------------------------------------------------------- /native/html5ever_nif/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [profile.release] 2 | lto = true 3 | 4 | [target.arm-unknown-linux-gnueabihf] 5 | linker = "arm-linux-gnueabihf-gcc" 6 | 7 | # See https://github.com/rust-lang/rust/issues/59302 8 | [target.x86_64-unknown-linux-musl] 9 | rustflags = [ 10 | "-C", "target-feature=-crt-static" 11 | ] 12 | 13 | # Same as above 14 | [target.aarch64-unknown-linux-musl] 15 | rustflags = [ 16 | "-C", "target-feature=-crt-static" 17 | ] 18 | -------------------------------------------------------------------------------- /native/html5ever_nif/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | -------------------------------------------------------------------------------- /native/html5ever_nif/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "autocfg" 7 | version = "1.4.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 10 | 11 | [[package]] 12 | name = "bitflags" 13 | version = "2.8.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" 16 | 17 | [[package]] 18 | name = "byteorder" 19 | version = "1.5.0" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 22 | 23 | [[package]] 24 | name = "cfg-if" 25 | version = "1.0.0" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 28 | 29 | [[package]] 30 | name = "futf" 31 | version = "0.1.5" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" 34 | dependencies = [ 35 | "mac", 36 | "new_debug_unreachable", 37 | ] 38 | 39 | [[package]] 40 | name = "getrandom" 41 | version = "0.2.15" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 44 | dependencies = [ 45 | "cfg-if", 46 | "libc", 47 | "wasi", 48 | ] 49 | 50 | [[package]] 51 | name = "heck" 52 | version = "0.5.0" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 55 | 56 | [[package]] 57 | name = "html5ever" 58 | version = "0.27.0" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" 61 | dependencies = [ 62 | "log", 63 | "mac", 64 | "markup5ever", 65 | "proc-macro2", 
66 | "quote", 67 | "syn", 68 | ] 69 | 70 | [[package]] 71 | name = "html5ever_nif" 72 | version = "0.1.0" 73 | dependencies = [ 74 | "html5ever", 75 | "lazy_static", 76 | "markup5ever", 77 | "rustler", 78 | "tendril", 79 | "thiserror", 80 | ] 81 | 82 | [[package]] 83 | name = "inventory" 84 | version = "0.3.17" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "3b31349d02fe60f80bbbab1a9402364cad7460626d6030494b08ac4a2075bf81" 87 | dependencies = [ 88 | "rustversion", 89 | ] 90 | 91 | [[package]] 92 | name = "lazy_static" 93 | version = "1.5.0" 94 | source = "registry+https://github.com/rust-lang/crates.io-index" 95 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 96 | 97 | [[package]] 98 | name = "libc" 99 | version = "0.2.169" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" 102 | 103 | [[package]] 104 | name = "libloading" 105 | version = "0.8.6" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" 108 | dependencies = [ 109 | "cfg-if", 110 | "windows-targets", 111 | ] 112 | 113 | [[package]] 114 | name = "lock_api" 115 | version = "0.4.12" 116 | source = "registry+https://github.com/rust-lang/crates.io-index" 117 | checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" 118 | dependencies = [ 119 | "autocfg", 120 | "scopeguard", 121 | ] 122 | 123 | [[package]] 124 | name = "log" 125 | version = "0.4.25" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" 128 | 129 | [[package]] 130 | name = "mac" 131 | version = "0.1.1" 132 | source = "registry+https://github.com/rust-lang/crates.io-index" 133 | checksum = 
"c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 134 | 135 | [[package]] 136 | name = "markup5ever" 137 | version = "0.12.1" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" 140 | dependencies = [ 141 | "log", 142 | "phf", 143 | "phf_codegen", 144 | "string_cache", 145 | "string_cache_codegen", 146 | "tendril", 147 | ] 148 | 149 | [[package]] 150 | name = "new_debug_unreachable" 151 | version = "1.0.6" 152 | source = "registry+https://github.com/rust-lang/crates.io-index" 153 | checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" 154 | 155 | [[package]] 156 | name = "once_cell" 157 | version = "1.20.2" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" 160 | 161 | [[package]] 162 | name = "parking_lot" 163 | version = "0.12.3" 164 | source = "registry+https://github.com/rust-lang/crates.io-index" 165 | checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" 166 | dependencies = [ 167 | "lock_api", 168 | "parking_lot_core", 169 | ] 170 | 171 | [[package]] 172 | name = "parking_lot_core" 173 | version = "0.9.10" 174 | source = "registry+https://github.com/rust-lang/crates.io-index" 175 | checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" 176 | dependencies = [ 177 | "cfg-if", 178 | "libc", 179 | "redox_syscall", 180 | "smallvec", 181 | "windows-targets", 182 | ] 183 | 184 | [[package]] 185 | name = "phf" 186 | version = "0.11.3" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" 189 | dependencies = [ 190 | "phf_shared 0.11.3", 191 | ] 192 | 193 | [[package]] 194 | name = "phf_codegen" 195 | version = "0.11.3" 196 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" 198 | dependencies = [ 199 | "phf_generator 0.11.3", 200 | "phf_shared 0.11.3", 201 | ] 202 | 203 | [[package]] 204 | name = "phf_generator" 205 | version = "0.10.0" 206 | source = "registry+https://github.com/rust-lang/crates.io-index" 207 | checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" 208 | dependencies = [ 209 | "phf_shared 0.10.0", 210 | "rand", 211 | ] 212 | 213 | [[package]] 214 | name = "phf_generator" 215 | version = "0.11.3" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 218 | dependencies = [ 219 | "phf_shared 0.11.3", 220 | "rand", 221 | ] 222 | 223 | [[package]] 224 | name = "phf_shared" 225 | version = "0.10.0" 226 | source = "registry+https://github.com/rust-lang/crates.io-index" 227 | checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" 228 | dependencies = [ 229 | "siphasher 0.3.11", 230 | ] 231 | 232 | [[package]] 233 | name = "phf_shared" 234 | version = "0.11.3" 235 | source = "registry+https://github.com/rust-lang/crates.io-index" 236 | checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" 237 | dependencies = [ 238 | "siphasher 1.0.1", 239 | ] 240 | 241 | [[package]] 242 | name = "ppv-lite86" 243 | version = "0.2.20" 244 | source = "registry+https://github.com/rust-lang/crates.io-index" 245 | checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" 246 | dependencies = [ 247 | "zerocopy", 248 | ] 249 | 250 | [[package]] 251 | name = "precomputed-hash" 252 | version = "0.1.1" 253 | source = "registry+https://github.com/rust-lang/crates.io-index" 254 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 255 | 256 | [[package]] 257 | name = "proc-macro2" 258 | version = 
"1.0.93" 259 | source = "registry+https://github.com/rust-lang/crates.io-index" 260 | checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" 261 | dependencies = [ 262 | "unicode-ident", 263 | ] 264 | 265 | [[package]] 266 | name = "quote" 267 | version = "1.0.38" 268 | source = "registry+https://github.com/rust-lang/crates.io-index" 269 | checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" 270 | dependencies = [ 271 | "proc-macro2", 272 | ] 273 | 274 | [[package]] 275 | name = "rand" 276 | version = "0.8.5" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 279 | dependencies = [ 280 | "libc", 281 | "rand_chacha", 282 | "rand_core", 283 | ] 284 | 285 | [[package]] 286 | name = "rand_chacha" 287 | version = "0.3.1" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 290 | dependencies = [ 291 | "ppv-lite86", 292 | "rand_core", 293 | ] 294 | 295 | [[package]] 296 | name = "rand_core" 297 | version = "0.6.4" 298 | source = "registry+https://github.com/rust-lang/crates.io-index" 299 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 300 | dependencies = [ 301 | "getrandom", 302 | ] 303 | 304 | [[package]] 305 | name = "redox_syscall" 306 | version = "0.5.8" 307 | source = "registry+https://github.com/rust-lang/crates.io-index" 308 | checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" 309 | dependencies = [ 310 | "bitflags", 311 | ] 312 | 313 | [[package]] 314 | name = "regex-lite" 315 | version = "0.1.6" 316 | source = "registry+https://github.com/rust-lang/crates.io-index" 317 | checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" 318 | 319 | [[package]] 320 | name = "rustler" 321 | version = "0.36.0" 322 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 323 | checksum = "1f7b219d7473cf473409665a4898d66688b34736e51bb5791098b0d3390e4c98" 324 | dependencies = [ 325 | "inventory", 326 | "libloading", 327 | "regex-lite", 328 | "rustler_codegen", 329 | ] 330 | 331 | [[package]] 332 | name = "rustler_codegen" 333 | version = "0.36.0" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "743ec5267bd5f18fd88d89f7e729c0f43b97d9c2539959915fa1f234300bb621" 336 | dependencies = [ 337 | "heck", 338 | "inventory", 339 | "proc-macro2", 340 | "quote", 341 | "syn", 342 | ] 343 | 344 | [[package]] 345 | name = "rustversion" 346 | version = "1.0.19" 347 | source = "registry+https://github.com/rust-lang/crates.io-index" 348 | checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" 349 | 350 | [[package]] 351 | name = "scopeguard" 352 | version = "1.2.0" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 355 | 356 | [[package]] 357 | name = "serde" 358 | version = "1.0.217" 359 | source = "registry+https://github.com/rust-lang/crates.io-index" 360 | checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" 361 | dependencies = [ 362 | "serde_derive", 363 | ] 364 | 365 | [[package]] 366 | name = "serde_derive" 367 | version = "1.0.217" 368 | source = "registry+https://github.com/rust-lang/crates.io-index" 369 | checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" 370 | dependencies = [ 371 | "proc-macro2", 372 | "quote", 373 | "syn", 374 | ] 375 | 376 | [[package]] 377 | name = "siphasher" 378 | version = "0.3.11" 379 | source = "registry+https://github.com/rust-lang/crates.io-index" 380 | checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" 381 | 382 | [[package]] 383 | name = "siphasher" 384 | version = "1.0.1" 385 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 386 | checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" 387 | 388 | [[package]] 389 | name = "smallvec" 390 | version = "1.13.2" 391 | source = "registry+https://github.com/rust-lang/crates.io-index" 392 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" 393 | 394 | [[package]] 395 | name = "string_cache" 396 | version = "0.8.7" 397 | source = "registry+https://github.com/rust-lang/crates.io-index" 398 | checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" 399 | dependencies = [ 400 | "new_debug_unreachable", 401 | "once_cell", 402 | "parking_lot", 403 | "phf_shared 0.10.0", 404 | "precomputed-hash", 405 | "serde", 406 | ] 407 | 408 | [[package]] 409 | name = "string_cache_codegen" 410 | version = "0.5.2" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" 413 | dependencies = [ 414 | "phf_generator 0.10.0", 415 | "phf_shared 0.10.0", 416 | "proc-macro2", 417 | "quote", 418 | ] 419 | 420 | [[package]] 421 | name = "syn" 422 | version = "2.0.96" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" 425 | dependencies = [ 426 | "proc-macro2", 427 | "quote", 428 | "unicode-ident", 429 | ] 430 | 431 | [[package]] 432 | name = "tendril" 433 | version = "0.4.3" 434 | source = "registry+https://github.com/rust-lang/crates.io-index" 435 | checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" 436 | dependencies = [ 437 | "futf", 438 | "mac", 439 | "utf-8", 440 | ] 441 | 442 | [[package]] 443 | name = "thiserror" 444 | version = "2.0.11" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" 447 | dependencies = [ 448 
| "thiserror-impl", 449 | ] 450 | 451 | [[package]] 452 | name = "thiserror-impl" 453 | version = "2.0.11" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" 456 | dependencies = [ 457 | "proc-macro2", 458 | "quote", 459 | "syn", 460 | ] 461 | 462 | [[package]] 463 | name = "unicode-ident" 464 | version = "1.0.15" 465 | source = "registry+https://github.com/rust-lang/crates.io-index" 466 | checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243" 467 | 468 | [[package]] 469 | name = "utf-8" 470 | version = "0.7.6" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 473 | 474 | [[package]] 475 | name = "wasi" 476 | version = "0.11.0+wasi-snapshot-preview1" 477 | source = "registry+https://github.com/rust-lang/crates.io-index" 478 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 479 | 480 | [[package]] 481 | name = "windows-targets" 482 | version = "0.52.6" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 485 | dependencies = [ 486 | "windows_aarch64_gnullvm", 487 | "windows_aarch64_msvc", 488 | "windows_i686_gnu", 489 | "windows_i686_gnullvm", 490 | "windows_i686_msvc", 491 | "windows_x86_64_gnu", 492 | "windows_x86_64_gnullvm", 493 | "windows_x86_64_msvc", 494 | ] 495 | 496 | [[package]] 497 | name = "windows_aarch64_gnullvm" 498 | version = "0.52.6" 499 | source = "registry+https://github.com/rust-lang/crates.io-index" 500 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 501 | 502 | [[package]] 503 | name = "windows_aarch64_msvc" 504 | version = "0.52.6" 505 | source = "registry+https://github.com/rust-lang/crates.io-index" 506 | checksum = 
"09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 507 | 508 | [[package]] 509 | name = "windows_i686_gnu" 510 | version = "0.52.6" 511 | source = "registry+https://github.com/rust-lang/crates.io-index" 512 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 513 | 514 | [[package]] 515 | name = "windows_i686_gnullvm" 516 | version = "0.52.6" 517 | source = "registry+https://github.com/rust-lang/crates.io-index" 518 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 519 | 520 | [[package]] 521 | name = "windows_i686_msvc" 522 | version = "0.52.6" 523 | source = "registry+https://github.com/rust-lang/crates.io-index" 524 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 525 | 526 | [[package]] 527 | name = "windows_x86_64_gnu" 528 | version = "0.52.6" 529 | source = "registry+https://github.com/rust-lang/crates.io-index" 530 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 531 | 532 | [[package]] 533 | name = "windows_x86_64_gnullvm" 534 | version = "0.52.6" 535 | source = "registry+https://github.com/rust-lang/crates.io-index" 536 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 537 | 538 | [[package]] 539 | name = "windows_x86_64_msvc" 540 | version = "0.52.6" 541 | source = "registry+https://github.com/rust-lang/crates.io-index" 542 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 543 | 544 | [[package]] 545 | name = "zerocopy" 546 | version = "0.7.35" 547 | source = "registry+https://github.com/rust-lang/crates.io-index" 548 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 549 | dependencies = [ 550 | "byteorder", 551 | "zerocopy-derive", 552 | ] 553 | 554 | [[package]] 555 | name = "zerocopy-derive" 556 | version = "0.7.35" 557 | source = "registry+https://github.com/rust-lang/crates.io-index" 558 | checksum = 
"fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 559 | dependencies = [ 560 | "proc-macro2", 561 | "quote", 562 | "syn", 563 | ] 564 | -------------------------------------------------------------------------------- /native/html5ever_nif/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "html5ever_nif" 3 | version = "0.1.0" 4 | authors = ["HansiHE "] 5 | edition = "2021" 6 | 7 | [lib] 8 | name = "html5ever_nif" 9 | path = "src/lib.rs" 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | # See the Precompilation guide for details about the features: https://github.com/philss/rustler_precompiled/blob/main/PRECOMPILATION_GUIDE.md 14 | rustler = { version = "0.36", default-features = false, features = ["nif_version_2_15"] } 15 | 16 | html5ever = "0.27" 17 | markup5ever = "0.12" 18 | 19 | tendril = "0.4" 20 | lazy_static = "1.5" 21 | 22 | thiserror = "2" 23 | -------------------------------------------------------------------------------- /native/html5ever_nif/src/common.rs: -------------------------------------------------------------------------------- 1 | use rustler::{Encoder, Env, Term}; 2 | 3 | use html5ever::QualName; 4 | use tendril::StrTendril; 5 | 6 | // Zero-cost wrapper types which makes it possible to implement 7 | // Encoder for these externally defined types. 8 | // Unsure if this is a great way of doing it, but it's the way 9 | // that produced the cleanest and least noisy code. 
10 | pub struct QualNameWrapper<'a>(pub &'a QualName); 11 | pub struct StrTendrilWrapper<'a>(pub &'a StrTendril); 12 | 13 | impl Encoder for QualNameWrapper<'_> { 14 | fn encode<'a>(&self, env: Env<'a>) -> Term<'a> { 15 | let data: &str = &self.0.local; 16 | data.encode(env) 17 | } 18 | } 19 | impl Encoder for StrTendrilWrapper<'_> { 20 | fn encode<'a>(&self, env: Env<'a>) -> Term<'a> { 21 | let data: &str = self.0; 22 | data.encode(env) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /native/html5ever_nif/src/flat_dom.rs: -------------------------------------------------------------------------------- 1 | use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; 2 | use html5ever::{Attribute, QualName}; 3 | use markup5ever::ExpandedName; 4 | 5 | use tendril::StrTendril; 6 | 7 | use std::borrow::Cow; 8 | 9 | use rustler::{Encoder, Env, Term}; 10 | 11 | use crate::common::{QualNameWrapper, StrTendrilWrapper}; 12 | use crate::Html5everExError; 13 | 14 | #[derive(Copy, Clone, PartialEq, Debug)] 15 | pub struct NodeHandle(pub usize); 16 | 17 | pub enum PoolOrVec { 18 | Pool { head: usize, len: usize }, 19 | Vec { vec: Vec }, 20 | } 21 | 22 | impl PoolOrVec 23 | where 24 | T: Clone, 25 | { 26 | pub fn new(pool: &[T]) -> Self { 27 | PoolOrVec::Pool { 28 | head: pool.len(), 29 | len: 0, 30 | } 31 | } 32 | 33 | pub fn get<'a>(&'a self, idx: usize, pool: &'a [T]) -> Option<&'a T> { 34 | match self { 35 | PoolOrVec::Pool { head, len } if idx < *len => Some(&pool[*head + idx]), 36 | PoolOrVec::Vec { vec } => vec.get(idx), 37 | _ => None, 38 | } 39 | } 40 | 41 | pub fn as_slice<'a>(&'a self, pool: &'a [T]) -> &'a [T] { 42 | match self { 43 | PoolOrVec::Pool { head, len } => &pool[*head..(*head + *len)], 44 | PoolOrVec::Vec { vec } => vec, 45 | } 46 | } 47 | 48 | pub fn push(&mut self, item: T, pool: &mut Vec) { 49 | match self { 50 | PoolOrVec::Pool { head, len } if pool.len() == *head + *len => { 51 | 
pool.push(item); 52 | *len += 1; 53 | } 54 | val @ PoolOrVec::Pool { .. } => { 55 | if let PoolOrVec::Pool { head, len } = val { 56 | let mut vec = pool[*head..(*head + *len)].to_owned(); 57 | vec.push(item); 58 | *val = PoolOrVec::Vec { vec }; 59 | } else { 60 | unreachable!() 61 | } 62 | } 63 | PoolOrVec::Vec { vec } => { 64 | vec.push(item); 65 | } 66 | } 67 | } 68 | 69 | pub fn iter<'a>(&'a self, pool: &'a [T]) -> impl Iterator<Item = &'a T> + 'a { 70 | self.as_slice(pool).iter() 71 | } 72 | 73 | pub fn insert(&mut self, index: usize, item: T, pool: &mut Vec<T>) { 74 | match self { 75 | PoolOrVec::Pool { head, len } if pool.len() == *head + *len => { 76 | pool.insert(*head + index, item); 77 | *len += 1; 78 | } 79 | val @ PoolOrVec::Pool { .. } => { 80 | *val = PoolOrVec::Vec { 81 | vec: { 82 | let mut vec = val.as_slice(pool).to_owned(); 83 | vec.insert(index, item); 84 | vec 85 | }, 86 | }; 87 | } 88 | PoolOrVec::Vec { vec } => { 89 | vec.insert(index, item); 90 | } 91 | } 92 | } 93 | 94 | pub fn remove(&mut self, index: usize, pool: &mut [T]) { 95 | match self { 96 | val @ PoolOrVec::Pool { ..
} => { 97 | *val = PoolOrVec::Vec { 98 | vec: { 99 | let mut vec = val.as_slice(pool).to_owned(); 100 | vec.remove(index); 101 | vec 102 | }, 103 | }; 104 | } 105 | PoolOrVec::Vec { vec } => { 106 | vec.remove(index); 107 | } 108 | } 109 | } 110 | } 111 | 112 | pub struct Node { 113 | id: NodeHandle, 114 | children: PoolOrVec<NodeHandle>, 115 | parent: Option<NodeHandle>, 116 | data: NodeData, 117 | } 118 | impl Node { 119 | fn new(id: usize, data: NodeData, pool: &[NodeHandle]) -> Self { 120 | Node { 121 | id: NodeHandle(id), 122 | parent: None, 123 | children: PoolOrVec::new(pool), 124 | data, 125 | } 126 | } 127 | } 128 | 129 | #[derive(Debug, PartialEq)] 130 | pub enum NodeData { 131 | Document, 132 | DocType { 133 | name: StrTendril, 134 | public_id: StrTendril, 135 | system_id: StrTendril, 136 | }, 137 | Text { 138 | contents: StrTendril, 139 | }, 140 | Comment { 141 | contents: StrTendril, 142 | }, 143 | Element { 144 | name: QualName, 145 | attrs: Vec<Attribute>, 146 | template_contents: Option<NodeHandle>, 147 | mathml_annotation_xml_integration_point: bool, 148 | }, 149 | ProcessingInstruction { 150 | target: StrTendril, 151 | contents: StrTendril, 152 | }, 153 | } 154 | 155 | pub struct FlatSink { 156 | pub root: NodeHandle, 157 | pub nodes: Vec<Node>, 158 | pub pool: Vec<NodeHandle>, 159 | } 160 | 161 | impl FlatSink { 162 | pub fn new() -> FlatSink { 163 | let mut sink = FlatSink { 164 | root: NodeHandle(0), 165 | nodes: Vec::with_capacity(200), 166 | pool: Vec::with_capacity(2000), 167 | }; 168 | 169 | // Element 0 is always root 170 | sink.nodes 171 | .push(Node::new(0, NodeData::Document, &sink.pool)); 172 | 173 | sink 174 | } 175 | 176 | pub fn root(&self) -> NodeHandle { 177 | self.root 178 | } 179 | 180 | pub fn node_mut(&mut self, handle: NodeHandle) -> &mut Node { 181 | &mut self.nodes[handle.0] 182 | } 183 | pub fn node(&self, handle: NodeHandle) -> &Node { 184 | &self.nodes[handle.0] 185 | } 186 | 187 | pub fn make_node(&mut self, data: NodeData) -> NodeHandle { 188 | let node = Node::new(self.nodes.len(),
data, &self.pool); 189 | let id = node.id; 190 | self.nodes.push(node); 191 | id 192 | } 193 | } 194 | 195 | fn node_or_text_to_node(sink: &mut FlatSink, not: NodeOrText<NodeHandle>) -> NodeHandle { 196 | match not { 197 | NodeOrText::AppendNode(handle) => handle, 198 | NodeOrText::AppendText(text) => sink.make_node(NodeData::Text { contents: text }), 199 | } 200 | } 201 | 202 | impl TreeSink for FlatSink { 203 | type Output = Self; 204 | type Handle = NodeHandle; 205 | 206 | fn finish(self) -> Self::Output { 207 | self 208 | } 209 | 210 | // TODO: Log this or something 211 | fn parse_error(&mut self, _msg: Cow<'static, str>) {} 212 | fn set_quirks_mode(&mut self, _mode: QuirksMode) {} 213 | 214 | fn get_document(&mut self) -> Self::Handle { 215 | NodeHandle(0) 216 | } 217 | fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle { 218 | // Inspired by https://github.com/servo/html5ever/blob/1a62a39879a1def200dcb87b900265993e6c1c83/rcdom/lib.rs#L235 219 | // The template's contents are not emitted in the output yet; only the empty tag is printed. 220 | // TODO: print the contents as text. 221 | let node = self.node(*target); 222 | if let NodeData::Element { 223 | ref template_contents, 224 | .. 225 | } = node.data 226 | { 227 | *template_contents.as_ref().expect("not a template element!") 228 | } else { 229 | panic!("not a template element!") 230 | } 231 | } 232 | 233 | fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool { 234 | x == y 235 | } 236 | fn elem_name(&self, target: &Self::Handle) -> ExpandedName { 237 | let node = self.node(*target); 238 | match node.data { 239 | NodeData::Element { ref name, ..
} => name.expanded(), 240 | _ => unreachable!(), 241 | } 242 | } 243 | 244 | fn create_element( 245 | &mut self, 246 | name: QualName, 247 | attrs: Vec<Attribute>, 248 | flags: ElementFlags, 249 | ) -> Self::Handle { 250 | let template_contents = if flags.template { 251 | Some(self.make_node(NodeData::Document)) 252 | } else { 253 | None 254 | }; 255 | 256 | self.make_node(NodeData::Element { 257 | name, 258 | attrs, 259 | mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, 260 | template_contents, 261 | }) 262 | } 263 | 264 | fn create_comment(&mut self, text: StrTendril) -> Self::Handle { 265 | self.make_node(NodeData::Comment { contents: text }) 266 | } 267 | 268 | fn append(&mut self, parent_id: &Self::Handle, child: NodeOrText<Self::Handle>) { 269 | let handle = node_or_text_to_node(self, child); 270 | 271 | self.nodes[parent_id.0] 272 | .children 273 | .push(handle, &mut self.pool); 274 | self.node_mut(handle).parent = Some(*parent_id); 275 | } 276 | 277 | fn append_based_on_parent_node( 278 | &mut self, 279 | element: &Self::Handle, 280 | prev_element: &Self::Handle, 281 | child: NodeOrText<Self::Handle>, 282 | ) { 283 | let has_parent = self.node(*element).parent.is_some(); 284 | if has_parent { 285 | self.append_before_sibling(element, child); 286 | } else { 287 | self.append(prev_element, child); 288 | } 289 | } 290 | 291 | fn append_before_sibling( 292 | &mut self, 293 | sibling: &Self::Handle, 294 | new_node: NodeOrText<Self::Handle>, 295 | ) { 296 | let new_node_handle = node_or_text_to_node(self, new_node); 297 | 298 | let parent = self.node(*sibling).parent.unwrap(); 299 | let parent_node = &mut self.nodes[parent.0]; 300 | let sibling_index = parent_node 301 | .children 302 | .iter(&self.pool) 303 | .position(|node| node == sibling) 304 | .unwrap(); 305 | parent_node 306 | .children 307 | .insert(sibling_index, new_node_handle, &mut self.pool); 308 | // Record the inserted node's parent, exactly as `append` does for its child. 309 | self.node_mut(new_node_handle).parent = Some(parent); 310 | } 311 | 312 | fn append_doctype_to_document( 313 | &mut self, 314 | name: StrTendril,
315 | public_id: StrTendril, 316 | system_id: StrTendril, 317 | ) { 318 | let doctype = self.make_node(NodeData::DocType { 319 | name, 320 | public_id, 321 | system_id, 322 | }); 323 | let root = self.root; 324 | self.nodes[root.0].children.push(doctype, &mut self.pool); 325 | self.node_mut(doctype).parent = Some(self.root); 326 | } 327 | 328 | fn add_attrs_if_missing( 329 | &mut self, 330 | target_handle: &Self::Handle, 331 | mut add_attrs: Vec<Attribute>, 332 | ) { 333 | let target = self.node_mut(*target_handle); 334 | match target.data { 335 | NodeData::Element { ref mut attrs, .. } => { 336 | for attr in add_attrs.drain(..) { 337 | if !attrs.iter().any(|a| attr.name == a.name) { 338 | attrs.push(attr); 339 | } 340 | } 341 | } 342 | _ => unreachable!(), 343 | } 344 | } 345 | 346 | fn remove_from_parent(&mut self, target: &Self::Handle) { 347 | // Clear the parent link while detaching so later `parent.is_some()` checks stay truthful. 348 | // A node that is already detached is left alone instead of panicking. 349 | let parent = match self.node_mut(*target).parent.take() { 350 | Some(parent) => parent, 351 | None => return, 352 | }; 353 | let siblings = &mut self.nodes[parent.0].children; 354 | let index = siblings.iter(&self.pool).position(|node| node == target); 355 | let index = index.expect("node is missing from its parent's child list"); 356 | siblings.remove(index, &mut self.pool); 357 | } 358 | 359 | fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) { 360 | let old_children = self.node(*node).children.as_slice(&self.pool).to_owned(); 361 | // The children are moved, not copied: empty the source node's child list first. 362 | self.nodes[node.0].children = PoolOrVec::new(&self.pool); 363 | for child in old_children { 364 | self.node_mut(child).parent = Some(*new_parent); 365 | let target = &mut self.nodes[new_parent.0]; 366 | target.children.push(child, &mut self.pool); 367 | } 368 | } 369 | 370 | fn mark_script_already_started(&mut self, _elem: &Self::Handle) { 371 | panic!("unsupported"); 372 | } 373 | 374 | //fn has_parent_node(&self, handle: &Self::Handle) -> bool { 375 | // self.node(*handle).parent.is_some() 376 | //} 377 | 378 | fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle { 379 |
self.make_node(NodeData::ProcessingInstruction { 380 | target, 381 | contents: data, 382 | }) 383 | } 384 | } 385 | 386 | impl Encoder for NodeHandle { 387 | fn encode<'a>(&self, env: Env<'a>) -> Term<'a> { 388 | self.0.encode(env) 389 | } 390 | } 391 | 392 | fn to_custom_error(_err: rustler::error::Error) -> Html5everExError { 393 | Html5everExError::MapEntry 394 | } 395 | 396 | fn encode_node<'a>( 397 | node: &Node, 398 | env: Env<'a>, 399 | pool: &[NodeHandle], 400 | attributes_as_maps: bool, 401 | ) -> Result<Term<'a>, Html5everExError> { 402 | let pairs: Vec<(Term, Term)> = vec![ 403 | (atoms::id().encode(env), node.id.encode(env)), 404 | ( 405 | atoms::parent().encode(env), 406 | match node.parent { 407 | Some(handle) => handle.encode(env), 408 | None => atoms::nil().encode(env), 409 | }, 410 | ), 411 | ]; 412 | 413 | let mut map = Term::map_from_pairs(env, &pairs).map_err(to_custom_error)?; 414 | 415 | match node.data { 416 | NodeData::Document => map 417 | .map_put(atoms::type_().encode(env), atoms::document().encode(env)) 418 | .map_err(to_custom_error), 419 | NodeData::Element { 420 | ref attrs, 421 | ref name, 422 | .. 423 | } => { 424 | let pairs: Vec<(Term, Term)> = vec![ 425 | (atoms::type_().encode(env), atoms::element().encode(env)), 426 | ( 427 | atoms::children().encode(env), 428 | node.children.as_slice(pool).encode(env), 429 | ), 430 | (atoms::name().encode(env), QualNameWrapper(name).encode(env)), 431 | ( 432 | atoms::attrs().encode(env), 433 | attributes_to_term(env, attrs, attributes_as_maps), 434 | ), 435 | ]; 436 | 437 | for (key, value) in pairs { 438 | map = map.map_put(key, value).map_err(to_custom_error)?; 439 | } 440 | 441 | Ok(map) 442 | } 443 | NodeData::Text { ref contents } => map 444 | .map_put(atoms::type_().encode(env), atoms::text().encode(env)) 445 | .map_err(to_custom_error)?
446 | .map_put( 447 | atoms::contents().encode(env), 448 | StrTendrilWrapper(contents).encode(env), 449 | ) 450 | .map_err(to_custom_error), 451 | NodeData::DocType { .. } => map 452 | .map_put(atoms::type_().encode(env), atoms::doctype().encode(env)) 453 | .map_err(to_custom_error), 454 | NodeData::Comment { ref contents } => map 455 | .map_put(atoms::type_().encode(env), atoms::comment().encode(env)) 456 | .map_err(to_custom_error)? 457 | .map_put( 458 | atoms::contents().encode(env), 459 | StrTendrilWrapper(contents).encode(env), 460 | ) 461 | .map_err(to_custom_error), 462 | _ => unimplemented!(), 463 | } 464 | } 465 | 466 | mod atoms { 467 | rustler::atoms! { 468 | nil, 469 | 470 | type_ = "type", 471 | document, 472 | element, 473 | text, 474 | doctype, 475 | comment, 476 | 477 | name, 478 | nodes, 479 | root, 480 | id, 481 | parent, 482 | children, 483 | contents, 484 | attrs, 485 | } 486 | } 487 | 488 | pub fn flat_sink_to_flat_term<'a>( 489 | env: Env<'a>, 490 | sink: &FlatSink, 491 | attributes_as_maps: bool, 492 | ) -> Result<Term<'a>, Html5everExError> { 493 | let mut nodes_map = rustler::types::map::map_new(env); 494 | 495 | for node in sink.nodes.iter() { 496 | nodes_map = nodes_map 497 | .map_put( 498 | node.id.encode(env), 499 | encode_node(node, env, &sink.pool, attributes_as_maps)?, 500 | ) 501 | .map_err(to_custom_error)?; 502 | } 503 | 504 | ::rustler::types::map::map_new(env) 505 | .map_put(atoms::nodes().encode(env), nodes_map) 506 | .map_err(to_custom_error)?
507 | .map_put(atoms::root().encode(env), sink.root.encode(env)) 508 | .map_err(to_custom_error) 509 | } 510 | 511 | struct RecState { 512 | node: NodeHandle, 513 | child_n: usize, 514 | child_base: usize, 515 | } 516 | 517 | pub fn flat_sink_to_rec_term<'a>( 518 | env: Env<'a>, 519 | sink: &FlatSink, 520 | attributes_as_maps: bool, 521 | ) -> Result<Term<'a>, Html5everExError> { 522 | let mut child_stack = vec![]; 523 | 524 | let mut stack: Vec<RecState> = vec![RecState { 525 | node: sink.root(), 526 | child_base: 0, 527 | child_n: 0, 528 | }]; 529 | let mut comments_bf_doctype = 0usize; 530 | let mut read_doctype = false; 531 | 532 | loop { 533 | let mut top = stack.pop().unwrap(); 534 | let top_node = &sink.nodes[top.node.0]; 535 | 536 | if let Some(child_node) = top_node.children.get(top.child_n, &sink.pool) { 537 | // If we find another child, we recurse downwards 538 | 539 | let child = RecState { 540 | node: *child_node, 541 | child_base: child_stack.len(), 542 | child_n: 0, 543 | }; 544 | debug_assert!(sink.nodes[child_node.0].data != NodeData::Document); 545 | 546 | top.child_n += 1; 547 | stack.push(top); 548 | stack.push(child); 549 | continue; 550 | } else { 551 | // If there are no more children, we add the child to the parent 552 | // (or we return if we are the root) 553 | 554 | let term; 555 | 556 | match &top_node.data { 557 | NodeData::Document => { 558 | let term = child_stack[top.child_base..].encode(env); 559 | for _ in 0..(child_stack.len() - top.child_base) { 560 | child_stack.pop(); 561 | } 562 | 563 | assert_eq!(stack.len(), 0); 564 | return Ok(term); 565 | } 566 | NodeData::DocType { 567 | name, 568 | public_id, 569 | system_id, 570 | } => { 571 | assert!(!stack.is_empty()); 572 | assert!(child_stack.is_empty() || comments_bf_doctype == child_stack.len()); 573 | 574 | read_doctype = true; 575 | 576 | term = ( 577 | atoms::doctype(), 578 | StrTendrilWrapper(name), 579 | StrTendrilWrapper(public_id), 580 | StrTendrilWrapper(system_id), 581 | ) 582 |
.encode(env); 583 | } 584 | NodeData::Element { attrs, name, .. } => { 585 | assert!(!stack.is_empty()); 586 | 587 | let attribute_terms = attributes_to_term(env, attrs, attributes_as_maps); 588 | 589 | term = ( 590 | QualNameWrapper(name), 591 | attribute_terms, 592 | &child_stack[top.child_base..], 593 | ) 594 | .encode(env); 595 | for _ in 0..(child_stack.len() - top.child_base) { 596 | child_stack.pop(); 597 | } 598 | } 599 | NodeData::Text { contents } => { 600 | term = StrTendrilWrapper(contents).encode(env); 601 | } 602 | NodeData::Comment { contents } => { 603 | if !read_doctype { 604 | comments_bf_doctype += 1 605 | }; 606 | 607 | term = (atoms::comment(), StrTendrilWrapper(contents)).encode(env); 608 | } 609 | _ => unimplemented!(""), 610 | } 611 | 612 | child_stack.push(term); 613 | } 614 | } 615 | } 616 | 617 | fn attributes_to_term<'a>( 618 | env: Env<'a>, 619 | attributes: &[Attribute], 620 | attributes_as_maps: bool, 621 | ) -> Term<'a> { 622 | let pairs: Vec<(QualNameWrapper, StrTendrilWrapper)> = attributes 623 | .iter() 624 | .map(|a| (QualNameWrapper(&a.name), StrTendrilWrapper(&a.value))) 625 | .collect(); 626 | 627 | if attributes_as_maps { 628 | Term::map_from_pairs(env, &pairs).unwrap() 629 | } else { 630 | pairs.encode(env) 631 | } 632 | } 633 | -------------------------------------------------------------------------------- /native/html5ever_nif/src/lib.rs: -------------------------------------------------------------------------------- 1 | use flat_dom::FlatSink; 2 | use rustler::types::binary::Binary; 3 | use rustler::{Env, Term}; 4 | 5 | use tendril::TendrilSink; 6 | use thiserror::Error; 7 | 8 | mod common; 9 | mod flat_dom; 10 | 11 | #[derive(Error, Debug)] 12 | pub enum Html5everExError { 13 | #[error("cannot transform bytes from binary to a valid UTF8 string")] 14 | BytesToUtf8(#[from] std::str::Utf8Error), 15 | 16 | #[error("cannot insert entry in a map")] 17 | MapEntry, 18 | } 19 | 20 | impl rustler::Encoder for Html5everExError { 
21 | fn encode<'a>(&self, env: Env<'a>) -> Term<'a> { 22 | format!("{self}").encode(env) 23 | } 24 | } 25 | 26 | #[rustler::nif(schedule = "DirtyCpu")] 27 | fn parse<'a>( 28 | env: Env<'a>, 29 | binary: Binary, 30 | attributes_as_maps: bool, 31 | ) -> Result, Html5everExError> { 32 | let flat_sink = build_flat_sink(binary.as_slice())?; 33 | 34 | flat_dom::flat_sink_to_rec_term(env, &flat_sink, attributes_as_maps) 35 | } 36 | 37 | #[rustler::nif(schedule = "DirtyCpu")] 38 | fn flat_parse<'a>( 39 | env: Env<'a>, 40 | binary: Binary, 41 | attributes_as_maps: bool, 42 | ) -> Result, Html5everExError> { 43 | let flat_sink = build_flat_sink(binary.as_slice())?; 44 | 45 | flat_dom::flat_sink_to_flat_term(env, &flat_sink, attributes_as_maps) 46 | } 47 | 48 | fn build_flat_sink(bin_slice: &[u8]) -> Result { 49 | let utf8 = std::str::from_utf8(bin_slice)?; 50 | 51 | let sink = flat_dom::FlatSink::new(); 52 | let parser = html5ever::parse_document(sink, Default::default()); 53 | 54 | Ok(parser.one(utf8)) 55 | } 56 | 57 | rustler::init!("Elixir.Html5ever.Native"); 58 | -------------------------------------------------------------------------------- /priv/test_data/drudgereport.html: -------------------------------------------------------------------------------- 1 | DRUDGE REPORT 2017® 2 | 3 | 4 | 5 | 8 | 9 | 15 | 23 | 24 | 25 | 26 |
27 | 28 | 29 |
30 | 31 |
32 | 33 | 34 |
35 |

36 | 55 | 56 | 57 | 58 |
59 | 60 |
61 | 62 |

63 | 64 |
65 | 66 |
67 | 68 |
69 | 226 | 227 | 228 | 229 |
230 | 231 |
378 | 379 | 380 | 381 |
382 | 383 |
384 |
70 | 71 | EPA accuses FIAT CHRYSLER of emissions cheating... 72 |
73 |

74 | OBAMACARE NEAR DEATH... 75 |
76 | APPLE Sets Sights on Hollywood With Plans for Original Content... 77 |
78 | Samartian sees shot policeman on road, kills gunman who ambushed... 79 |
80 | COPS: Black Teen Behind 'KoolkidsKlanKkk' Account Threatening Black Students... 81 |
82 | SURVEY: 93% Of Cops More Worried For Safety....

83 | Interactions With Black Citizens More Tense...
84 |
85 | More than 40% of California out of drought after powerful storms... 86 |
87 | Deportations Drop 73% Under Obama, Hit 43-Year Low...

88 | Illegals Flocking To 'Sanctuary City' Chicago Before Trump Inauguration... 89 |
90 | Bee placed on endangered list... 91 |
92 | Drone crashes into Space Needle... 93 |
94 | Why Smart People Don't Multitask... 95 |
96 | 97 | 98 | 99 | 100 |
101 | 102 | 103 | 104 |
105 |
106 | 107 |
108 | 109 | 110 | 111 | FRONT PAGES UK 112 | WORLD
113 | BOXOFFICE 114 | TV RATINGS 115 |
116 | ABCNEWS
117 | ADWEEK
118 | ANTI-WAR.COM
119 | ATLANTIC
120 | BBC
121 | BILD
122 | BILLBOARD
123 | BLAZE
124 | BOSTON GLOBE
125 | BOSTON HERALD
126 | BREITBART
127 | BUSINESS INSIDER
128 | BUZZFEED
129 | CBS NEWS
130 | CBS NEWS LOCAL
131 | C-SPAN
132 | CHICAGO SUN-TIMES
133 | CHICAGO TRIB
134 | CHRISTIAN SCIENCE
135 | CNBC
136 | CNN
137 | DAILY BEAST
138 | DAILY CALLER
139 | DEADLINE HOLLYWOOD
140 | DER SPIEGEL
141 | E!
142 | ECONOMIST
143 | ENT WEEKLY
144 | FINANCIAL TIMES
145 | FORBES
146 | FOXNEWS
147 | FRANCE 24
148 | FREE BEACON
149 | FREE REPUBLIC
150 | HOT AIR
151 | HELLO!
152 | HILL
153 | HILL: JUST IN
154 | H'WOOD REPORTER
155 | HUFFINGTON POST
156 | INFOWARS
157 | INTERCEPT
158 | INVEST BUS DAILY
159 | JERUSALEM POST
160 | LA DAILY NEWS
161 | LA TIMES
162 | LUCIANNE.COM
163 | MEDIAITE
164 | MOTHER JONES
165 | NATION
166 | NATIONAL REVIEW
167 | NBC NEWS
168 | NEW REPUBLIC
169 | NEW YORK
170 | NY DAILY NEWS
171 | NY OBSERVER
172 | NY POST
173 | NY TIMES
174 | NY TIMES WIRE
175 | NEW YORKER
176 | NEWSBUSTERS
177 | NEWSMAX
178 | NEWSWEEK
179 | NKOREAN NEWS
180 | PEOPLE
181 | PHILLY INQUIRER
182 | PHILLY DAILY NEWS
183 | PJ MEDIA
184 | POLITICO
185 | RADAR
186 | REAL CLEAR POLITICS
187 | REASON
188 | ROLL CALL
189 | ROLLING STONE
190 | SALON
191 | SAN FRAN CHRON
192 | SKY NEWS
193 | SLATE
194 | SMOKING GUN
195 | SYDNEY MORNING HERALD
196 | TALKING POINTS MEMO
197 | TIME MAG
198 | TMZ
199 | [UK] DAILY MAIL
200 | [UK] DAILY MAIL FEED
201 | [UK] DAILY MIRROR
202 | [UK] DAILY RECORD
203 | [UK] EVENING STANDARD
204 | [UK] EXPRESS
205 | [UK] GUARDIAN
206 | [UK] INDEPENDENT
207 | [UK] SUN
208 | [UK] TELEGRAPH
209 | US NEWS
210 | USA TODAY
211 | VANITY FAIR
212 | VARIETY
213 | VILLAGE VOICE
214 | WALL STREET JOURNAL
215 | WALL STREET JOURNAL FEED
216 | WASH EXAMINER
217 | WASH POST
218 | WASH TIMES
219 | WEEKLY STANDARD
220 | WORLD NET DAILY
221 | X17
222 | ZERO HEDGE
223 | 224 |
225 |
232 | 233 | PAUL ANKA SET TO PERFORM 'MY WAY' AT TRUMP INAUGURATION... 234 |
235 | Babies 'made without mothers'... 236 |
237 | Soros Lost $1 Billion in Weeks After Trump Election...

238 | Hillary stumbles out of Manhattan restaurant... 239 |
240 | PEW: Religion Plummeted in America During Obama Era...

241 | School District Suspends Bible Study After Donuts Used To Entice... 242 |
243 | TRUMP HOT: Le Pen Spotted at Tower in Unannounced Visit During French Race...

244 | NATIONALISM AT HEART OF CAMPAIGN... 245 |
246 | Girl streams her suicide on FACEBOOK...

247 | Live Video Grows as Platform to Broadcast Violence... 248 |
249 | COPS: Man posted on wife's FACEBOOK to cover up her killing... 250 |
251 | Lady lived with sister's decomposed body in mansion for over year... 252 |
253 | 12-year-old demands chicken nugget at gunpoint... 254 |
255 | 256 |

257 | Poised Prescott looking to make history... 259 |
260 | China launches new electronic intelligence naval ship... 261 |

262 | Turns to robots as workers age... 263 |
264 |

265 | Spectacular cloud formation spotted over Oz from passenger plane... 266 |
267 | Mystery footage shows 'huge ball of fire' shooting over Earth... 268 |
269 | Over-the-top luxury cruise ship built to create wows -- for one-percenters... 270 |
271 | Study finds how stress raises heart disease and stroke risk... 272 |
273 | Scientists hear voice of ancient humans in baboon calls...

274 | Researchers closer to solving mystery of Earth's core... 275 |
276 | 277 |

278 | THE INCREDIBLE BULK...

279 | 'World's Strongest' eats 7 lbs of meat -- per day... 280 |
281 | Orwellian billed as convenient as more devices eavesdrop...

282 | WEARABLE SENSORS 'CHECK ENGINE' LIGHT FOR HEALTH... 283 |
284 | Smartphone, internet use record high... 285 |
286 | 287 | 288 | 289 | 3 AM GIRLS
290 | JILL ABRAMSON
291 | CINDY ADAMS
292 | BAZ BAMIGBOYE
293 | DAVE BARRY
294 | FRED BARNES
295 | MICHAEL BARONE
296 | PAUL BEDARD
297 | BIZARRE [SUN]
298 | BRENT BOZELL
299 | DAVID BROOKS
300 | PAT BUCHANAN
301 | HOWIE CARR
302 | MONA CHAREN
303 | CHRIS CILLIZZA
304 | CNN: RELIABLE SOURCES
305 | [NY DAILY NEWS] CONFIDENTIAL
306 | DAVID CORN
307 | ANN COULTER
308 | LOU DOBBS
309 | MAUREEN DOWD
310 | LARRY ELDER
311 | JOSEPH FARAH
312 | SUZANNE FIELDS
313 | FISHBOWL, DC
314 | FISHBOWL, NYC
315 | ROGER FRIEDMAN
316 | BILL GERTZ
317 | JONAH GOLDBERG
318 | GLENN GREENWALD
319 | LLOYD GROVE
320 | HANNITY
321 | STEPHEN HAYES
322 | HUGH HEWITT
323 | KATIE HOPKINS
324 | LAURA INGRAHAM
325 | INSIDE BELTWAY
326 | RICHARD JOHNSON
327 | ALEX JONES
328 | MICKEY KAUS
329 | KEITH J. KELLY
330 | KRAUTHAMMER
331 | KRISTOF
332 | KRISTOL
333 | KRUGMAN
334 | LARRY KUDLOW
335 | HOWIE KURTZ
336 | MARK LEVIN
337 | DAVID LIMBAUGH
338 | RUSH LIMBAUGH
339 | RICH LOWRY
340 | MICHELLE MALKIN
341 | DANA MILBANK
342 | PIERS MORGAN
343 | DICK MORRIS
344 | PEGGY NOONAN
345 | PAGE SIX
346 | ANDREA PEYSER
347 | JIM PINKERTON
348 | POLITICO MORNING MEDIA
349 | POLITICO PLAYBOOK
350 | BILL PRESS
351 | WES PRUDEN
352 | REX REED
353 | RICHARD ROEPER
354 | BETSY ROTHSTEIN
355 | JIM RUTENBERG 356 |
357 | MICHAEL SAVAGE
358 | LIZ SMITH
359 | THOMAS SOWELL
360 | BRIAN STELTER
361 | MARK STEYN
362 | ROGER STONE
363 | TAKI THEODORACOPULOS
364 | CAL THOMAS
365 | BOB TYRRELL 366 |
367 | TV NEWSER
368 | JEFF WELLS
369 | WASHINGTON WHISPERS
370 | GEORGE WILL
371 | WALTER WILLIAMS
372 | MILO YIANNOPOULOS
373 | BYRON YORK
374 | BILL ZWECKER
375 | 376 |
377 |
385 | 386 | 387 | 388 | GREAT AGAIN: AMAZON to create 100,000 jobs in USA... 389 |

390 | Bezos anonymous buyer of BIGGEST house in DC... 391 |
392 | Blackout halts Trump CIA pick confirmation hearing...

393 | Dem Sen grills Pompeo -- on climate change! 394 |
395 | CNN Trump tirade may affect TIME WARNER-AT&T merger...

396 | Voters Unfazed by 'Controversies'... 397 |
398 | Trump's 'Cadillac One' limo will have tear gas cannon, shotgun... 399 |
400 | Kremlin says U.S. military build-up in Poland a national security threat... 401 |

402 | Army moves 2,500 tanks, trucks... 403 |
404 | Obama expands power of NSA in final days... 405 |

406 |

407 | How Can We Miss a President Who Won't Go Away?

408 | Stevie Wonder serenades Michelle O...

409 | Obama Dog Sunny Bites White House Guest... 410 |
411 | Porn Star Breaks Silence Over Abuse Allegations... 412 |
413 | Something 'fishy' going on at LA sushi restaurants... 414 |
415 | US soldier commits suicide at Kuwait base... 416 |
417 | Istanbul tourism reels after attacks... 418 |

419 | Spain Dismisses Terror and Hate Crime Case Against Puppeteers... 420 |
421 | 422 | 423 |
424 | 425 | 426 | 427 |
428 |
429 | 430 | 431 |
432 | 433 | 434 | 435 | AGENCE FRANCE-PRESSE
436 | AP TOP
437 | AP HEADLINE WALL
438 | AP RAW
439 | AP RADIO
440 | BLOOMBERG
441 | DEUTSCHE PRESSE-AGENTUR
442 | DOW JONES
443 | INDO-ASIAN NEWS SERVICE
444 | INTERFAX
445 | ITAR-TASS
446 | KYODO
447 | MCCLATCHY [DC]
448 | PRAVDA
449 | PRESS TRUST INDIA
450 | PR NEWSWIRE
451 | REUTERS
452 | REUTERS POLITICS
453 | REUTERS WORLD
454 | XINHUA
455 | UPI
456 | YONHAP 457 |
458 | 459 | 460 |
461 | 462 | 463 | 464 | 465 |
466 |
467 |
468 | 469 |
470 | 471 | GOOGLE NEWS

472 |
473 |

474 |
475 | 476 |
477 |
478 |
479 | RECENT DRUDGE HEADLINES... 480 |
481 | WEATHER ACTION
482 | QUAKE SHEET 483 |
484 |
485 | SEND NEWS TIPS TO DRUDGE
486 |
487 | 488 |
489 |
490 | GET IT ON THE GO: DRUDGE MOBILE... 491 | 492 |
493 | VISITS TO DRUDGE 1/12/2017

494 | 038,310,062 PAST 24 HOURS
495 | 784,990,369 PAST 31 DAYS
496 | 10,731,244,947 PAST YEAR 497 |
498 | 499 |
500 | DRUDGE REFERENCE DESK 501 |
502 | EMAIL: DRUDGE@DRUDGEREPORT.COM 503 |
504 | BE SEEN! RUN ADS ON DRUDGE REPORT...

505 | PRIVACY POLICY... 506 |
507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 535 | 536 | 537 | 538 | 539 | -------------------------------------------------------------------------------- /priv/test_data/example.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Domain 5 | 6 | 7 | 8 | 9 | 40 | 41 | 42 | 43 |
44 |

Example Domain

45 |

This domain is established to be used for illustrative examples in documents. You may use this 46 | domain in examples without prior coordination or asking for permission.

47 |

More information...

48 |
49 | 50 | 51 | -------------------------------------------------------------------------------- /test/html5ever_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Html5everTest do 2 | use ExUnit.Case, async: true 3 | doctest Html5ever 4 | 5 | def read_html(name) do 6 | path = Path.join([:code.priv_dir(:html5ever), "test_data", name]) 7 | File.read!(path) 8 | end 9 | 10 | test "parse basic html" do 11 | html = "

Hello

" 12 | 13 | assert Html5ever.parse(html) == 14 | {:ok, 15 | [ 16 | {"html", [], 17 | [ 18 | {"head", [], []}, 19 | {"body", [], [{"h1", [], ["Hello"]}, {:comment, " my comment "}]} 20 | ]} 21 | ]} 22 | end 23 | 24 | test "does not parse with not valid UTF8 binary" do 25 | invalid = 26 | <<98, 29, 104, 122, 46, 145, 14, 37, 122, 155, 227, 121, 49, 120, 108, 209, 155, 113, 229, 27 | 98, 90, 181, 146>> 28 | 29 | assert Html5ever.parse(invalid) == 30 | {:error, "cannot transform bytes from binary to a valid UTF8 string"} 31 | end 32 | 33 | test "flat parse basic html" do 34 | html = "" 35 | 36 | ret = 37 | {:ok, 38 | %{ 39 | nodes: %{ 40 | 0 => %{id: 0, parent: nil, type: :document}, 41 | 1 => %{children: [2, 3], id: 1, parent: 0, type: :element, attrs: [], name: "html"}, 42 | 2 => %{children: [], id: 2, parent: 1, type: :element, attrs: [], name: "head"}, 43 | 3 => %{ 44 | children: [], 45 | id: 3, 46 | parent: 1, 47 | type: :element, 48 | attrs: [{"test", "woo"}], 49 | name: "body" 50 | } 51 | }, 52 | root: 0 53 | }} 54 | 55 | assert Html5ever.flat_parse(html) == ret 56 | end 57 | 58 | test "does not flat parse with not valid UTF8 binary" do 59 | invalid = 60 | <<98, 29, 104, 122, 46, 145, 14, 37, 122, 155, 227, 121, 49, 120, 108, 209, 155, 113, 229, 61 | 98, 90, 181, 146>> 62 | 63 | assert Html5ever.flat_parse(invalid) == 64 | {:error, "cannot transform bytes from binary to a valid UTF8 string"} 65 | end 66 | 67 | test "flat parse basic html with attributes as maps" do 68 | # Duplicated attribute is removed. 
69 | html = "" 70 | 71 | ret = 72 | {:ok, 73 | %{ 74 | nodes: %{ 75 | 0 => %{id: 0, parent: nil, type: :document}, 76 | 1 => %{children: [2, 3], id: 1, parent: 0, type: :element, attrs: %{}, name: "html"}, 77 | 2 => %{children: [], id: 2, parent: 1, type: :element, attrs: %{}, name: "head"}, 78 | 3 => %{ 79 | children: [], 80 | id: 3, 81 | parent: 1, 82 | type: :element, 83 | attrs: %{"test" => "woo", "class" => "content"}, 84 | name: "body" 85 | } 86 | }, 87 | root: 0 88 | }} 89 | 90 | assert Html5ever.flat_parse_with_attributes_as_maps(html) == ret 91 | end 92 | 93 | test "parse example.com html" do 94 | html = read_html("example.html") 95 | assert {:ok, _} = Html5ever.parse(html) 96 | end 97 | 98 | test "flat parse example.com html" do 99 | html = read_html("example.html") 100 | assert {:ok, _} = Html5ever.flat_parse(html) 101 | end 102 | 103 | test "parse drudgereport.com html" do 104 | html = read_html("drudgereport.html") 105 | assert {:ok, _} = Html5ever.parse(html) 106 | end 107 | 108 | test "flat parse drudgereport.com html" do 109 | html = read_html("drudgereport.html") 110 | assert {:ok, _} = Html5ever.flat_parse(html) 111 | end 112 | 113 | test "unbalanced worst case" do 114 | html = String.duplicate("
", 100) 115 | assert {:ok, _} = Html5ever.parse(html) 116 | end 117 | 118 | test "flat unbalanced worst case" do 119 | html = String.duplicate("
", 100) 120 | assert {:ok, _} = Html5ever.flat_parse(html) 121 | end 122 | 123 | test "reasonably deep html" do 124 | html = """ 125 | 126 | 127 | 128 | Test 129 | 130 | 131 |
132 | 133 |
134 | 135 | 136 | very deep content 137 | 138 | 139 |
140 | 141 |
142 |
143 | 144 | 145 | """ 146 | 147 | parsed = Html5ever.parse(html) 148 | 149 | assert {:ok, 150 | [ 151 | {:doctype, "html", "", ""}, 152 | {"html", [], 153 | [ 154 | {"head", [], ["\n", " ", {"title", [], ["Test"]}, "\n", " "]}, 155 | "\n", 156 | " ", 157 | {"body", [], 158 | [ 159 | "\n", 160 | " ", 161 | {"div", [{"class", "content"}], 162 | [ 163 | "\n", 164 | " ", 165 | {"span", [], 166 | [ 167 | "\n", 168 | " ", 169 | {"div", [], 170 | [ 171 | "\n", 172 | " ", 173 | {"span", [], 174 | [ 175 | "\n", 176 | " ", 177 | {"small", [], 178 | ["\n", " very deep content", "\n", " "]}, 179 | "\n", 180 | " " 181 | ]}, 182 | "\n", 183 | " " 184 | ]}, 185 | "\n", 186 | " ", 187 | {"img", [{"src", "file.jpg"}], []}, 188 | "\n", 189 | " " 190 | ]}, 191 | "\n", 192 | " " 193 | ]}, 194 | "\n", 195 | " ", 196 | "\n", 197 | "\n" 198 | ]} 199 | ]} 200 | ]} = parsed 201 | end 202 | 203 | test "reasonably deep html with attributes as maps" do 204 | html = """ 205 | 206 | 207 | 208 | Test 209 | 210 | 211 |
212 | 213 |
214 | 215 | 216 | very deep content 217 | 218 | 219 |
220 | 221 |
222 |
223 | 224 | 225 | """ 226 | 227 | parsed = Html5ever.parse_with_attributes_as_maps(html) 228 | 229 | assert {:ok, 230 | [ 231 | {:doctype, "html", "", ""}, 232 | {"html", %{}, 233 | [ 234 | {"head", %{}, ["\n", " ", {"title", %{}, ["Test"]}, "\n", " "]}, 235 | "\n", 236 | " ", 237 | {"body", %{}, 238 | [ 239 | "\n", 240 | " ", 241 | {"div", %{"class" => "content"}, 242 | [ 243 | "\n", 244 | " ", 245 | {"span", %{}, 246 | [ 247 | "\n", 248 | " ", 249 | {"div", %{}, 250 | [ 251 | "\n", 252 | " ", 253 | {"span", %{}, 254 | [ 255 | "\n", 256 | " ", 257 | {"small", %{}, 258 | ["\n", " very deep content", "\n", " "]}, 259 | "\n", 260 | " " 261 | ]}, 262 | "\n", 263 | " " 264 | ]}, 265 | "\n", 266 | " ", 267 | {"img", %{"src" => "file.jpg"}, []}, 268 | "\n", 269 | " " 270 | ]}, 271 | "\n", 272 | " " 273 | ]}, 274 | "\n", 275 | " ", 276 | "\n", 277 | "\n" 278 | ]} 279 | ]} 280 | ]} = parsed 281 | end 282 | 283 | test "parse html with a template tag ignores template content" do 284 | html = """ 285 | 286 | 287 | With template 288 | 289 |

Document

290 | 294 | 295 | 296 | """ 297 | 298 | assert Html5ever.parse(html) == 299 | {:ok, 300 | [ 301 | {:doctype, "html", "", ""}, 302 | {"html", [], 303 | [ 304 | {"head", [], [{"title", [], ["With template"]}]}, 305 | "\n", 306 | {"body", [], 307 | ["\n", {"h1", [], ["Document"]}, "\n", {"template", [], []}, "\n", "\n", "\n"]} 308 | ]} 309 | ]} 310 | end 311 | 312 | test "parse html starting with a XML tag" do 313 | html = """ 314 | 315 | 316 | 317 | 318 | Hello 319 | 320 | link 321 | 322 | 323 | """ 324 | 325 | assert Html5ever.parse(html) == 326 | {:ok, 327 | [ 328 | {:comment, "?xml version=\"1.0\" encoding=\"UTF-8\"?"}, 329 | {:comment, " also a comment is allowed "}, 330 | {:doctype, "html", "-//W3C//DTD XHTML 1.0 Strict//EN", 331 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"}, 332 | { 333 | "html", 334 | [{"xmlns", "http://www.w3.org/1999/xhtml"}, {"xml:lang", "en"}, {"lang", "en"}], 335 | [ 336 | {"head", [], [{"title", [], ["Hello"]}]}, 337 | "\n", 338 | " ", 339 | {"body", [], 340 | [ 341 | "\n", 342 | " ", 343 | {"a", [{"id", "anchor"}, {"href", "https://example.com"}], ["link"]}, 344 | "\n", 345 | " ", 346 | "\n", 347 | "\n" 348 | ]} 349 | ] 350 | } 351 | ]} 352 | end 353 | end 354 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | --------------------------------------------------------------------------------