├── .clang-format ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── dependabot.yml └── workflows │ ├── build.yml │ ├── release.yml │ ├── website.yml │ └── wheels.yml ├── .gitignore ├── .gitmodules ├── .vscode └── settings.json ├── CHANGELOG.md ├── CMakeLists.txt ├── CONTRIBUTING.md ├── COPYING ├── MANIFEST.in ├── Package.swift ├── README.md ├── SECURITY.md ├── cli └── main.cpp ├── cmake ├── Doc.cmake └── Packaging.cmake ├── docs ├── Doxyfile ├── doxygen-awesome-css │ ├── .gitignore │ ├── LICENSE │ ├── doxygen-awesome-darkmode-toggle.js │ ├── doxygen-awesome-fragment-copy-button.js │ ├── doxygen-awesome-interactive-toc.js │ ├── doxygen-awesome-paragraph-link.js │ ├── doxygen-awesome-sidebar-only-darkmode-toggle.css │ ├── doxygen-awesome-sidebar-only.css │ ├── doxygen-awesome.css │ └── doxygen-custom │ │ ├── custom.css │ │ └── header.html └── index.md ├── html2md.pc.in ├── html2mdConfig.cmake.in ├── include ├── html2md.h └── table.h ├── js └── bindings.cpp ├── objc ├── html2md_objc.mm └── include │ └── html2md_objc.h ├── pyproject.toml ├── python ├── README.md └── bindings.cpp ├── scripts └── clang-format.sh ├── src ├── html2md.cpp └── table.cpp └── tests ├── CMakeLists.txt ├── README.md ├── benchmark.cpp ├── blockquote.md ├── breaks.md ├── code.md ├── comment.html ├── escaping.md ├── formating.md ├── links.md ├── lists.md ├── main.cpp ├── python ├── test_advanced.py ├── test_basic.py └── test_html_symbol_conversions.py └── tables.md /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignArrayOfStructures: None 7 | AlignConsecutiveMacros: None 8 | AlignConsecutiveAssignments: None 9 | AlignConsecutiveBitFields: None 10 | AlignConsecutiveDeclarations: None 11 | AlignEscapedNewlines: Right 12 | AlignOperands: Align 13 | AlignTrailingComments: 
true 14 | AllowAllArgumentsOnNextLine: true 15 | AllowAllParametersOfDeclarationOnNextLine: true 16 | AllowShortEnumsOnASingleLine: true 17 | AllowShortBlocksOnASingleLine: Never 18 | AllowShortCaseLabelsOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: All 20 | AllowShortLambdasOnASingleLine: All 21 | AllowShortIfStatementsOnASingleLine: Never 22 | AllowShortLoopsOnASingleLine: false 23 | AlwaysBreakAfterDefinitionReturnType: None 24 | AlwaysBreakAfterReturnType: None 25 | AlwaysBreakBeforeMultilineStrings: false 26 | AlwaysBreakTemplateDeclarations: MultiLine 27 | AttributeMacros: 28 | - __capability 29 | BinPackArguments: true 30 | BinPackParameters: true 31 | BraceWrapping: 32 | AfterCaseLabel: false 33 | AfterClass: false 34 | AfterControlStatement: Never 35 | AfterEnum: false 36 | AfterFunction: false 37 | AfterNamespace: false 38 | AfterObjCDeclaration: false 39 | AfterStruct: false 40 | AfterUnion: false 41 | AfterExternBlock: false 42 | BeforeCatch: false 43 | BeforeElse: false 44 | BeforeLambdaBody: false 45 | BeforeWhile: false 46 | IndentBraces: false 47 | SplitEmptyFunction: true 48 | SplitEmptyRecord: true 49 | SplitEmptyNamespace: true 50 | BreakBeforeBinaryOperators: None 51 | BreakBeforeConceptDeclarations: true 52 | BreakBeforeBraces: Attach 53 | BreakBeforeInheritanceComma: false 54 | BreakInheritanceList: BeforeColon 55 | BreakBeforeTernaryOperators: true 56 | BreakConstructorInitializersBeforeComma: false 57 | BreakConstructorInitializers: BeforeColon 58 | BreakAfterJavaFieldAnnotations: false 59 | BreakStringLiterals: true 60 | ColumnLimit: 80 61 | CommentPragmas: '^ IWYU pragma:' 62 | QualifierAlignment: Leave 63 | CompactNamespaces: false 64 | ConstructorInitializerIndentWidth: 4 65 | ContinuationIndentWidth: 4 66 | Cpp11BracedListStyle: true 67 | DeriveLineEnding: true 68 | DerivePointerAlignment: false 69 | DisableFormat: false 70 | EmptyLineAfterAccessModifier: Never 71 | EmptyLineBeforeAccessModifier: LogicalBlock 72 | 
ExperimentalAutoDetectBinPacking: false 73 | PackConstructorInitializers: BinPack 74 | BasedOnStyle: '' 75 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 76 | AllowAllConstructorInitializersOnNextLine: true 77 | FixNamespaceComments: true 78 | ForEachMacros: 79 | - foreach 80 | - Q_FOREACH 81 | - BOOST_FOREACH 82 | IfMacros: 83 | - KJ_IF_MAYBE 84 | IncludeBlocks: Preserve 85 | IncludeCategories: 86 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 87 | Priority: 2 88 | SortPriority: 0 89 | CaseSensitive: false 90 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 91 | Priority: 3 92 | SortPriority: 0 93 | CaseSensitive: false 94 | - Regex: '.*' 95 | Priority: 1 96 | SortPriority: 0 97 | CaseSensitive: false 98 | IncludeIsMainRegex: '(Test)?$' 99 | IncludeIsMainSourceRegex: '' 100 | IndentAccessModifiers: false 101 | IndentCaseLabels: false 102 | IndentCaseBlocks: false 103 | IndentGotoLabels: true 104 | IndentPPDirectives: None 105 | IndentExternBlock: AfterExternBlock 106 | IndentRequires: false 107 | IndentWidth: 2 108 | IndentWrappedFunctionNames: false 109 | InsertTrailingCommas: None 110 | JavaScriptQuotes: Leave 111 | JavaScriptWrapImports: true 112 | KeepEmptyLinesAtTheStartOfBlocks: true 113 | LambdaBodyIndentation: Signature 114 | MacroBlockBegin: '' 115 | MacroBlockEnd: '' 116 | MaxEmptyLinesToKeep: 1 117 | NamespaceIndentation: None 118 | ObjCBinPackProtocolList: Auto 119 | ObjCBlockIndentWidth: 2 120 | ObjCBreakBeforeNestedBlockParam: true 121 | ObjCSpaceAfterProperty: false 122 | ObjCSpaceBeforeProtocolList: true 123 | PenaltyBreakAssignment: 2 124 | PenaltyBreakBeforeFirstCallParameter: 19 125 | PenaltyBreakComment: 300 126 | PenaltyBreakFirstLessLess: 120 127 | PenaltyBreakOpenParenthesis: 0 128 | PenaltyBreakString: 1000 129 | PenaltyBreakTemplateDeclaration: 10 130 | PenaltyExcessCharacter: 1000000 131 | PenaltyReturnTypeOnItsOwnLine: 60 132 | PenaltyIndentedWhitespace: 0 133 | PointerAlignment: Right 134 | PPIndentWidth: -1 135 | ReferenceAlignment: 
Pointer 136 | ReflowComments: true 137 | RemoveBracesLLVM: false 138 | SeparateDefinitionBlocks: Leave 139 | ShortNamespaceLines: 1 140 | SortIncludes: CaseSensitive 141 | SortJavaStaticImport: Before 142 | SortUsingDeclarations: true 143 | SpaceAfterCStyleCast: false 144 | SpaceAfterLogicalNot: false 145 | SpaceAfterTemplateKeyword: true 146 | SpaceBeforeAssignmentOperators: true 147 | SpaceBeforeCaseColon: false 148 | SpaceBeforeCpp11BracedList: false 149 | SpaceBeforeCtorInitializerColon: true 150 | SpaceBeforeInheritanceColon: true 151 | SpaceBeforeParens: ControlStatements 152 | SpaceBeforeParensOptions: 153 | AfterControlStatements: true 154 | AfterForeachMacros: true 155 | AfterFunctionDefinitionName: false 156 | AfterFunctionDeclarationName: false 157 | AfterIfMacros: true 158 | AfterOverloadedOperator: false 159 | BeforeNonEmptyParentheses: false 160 | SpaceAroundPointerQualifiers: Default 161 | SpaceBeforeRangeBasedForLoopColon: true 162 | SpaceInEmptyBlock: false 163 | SpaceInEmptyParentheses: false 164 | SpacesBeforeTrailingComments: 1 165 | SpacesInAngles: Never 166 | SpacesInConditionalStatement: false 167 | SpacesInContainerLiterals: true 168 | SpacesInCStyleCastParentheses: false 169 | SpacesInLineCommentPrefix: 170 | Minimum: 1 171 | Maximum: -1 172 | SpacesInParentheses: false 173 | SpacesInSquareBrackets: false 174 | SpaceBeforeSquareBrackets: false 175 | BitFieldColonSpacing: Both 176 | Standard: Latest 177 | StatementAttributeLikeMacros: 178 | - Q_EMIT 179 | StatementMacros: 180 | - Q_UNUSED 181 | - QT_REQUIRE_VERSION 182 | TabWidth: 8 183 | UseCRLF: false 184 | UseTab: Never 185 | WhitespaceSensitiveMacros: 186 | - STRINGIZE 187 | - PP_STRINGIZE 188 | - BOOST_PP_STRINGIZE 189 | - NS_SWIFT_NAME 190 | - CF_SWIFT_NAME 191 | ... 
192 | 193 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Generated by CODEOWNERS.com 2 | 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Run '....' 16 | 2. See error 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Output of `dmesg | tail -n2`** 22 | (for Linux and maybe mac) 23 | 24 | **Desktop (please complete the following information):** 25 | - OS: [e.g. Ubuntu 22.04] 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | For example 'How to customize lists' 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | # Maintain dependencies for GitHub Actions 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "weekly" 13 | 14 | # Keep submodules up to date 15 | - package-ecosystem: "gitsubmodule" 16 | directory: "/" 17 | schedule: 18 | interval: "weekly" 19 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: 'Build' 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: [ubuntu-latest, macos-latest, windows-latest] 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | 21 | # 22 | # Build using CMake 23 | # 24 | - name: Build using CMake 25 | run: | 26 | mkdir build && cd build 27 | cmake .. 
28 | cmake --build . -j8 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Create release and update assets 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | name: Build and upload release assets 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | include: 17 | - os: ubuntu-latest 18 | files: | 19 | build/packages/html2md*.deb 20 | build/packages/html2md*.tar.gz 21 | - os: ubuntu-22.04 22 | files: | 23 | build/packages/html2md*.deb 24 | - os: windows-latest 25 | files: | 26 | build/packages/html2md*.zip 27 | steps: 28 | - name: Checkout repo 29 | uses: actions/checkout@v4 30 | 31 | - name: Build and package 32 | run: | 33 | mkdir build && cd build 34 | cmake -DBUILD_TEST=OFF -DBUILD_DOC=OFF -DCMAKE_BUILD_TYPE=Release .. 35 | cmake --build . --config Release 36 | cmake --build . 
--config Release --target package 37 | shell: bash 38 | 39 | - name: Upload package 40 | uses: actions/upload-artifact@v4 41 | with: 42 | path: ${{ matrix.files }} 43 | name: ${{ matrix.os }} 44 | 45 | publish: 46 | name: Create release and upload files 47 | runs-on: ubuntu-22.04 48 | needs: build 49 | permissions: 50 | contents: write 51 | strategy: 52 | fail-fast: false 53 | steps: 54 | - uses: actions/checkout@v4 55 | with: 56 | fetch-depth: 0 57 | 58 | - name: Download release assets 59 | uses: actions/download-artifact@v4 60 | with: 61 | path: packages 62 | 63 | - name: Find changes and release assets 64 | id: files 65 | run: | 66 | PREVIOS="$(git tag --sort=creatordate | tail -n 2 | head -n1)" 67 | wget https://raw.githubusercontent.com/tim-gromeyer/html2md/$PREVIOS/CHANGELOG.md -O OLD.md 68 | echo "CHANGES<<EOF" >> $GITHUB_ENV 69 | echo "$(grep -Fvxf OLD.md CHANGELOG.md | tail -n +2)" >> $GITHUB_ENV 70 | echo "EOF" >> $GITHUB_ENV 71 | 72 | echo "FILES<<EOF" >> $GITHUB_ENV 73 | find packages/ -name "*" -type f >> $GITHUB_ENV 74 | echo "EOF" >> $GITHUB_ENV 75 | 76 | - name: Release 77 | uses: softprops/action-gh-release@v2 78 | if: startsWith(github.ref, 'refs/tags/') 79 | with: 80 | generate_release_notes: true 81 | body: ${{ env.CHANGES }} 82 | files: ${{ env.FILES }} 83 | -------------------------------------------------------------------------------- /.github/workflows/website.yml: -------------------------------------------------------------------------------- 1 | name: Update website 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | website: 11 | name: Build website and deploy to gh pages 12 | runs-on: ubuntu-22.04 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Add version to Doxyfile 18 | run: | 19 | echo "PROJECT_NUMBER = ${{ github.ref_name }}" >> docs/Doxyfile 20 | 21 | - name: Run Doxygen 22 | uses: mattnotmitt/doxygen-action@edge 23 | with: 24 | doxyfile-path: 'docs/Doxyfile' 25 | 26 | - name: Deploy 27 | 
uses: peaceiris/actions-gh-pages@v4 28 | with: 29 | github_token: ${{ secrets.GITHUB_TOKEN }} 30 | publish_dir: ./doc 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build Python wheels 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build_sdist: 11 | name: Build SDist 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: true 17 | 18 | - name: Build SDist 19 | run: pipx run build --sdist 20 | 21 | - name: Check metadata 22 | run: pipx run twine check dist/* 23 | 24 | - uses: actions/upload-artifact@v4 25 | with: 26 | name: dist-sdist 27 | path: dist/*.tar.gz 28 | 29 | build_wheels: 30 | name: Wheels on ${{ matrix.os }} 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | os: [ubuntu-latest, windows-latest, macos-latest] 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | with: 40 | submodules: true 41 | 42 | - uses: pypa/cibuildwheel@v2.19.1 43 | env: 44 | CIBW_ARCHS_MACOS: "auto universal2" 45 | CIBW_SKIP: "{cp36-*, cp37-*}" # Skip Python 3.6/3.7 46 | 47 | - name: Verify clean directory 48 | run: git diff --exit-code 49 | shell: bash 50 | 51 | - name: Upload wheels 52 | uses: actions/upload-artifact@v4 53 | with: 54 | name: dist-${{ matrix.os }} 55 | path: wheelhouse/*.whl 56 | 57 | upload_all: 58 | name: Upload wheels 59 | needs: [build_wheels, build_sdist] 60 | runs-on: ubuntu-latest 61 | 62 | steps: 63 | - uses: actions/download-artifact@v4 64 | with: 65 | path: dist 66 | pattern: dist-* 67 | merge-multiple: true 68 | 69 | - uses: pypa/gh-action-pypi-publish@v1.9.0 70 | with: 71 | password: ${{ secrets.PYPI_API_TOKEN }} 72 | 73 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.user* 2 | build-linux/ 3 | build-windows/ 4 | build-wasm/ 5 | doc/ 6 | tests/error.log 7 | conan/ 8 | build/ 9 | dist/ 10 | pyhtml2md.egg-info/ 11 | *.whl 12 | wheelhouse/ 13 | tests/__pycache__/ 14 | .DS_Store 15 | /.build 16 | /Packages 17 | xcuserdata/ 18 | DerivedData/ 19 | .swiftpm/configuration/registries.json 20 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 21 | .netrc 22 | *.pyc -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "python/pybind11"] 2 | path = python/pybind11 3 | url = https://github.com/pybind/pybind11 4 | [submodule "tests/md4c"] 5 | path = tests/md4c 6 | url = https://github.com/tim-gromeyer/MarkdownEdit_md4c 7 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cmake.configureArgs": [ 3 | "-DBUILD_TEST=ON", 4 | "-DPYTHON_BINDINGS=ON", 5 | "-G=Ninja", 6 | ] 7 | } -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change log 2 | 3 | [TOC] 4 | 5 | ## 1.7.0 6 | - Added API to add/remove HTML symbol conversions (see #158) 7 | - Added HTML symbol conversion API to Python bindings 8 | - Fixed attribute parsing with whitespace around equals sign (see #159) 9 | - Added full support for uppercase tags/attributes 10 | - Supported whitespace around tag names 11 | - Fixed issues with self-closing tags 12 | - Improved performance 13 | - Improved blockquote handling (see #157) 14 | 15 | ## 1.6.6 16 | - Python: Fix wheels not uploaded 17 | 18 | ## 1.6.5 19 | - Fix handling of `<br>` tags 20 | 21 | ## 1.6.4 22 | - Fix handling of `<br>` tags outside of paragraphs (`<p>`) 23 | 24 | ## 1.6.3 25 | - Update python dependencies, hopefully fixes (#133) 26 | 27 | ## 1.6.2 28 | - Fix HTML entities not converted (see #131) 29 | 30 | ## 1.6.0 31 | 32 | - Add option for soft line break 33 | - Add option for hard line break 34 | - Fix handling of self-closing tags 35 | - Updated python package building (see #100) 36 | 37 | ## 1.5.4 38 | 39 | - Fix crash (see #67) 40 | - Add support for newer Python versions 41 | 42 | ## 1.5.3 43 | 44 | - Make `blockquote` work correctly! 45 | - Additional note for 1.5.2: Add Python 3.12 packages 46 | 47 | ## 1.5.2 48 | 49 | - FIXED: Add `title` support for images 50 | - FIXED: Code got formatted (Spaces removed) 51 | - Fixed some formatting issues (like a space in front of `!`) 52 | - FIXED: Escaping of `*`, \`, and `\` 53 | - Reduced memory usage 54 | - Improved performance 55 | 56 | ## v1.5.1 57 | 58 | - **~40% Performance Improvement** 59 | 60 | ## v1.5.0 61 | 62 | - **Added an option to Format Markdown Tables** 63 | - More tests 64 | - Reworked cli program for better usability 65 | 66 | ## v1.4.4 67 | 68 | - New release with Python 3.11 support/packages 69 | - Updated internal dependencies 70 | 71 | ## v1.4.3 72 | 73 | - Improved performance 74 | - Updated 3rdparty tools (for creating python packages and creating releases) 75 | - Fix code example 76 | 77 | ## v1.4.2 78 | 79 | - Fixed Windows release builds being linked against debug libraries 80 | 81 | ## v1.4.1 82 | 83 | - **Fixed ALL memory leaks** 84 | - Fixed bugs (`html2md::Options::includeTitle` not working) 85 | - Added more tests 86 | - Documentation: Updated Doxygen to v1.9.6 87 | - Include Windows in releases 88 | 89 | ## v1.4.0 90 | 91 | - Improved CMake support massively! 
92 | - Fixed tests 93 | - Added support for CMake 3.8 94 | - Fix Python source package 95 | 96 | ## v1.3.0 97 | 98 | **BREAKING CHANGES!** 99 | 100 | - Renamed `Converter::Convert2Md` -> `Converter::convert()` 101 | - Renamed `options` -> `Options` 102 | 103 | ## v1.2.2 104 | 105 | - Fixed bug when calling `Convert2Md()` multiple times 106 | - Corrected several typos. Ignore the rest of the change log. 107 | 108 | ## v1.2.1 109 | 110 | - Added missing python dependency 111 | 112 | ## v1.2.0 113 | 114 | - **Added python bindings** 115 | - Added new option: `includeTable`. 116 | 117 | ## v1.1.5 118 | 119 | - Added more command line options to the executable 120 | 121 | ## v1.1.4 122 | 123 | - Releases now include deb files 124 | 125 | ## v1.1.3 126 | 127 | Users can now test their own Markdown files: simply pass them to the test program as arguments. 128 | 129 | ## v1.1.2 130 | 131 | - Add changes for v1.1.1 132 | - Automatically create releases when a new tag is added 133 | 134 | ## v1.1.1 135 | 136 | - Fix windows build(by replacing get) 137 | 138 | ## v1.1.0 139 | 140 | - Reworked command line program 141 | - Renamed `AppendToMd` to `appendToMd` 142 | - Renamed `AppendBlank` to `appendBlank` 143 | - **Require *c++11* instead of *c++17*.** Only the tests require *c++17* now. 144 | - Added more tests 145 | - Fix typos in comments 146 | - Improved documentation 147 | 148 | ## v1.0.1 149 | 150 | - Fixed several bugs 151 | - Added more tests: make test 152 | - Updated documentation: make doc 153 | - Added packaging: make package 154 | 155 | ## v1.0.0 156 | 157 | Initial release. All basics work but `blockquote` needs a rework. 
158 | 159 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8...3.31) 2 | project(html2md VERSION 1.7.0 LANGUAGES CXX) 3 | 4 | set(PROJECT_HOMEPAGE_URL "https://tim-gromeyer.github.io/html2md/") 5 | set(html2md_HOMEPAGE_URL "${PROJECT_HOMEPAGE_URL}") 6 | 7 | set(PROJECT_DESCRIPTION "Transform your HTML into clean, easy-to-read markdown with html2md") 8 | set(html2md_DESCRIPTION "${PROJECT_DESCRIPTION}") 9 | 10 | # If build type not specified we use release 11 | if (NOT CMAKE_BUILD_TYPE) 12 | message(STATUS "Build type not specified. Release is used.") 13 | set(CMAKE_BUILD_TYPE "Release") 14 | endif() 15 | 16 | # Improve performance 17 | if(CMAKE_BUILD_TYPE STREQUAL "Release") 18 | string(REPLACE "-O2" "-O3" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") 19 | string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 20 | endif() 21 | 22 | # Check if it was included via `add_subdirectory` 23 | get_directory_property(subproject PARENT_DIRECTORY) 24 | 25 | # Create HTML for webassembly 26 | if(EMSCRIPTEN) 27 | set(CMAKE_EXECUTABLE_SUFFIX ".html") 28 | endif() 29 | 30 | # Some options 31 | if (subproject) 32 | option(BUILD_EXE "Build a executable to convert html to markdown." OFF) 33 | else() 34 | option(BUILD_EXE "Build a executable to convert html to markdown." 
ON) 35 | endif() 36 | option(BUILD_DOC "Build documentation" OFF) 37 | option(BUILD_TEST "Build tests" OFF) 38 | option(PYTHON_BINDINGS "Build python bindings" OFF) 39 | 40 | set(SOURCES 41 | src/html2md.cpp 42 | src/table.cpp 43 | ) 44 | set(HEADERS 45 | include/html2md.h 46 | include/table.h 47 | ) 48 | 49 | if(PYTHON_BINDINGS) 50 | add_subdirectory(python/pybind11) 51 | pybind11_add_module(pyhtml2md python/bindings.cpp ${SOURCES} ${HEADERS}) 52 | target_compile_features(pyhtml2md PUBLIC 53 | cxx_auto_type # auto keyword 54 | cxx_constexpr # constexpr support 55 | cxx_range_for # for (auto test : tests) 56 | cxx_std_11 # Require at least c++11 57 | ) 58 | target_compile_definitions(pyhtml2md PRIVATE PYTHON_BINDINGS) 59 | target_include_directories(pyhtml2md PRIVATE include) 60 | if (SKBUILD) 61 | install(TARGETS pyhtml2md DESTINATION "${SKBUILD_PLATLIB_DIR}") 62 | endif() 63 | return() 64 | endif() 65 | 66 | add_library(html2md ${SOURCES}) 67 | set_target_properties(html2md PROPERTIES 68 | VERSION ${PROJECT_VERSION} 69 | SOVERSION ${PROJECT_VERSION_MAJOR} 70 | PUBLIC_HEADER "${HEADERS}" 71 | ) 72 | target_include_directories(html2md PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>) 73 | target_compile_features(html2md PUBLIC cxx_std_11) # Require at least c++11 74 | 75 | if ((subproject AND BUILD_SHARED_LIBS) OR BUILD_EXE) 76 | add_library(html2md-static STATIC ${HEADERS} ${SOURCES}) 77 | target_include_directories(html2md-static PUBLIC include) 78 | target_compile_features(html2md-static PUBLIC cxx_std_11) # Require at least c++11 79 | endif() 80 | 81 | if(BUILD_EXE) 82 | add_executable(html2md-exe cli/main.cpp) 83 | target_link_libraries(html2md-exe html2md-static) 84 | set_target_properties(html2md-exe PROPERTIES OUTPUT_NAME "html2md") 85 | target_compile_definitions(html2md-exe PUBLIC VERSION="${PROJECT_VERSION}") 86 | target_compile_features(html2md-exe PUBLIC cxx_std_11) # Require at least c++11 87 | endif() 88 | 89 | if(BUILD_TEST) 90 | add_subdirectory(tests) 91 | endif() 92 | 93 | if(BUILD_DOC) 
94 | include(cmake/Doc.cmake) 95 | endif() 96 | 97 | # Don't install as a subproject 98 | if(subproject) 99 | return() 100 | endif() 101 | 102 | include(GNUInstallDirs) 103 | include(CMakePackageConfigHelpers) 104 | 105 | install(TARGETS html2md 106 | EXPORT html2mdTargets 107 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 108 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 109 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 110 | PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/html2md 111 | ) 112 | install(EXPORT html2mdTargets 113 | FILE html2mdTargets.cmake 114 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/html2md" 115 | ) 116 | 117 | configure_file(html2md.pc.in html2md.pc @ONLY) 118 | install(FILES ${CMAKE_BINARY_DIR}/html2md.pc DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig) 119 | 120 | write_basic_package_version_file( 121 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" 122 | VERSION ${PROJECT_VERSION} 123 | COMPATIBILITY SameMajorVersion 124 | ) 125 | 126 | configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}Config.cmake.in 127 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" 128 | INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/html2md 129 | NO_CHECK_REQUIRED_COMPONENTS_MACRO 130 | ) 131 | 132 | install(FILES 133 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" 134 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" 135 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/html2md 136 | ) 137 | 138 | if (BUILD_EXE) 139 | install(TARGETS html2md-exe DESTINATION bin) 140 | endif() 141 | 142 | include(cmake/Packaging.cmake) 143 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Just fork the repo, edit it and then create a pull request! 
4 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Tim Gromeyer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include python/README.md COPYING python/pybind11/LICENSE python/pybind11/CMakeLists.txt CMakeLists.txt python/bindings.cpp 2 | graft python/pybind11/include 3 | graft python/pybind11/tools 4 | graft src 5 | graft include 6 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version:5.5 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "html2md", 8 | products: [ 9 | .library(name: "html2md", targets: ["html2md"]), 10 | ], 11 | targets: [ 12 | .target( 13 | name: "html2md", 14 | dependencies: ["html2md_cpp"], 15 | path: ".", 16 | sources: [ 17 | "objc/html2md_objc.mm", 18 | ], 19 | publicHeadersPath: "objc/include", 20 | cxxSettings: [ 21 | // header is inherited from html2md_cpp 22 | // we should compile this objc file with c++11 23 | .unsafeFlags(["-std=c++11"]), 24 | ] 25 | ), 26 | .target( 27 | name: "html2md_cpp", 28 | path: ".", 29 | sources: [ 30 | "src/html2md.cpp", 31 | "src/table.cpp", 32 | ], 33 | publicHeadersPath: "include", 34 | cxxSettings: [ 35 | .unsafeFlags(["-std=c++11"]), 36 | .unsafeFlags(["-Wno-parentheses", "-Wno-conversion"]), 37 | ] 38 | ), 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # html2md 2 | 3 | Transform your HTML into clean, easy-to-read markdown with html2md 4 | 5 | ## Table of Contents 6 | 7 | - [What does it do](#what-does-it-do) 8 | - [How to use this library](#how-to-use-this-library) 9 | - [Supported Tags](#supported-tags) 10 | - 
[Bindings](#bindings) 11 | - [Requirements](#requirements) 12 | - [License](#license) 13 | 14 | 15 | ## What does it do 16 | 17 | html2md is a fast and reliable C++ library for converting HTML content into markdown. It offers support for a wide range of HTML tags, including those for formatting text, creating lists, and inserting images and links. In addition, html2md is the only HTML to markdown converter that offers support for **table formatting**, making it a valuable tool for users who need to convert HTML tables into markdown. 18 | 19 | 20 | ## How to use this library 21 | 22 | ### CMake 23 | 24 | Install html2md. Use either the prebuilt packages from [GitHub releases](https://github.com/tim-gromeyer/html2md/releases) or build and install it yourself. 25 | 26 | Afterwards: 27 | 28 | ```cmake 29 | find_package(html2md) 30 | target_link_libraries(your_target PRIVATE html2md) 31 | ``` 32 | 33 | ### Manually 34 | 35 | To use html2md, follow these steps: 36 | 37 | 1. Clone the library: `git clone https://github.com/tim-gromeyer/html2md` 38 | 2. Add the files `include/html2md.h` and `src/html2md.cpp` to your project 39 | 3. Include the `html2md.h` header in your code 40 | 4. Use the `html2md::Convert` function to convert your HTML content into markdown 41 | 42 | Here is an example of how to use the `html2md::Convert` function: 43 | 44 | ```cpp 45 | #include <html2md.h> 46 | 47 | //... 48 | 49 | std::cout << html2md::Convert("<h1>foo</h1>"); // # foo 50 | ``` 51 | 52 | ## Supported Tags 53 | 54 | html2md supports the following HTML tags: 55 | 56 | | Tag | Description | Comment | 57 | |--------------|--------------------|-----------------------------------------------------| 58 | | `a` | Anchor or link | Supports the `href`, `name` and `title` attributes. | 59 | | `b` | Bold | | 60 | | `blockquote` | Indented paragraph | | 61 | | `br` | Line break | | 62 | | `cite` | Inline citation | Same as `i`. | 63 | | `code` | Code | | 64 | | `dd` | Definition data | | 65 | | `del` | Strikethrough | | 66 | | `dfn` | Definition | Same as `i`. | 67 | | `div` | Document division | | 68 | | `em` | Emphasized | Same as `i`. | 69 | | `h1` | Level 1 heading | | 70 | | `h2` | Level 2 heading | | 71 | | `h3` | Level 3 heading | | 72 | | `h4` | Level 4 heading | | 73 | | `h5` | Level 5 heading | | 74 | | `h6` | Level 6 heading | | 75 | | `head` | Document header | Ignored. | 76 | | `hr` | Horizontal line | | 77 | | `i` | Italic | | 78 | | `img` | Image | Supports `src`, `alt`, `title` attributes. | 79 | | `li` | List item | | 80 | | `meta` | Meta-information | Ignored. | 81 | | `ol` | Ordered list | | 82 | | `p` | Paragraph | | 83 | | `pre` | Preformatted text | Works only with `code`. | 84 | | `s` | Strikethrough | Same as `del`. | 85 | | `span` | Grouped elements | Does nothing. | 86 | | `strong` | Strong | Same as `b`. | 87 | | `table` | Table | Tables are formatted! | 88 | | `tbody` | Table body | Does nothing. | 89 | | `td` | Table data cell | Uses `align` from `th`. | 90 | | `tfoot` | Table footer | Does nothing. | 91 | | `th` | Table header cell | Supports the `align` attribute. | 92 | | `thead` | Table header | Does nothing. | 93 | | `title` | Document title | Same as `h1`. | 94 | | `tr` | Table row | | 95 | | `u` | Underlined | Uses HTML. | 96 | | `ul` | Unordered list | | 97 | 98 | ## Bindings 99 | 100 | - [Python](python/README.md) 101 | 102 | ## Requirements 103 | 104 | 1. 
A compiler with **c++11** support like *g++>=9* 105 | 106 | That's all! 107 | 108 | ## License 109 | 110 | html2md is licensed under [The MIT License (MIT)](https://opensource.org/licenses/MIT) 111 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | | Version | Supported | 6 | | -------- | ------------------ | 7 | | Latest | :white_check_mark: | 8 | | Other | :x: | 9 | 10 | ## Reporting a Vulnerability 11 | 12 | Create a new [issue](https://github.com/tim-gromeyer/html2md/issues/new/choose), tell me where the bug is and I'll try to fix it. 13 | Pull requests are welcome! 14 | -------------------------------------------------------------------------------- /cli/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "html2md.h" 7 | 8 | using std::cerr; 9 | using std::cin; 10 | using std::cout; 11 | using std::endl; 12 | using std::fstream; 13 | using std::ifstream; 14 | using std::ios; 15 | using std::string; 16 | using std::stringstream; 17 | 18 | namespace FileUtils { 19 | bool exists(const std::string &name) { 20 | ifstream f(name.c_str()); 21 | return f.good(); 22 | } 23 | 24 | string readAll(const string &file) { 25 | ifstream in(file); 26 | stringstream buffer; 27 | buffer << in.rdbuf(); 28 | 29 | if (in.bad()) { 30 | throw std::runtime_error("Error reading file: " + file); 31 | } 32 | 33 | return buffer.str(); 34 | } 35 | 36 | void writeFile(const string &file, const string &content) { 37 | fstream out(file, ios::out); 38 | if (!out.is_open()) { 39 | throw std::runtime_error("Error writing file: " + file); 40 | } 41 | 42 | out << content; 43 | out.close(); 44 | 45 | if (out.bad()) { 46 | throw std::runtime_error("Error writing file: " + file); 47 | } 48 | } 49 | } // 
namespace FileUtils 50 | 51 | constexpr const char *const DESCRIPTION = 52 | " [Options] files...\n\n" 53 | "Simple and fast HTML to Markdown converter with table support.\n\n" 54 | "Options:\n" 55 | " -h, --help\tDisplays this help information.\n" 56 | " -v, --version\tDisplay version information and exit.\n" 57 | " -o, --output\tSets the output file.\n" 58 | " -i, --input\tSets the input text.\n" 59 | " -p, --print\tPrint the generated Markdown.\n" 60 | " -r, --replace\tOverwrite the output file (if it already exists) without " 61 | "asking.\n"; 62 | 63 | struct Options { 64 | bool print = false; 65 | bool replace = false; 66 | string inputFile; 67 | string outputFile; 68 | string inputText; 69 | }; 70 | 71 | void printHelp(const string &programName) { 72 | cout << programName << DESCRIPTION; 73 | } 74 | 75 | void printVersion() { cout << "Version " << VERSION << endl; } 76 | 77 | bool confirmOverride(const string &fileName) { 78 | while (true) { 79 | cout << fileName << " already exists, override? 
[y/n] "; 80 | string override; 81 | getline(cin, override); 82 | 83 | if (override.empty()) { 84 | continue; 85 | } 86 | 87 | if (override == "y" || override == "Y") { 88 | return true; 89 | } else if (override == "n" || override == "N") { 90 | return false; 91 | } else { 92 | cout << "Invalid input" << endl; 93 | } 94 | } 95 | } 96 | 97 | Options parseCommandLine(int argc, char **argv) { 98 | Options options; 99 | 100 | if (argc == 1) { 101 | printHelp(argv[0]); 102 | exit(EXIT_SUCCESS); 103 | } 104 | 105 | for (int i = 1; i < argc; i++) { 106 | string arg = argv[i]; 107 | 108 | if (arg == "-h" || arg == "--help") { 109 | printHelp(argv[0]); 110 | exit(EXIT_SUCCESS); 111 | } else if (arg == "-v" || arg == "--version") { 112 | printVersion(); 113 | exit(EXIT_SUCCESS); 114 | } else if (arg == "-p" || arg == "--print") { 115 | options.print = true; 116 | } else if (arg == "-r" || arg == "--replace") { 117 | options.replace = true; 118 | } else if (arg == "-o" || arg == "--output") { 119 | if (i + 1 < argc) { 120 | options.outputFile = argv[i + 1]; 121 | i++; 122 | } else { 123 | cerr << "The" << arg << "option requires a file name!\n" << endl; 124 | exit(EXIT_FAILURE); 125 | } 126 | } else if (arg == "-i" || arg == "--input") { 127 | if (i + 1 < argc) { 128 | options.inputText = argv[i + 1]; 129 | i++; 130 | } else { 131 | cerr << "The" << arg << "option requires HTML text!" << endl; 132 | exit(EXIT_FAILURE); 133 | } 134 | } else if (options.inputFile.empty()) { 135 | options.inputFile = arg; 136 | } 137 | } 138 | 139 | return options; 140 | } 141 | 142 | int main(int argc, char **argv) { 143 | Options options = parseCommandLine(argc, argv); 144 | 145 | string input; 146 | if (!options.inputText.empty()) { 147 | input = options.inputText; 148 | } else if (!options.inputFile.empty() && 149 | FileUtils::exists(options.inputFile)) { 150 | input = FileUtils::readAll(options.inputFile); 151 | } else { 152 | cerr << "No valid input provided!" 
<< endl; 153 | return EXIT_FAILURE; 154 | } 155 | 156 | html2md::Converter converter(input); 157 | string md = converter.convert(); 158 | 159 | if (options.print) { 160 | cout << md << endl; 161 | } 162 | 163 | if (!options.outputFile.empty()) { 164 | if (FileUtils::exists(options.outputFile) && !options.replace) { 165 | if (confirmOverride(options.outputFile)) { 166 | FileUtils::writeFile(options.outputFile, md); 167 | cout << "Markdown written to " << options.outputFile << endl; 168 | } else { 169 | cout << "Markdown not written." << endl; 170 | } 171 | } else { 172 | FileUtils::writeFile(options.outputFile, md); 173 | cout << "Markdown written to " << options.outputFile << endl; 174 | } 175 | } 176 | 177 | return EXIT_SUCCESS; 178 | } 179 | -------------------------------------------------------------------------------- /cmake/Doc.cmake: -------------------------------------------------------------------------------- 1 | find_package(Doxygen) 2 | 3 | if(DOXYGEN_FOUND) 4 | add_custom_target( 5 | doc 6 | COMMAND echo "PROJECT_NUMBER = ${PROJECT_VERSION}" >> docs/Doxyfile && ${DOXYGEN_EXECUTABLE} docs/Doxyfile 7 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 8 | COMMENT "Generating API documentation using Doxygen" 9 | VERBATIM 10 | ) 11 | else() 12 | message(WARNING "Doxygen not found. 
# The documentation will not be created!") 13 | endif() 14 | -------------------------------------------------------------------------------- /cmake/Packaging.cmake: --------------------------------------------------------------------------------

# CPack configuration: sets the package metadata, picks a generator that fits
# the host platform / Linux distribution, and includes CPack.
include(InstallRequiredSystemLibraries)

set(CPACK_STRIP_FILES ON)
set(CPACK_PACKAGE_NAME ${PROJECT_NAME} )
set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION})
set(CPACK_PACKAGE_CONTACT "Tim Gromeyer")
set(CPACK_PACKAGE_VENDOR ${CPACK_PACKAGE_CONTACT})
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY ${PROJECT_DESCRIPTION})
set(CPACK_PACKAGE_DESCRIPTION "Simple and fast HTML to Markdown conversion library with table support, written in c++.")

set(CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/COPYING)
set(CPACK_RESOURCE_FILE_README ${PROJECT_SOURCE_DIR}/README.md)

# Speed it up!
set(CPACK_THREADS 0) # all

# Variables specific to CPack RPM generator
set(CPACK_RPM_PACKAGE_DESCRIPTION ${CPACK_PACKAGE_DESCRIPTION})
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
set(CPACK_RPM_PACKAGE_GROUP "Development/Tools")
set(CPACK_RPM_PACKAGE_URL ${PROJECT_HOMEPAGE_URL})
# The RedHat file name below expands CPACK_RPM_PACKAGE_RELEASE. It was never
# set, which produced names like "<name>-<version>-.el7.<arch>". Default it to
# CPack's documented default of 1 unless the caller already chose a value.
if(NOT DEFINED CPACK_RPM_PACKAGE_RELEASE)
    set(CPACK_RPM_PACKAGE_RELEASE 1)
endif()
# set(CPACK_RPM_PACKAGE_REQUIRES "/sbin/chkconfig, /bin/mktemp, /bin/rm, /bin/mv, libstdc++ >= 2.96") # TODO: Find correct packages

# Variables specific to CPack DEB generator
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION ${CPACK_PACKAGE_DESCRIPTION})
set(CPACK_DEBIAN_PACKAGE_SECTION "devel")
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE ${PROJECT_HOMEPAGE_URL})
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS YES)
set(CPACK_DEBIAN_PACKAGE_SUGGESTS "")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "")
# NOTE(review): the value ends in a blank; an "<email>" part was presumably
# lost when this copy was produced -- confirm against the repository.
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "${CPACK_PACKAGE_CONTACT} ")

if(WIN32)
    set(CPACK_GENERATOR "ZIP")

elseif(APPLE)
    set(CPACK_GENERATOR "ZIP")
    set(CPACK_SYSTEM_NAME "OSX")

# Was "EXMSCRIPTEN": that variable is never defined, so "NOT EXMSCRIPTEN" was
# always true and Emscripten builds entered the Linux branch.
elseif(UNIX AND NOT EMSCRIPTEN AND NOT ANDROID)
    # Determine distribution and release
    execute_process(COMMAND lsb_release -si OUTPUT_VARIABLE distribution OUTPUT_STRIP_TRAILING_WHITESPACE)
    execute_process(COMMAND lsb_release -sc OUTPUT_VARIABLE release OUTPUT_STRIP_TRAILING_WHITESPACE)
    execute_process(COMMAND uname -m OUTPUT_VARIABLE CPACK_RPM_PACKAGE_ARCHITECTURE OUTPUT_STRIP_TRAILING_WHITESPACE)

    # No codename reported: fall back to the release number.
    if(release STREQUAL "n/a")
        execute_process(COMMAND lsb_release -sr OUTPUT_VARIABLE release OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()

    if(distribution STREQUAL "Debian" OR distribution STREQUAL "Ubuntu" OR distribution STREQUAL "Linuxmint")
        set(CPACK_GENERATOR "DEB")
        execute_process(COMMAND dpkg --print-architecture OUTPUT_VARIABLE CPACK_DEBIAN_PACKAGE_ARCHITECTURE OUTPUT_STRIP_TRAILING_WHITESPACE)
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION}_${CPACK_DEBIAN_PACKAGE_ARCHITECTURE}_${distribution}+${release})

    elseif(distribution MATCHES "RedHat.*")
        # extract the major version from RedHat full version (e.g. 6.7 --> 6)
        execute_process(COMMAND lsb_release -sr COMMAND sed s/[.].*// OUTPUT_VARIABLE redhat_version_major OUTPUT_STRIP_TRAILING_WHITESPACE)
        set(CPACK_GENERATOR "RPM")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}.el${redhat_version_major}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    elseif(distribution MATCHES "openSUSE.*")
        set(CPACK_GENERATOR "RPM")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    elseif(distribution STREQUAL "Fedora")
        set(CPACK_GENERATOR "RPM")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}.fc${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    elseif(distribution STREQUAL "Scientific")
        set(CPACK_GENERATOR "RPM")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    else()
        set(CPACK_GENERATOR "STGZ")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})
    endif()

    # Always build a plain tarball next to the distribution-specific package.
    set(CPACK_GENERATOR "TGZ;${CPACK_GENERATOR}")
endif()

# Store the packages in a separate directory
set(CPACK_PACKAGE_DIRECTORY "${CMAKE_BINARY_DIR}/packages")
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${PROJECT_NAME})

include(CPack)
# -------------------------------------------------------------------------------- /docs/Doxyfile: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = "html2md" 2 | PROJECT_BRIEF = "Simple and fast HTML to Markdown converter" 3 | 4 | INPUT = include/ src/ docs/index.md CHANGELOG.md python/README.md 5 | USE_MDFILE_AS_MAINPAGE = docs/index.md 6 | 7 | RECURSIVE = YES 8 | ENABLE_PREPROCESSING = YES 9 | 10 | MARKDOWN_SUPPORT = YES 11 | HTML_OUTPUT = doc 12 | GENERATE_LATEX = NO 13 | 14 | GENERATE_TREEVIEW = YES
15 | DISABLE_INDEX = NO 16 | FULL_SIDEBAR = NO 17 | EXTRACT_ALL = YES 18 | TREEVIEW_WIDTH = 335 19 | 20 | HTML_HEADER = docs/doxygen-awesome-css/doxygen-custom/header.html 21 | 22 | HTML_EXTRA_STYLESHEET = docs/doxygen-awesome-css/doxygen-awesome.css \ 23 | docs/doxygen-awesome-css/doxygen-custom/custom.css \ 24 | docs/doxygen-awesome-css/doxygen-awesome-sidebar-only.css \ 25 | docs/doxygen-awesome-css/doxygen-awesome-sidebar-only-darkmode-toggle.css 26 | 27 | HTML_EXTRA_FILES = docs/doxygen-awesome-css/doxygen-awesome-darkmode-toggle.js \ 28 | docs/doxygen-awesome-css/doxygen-awesome-fragment-copy-button.js \ 29 | docs/doxygen-awesome-css/doxygen-awesome-paragraph-link.js \ 30 | docs/doxygen-awesome-css/doxygen-awesome-interactive-toc.js 31 | 32 | # Transparent background for graphs 33 | HAVE_DOT = YES 34 | DOT_IMAGE_FORMAT = svg 35 | DOT_TRANSPARENT = YES 36 | INTERACTIVE_SVG = YES 37 | 38 | # TOC 39 | TOC_EXPAND = YES 40 | TOC_INCLUDE_HEADINGS = 5 41 | 42 | # Fix dark mode not deactivatable(Doxygen v1.9.6) 43 | HTML_COLORSTYLE = TOGGLE 44 | 45 | # Tests 46 | SOURCE_BROWSER = YES 47 | SEARCHENGINE = YES 48 | 49 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/.gitignore: -------------------------------------------------------------------------------- 1 | docs/html 2 | .DS_Store 3 | .idea -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 jothepro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to 
whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-darkmode-toggle.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 | Copyright (c) 2021 - 2022 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | 28 | */ 29 | 30 |

/**
 * Custom element that toggles the page between light and dark mode.
 *
 * The chosen mode is applied by adding "dark-mode" / "light-mode" to the
 * class list of <html>. A choice that differs from the system preference is
 * remembered in localStorage under one of the two keys declared below.
 */
class DoxygenAwesomeDarkModeToggle extends HTMLElement {
    // SVG icons from https://fonts.google.com/icons
    // Licensed under the Apache 2.0 license:
    // https://www.apache.org/licenses/LICENSE-2.0.html
    // NOTE(review): both icon templates are empty in this copy; the SVG markup
    // was presumably lost -- confirm against upstream.
    static lightModeIcon = ``
    static darkModeIcon = ``
    static title = "Toggle Light/Dark Mode"

    // localStorage keys; each records a user choice that contradicts the
    // current system preference.
    static prefersLightModeInDarkModeKey = "prefers-light-mode-in-dark-mode"
    static prefersDarkModeInLightModeKey = "prefers-dark-mode-in-light-mode"

    // Immediately-invoked function used as a static initializer: it runs once
    // while the class is being defined, applies the stored preference and
    // registers the listeners below.
    static _staticConstructor = function() {
        DoxygenAwesomeDarkModeToggle.enableDarkMode(DoxygenAwesomeDarkModeToggle.userPreference)
        // Update the color scheme when the browsers preference changes
        // without user interaction on the website.
        window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => {
            DoxygenAwesomeDarkModeToggle.onSystemPreferenceChanged()
        })
        // Update the color scheme when the tab is made visible again.
        // It is possible that the appearance was changed in another tab
        // while this tab was in the background.
        document.addEventListener("visibilitychange", visibilityState => {
            if (document.visibilityState === 'visible') {
                DoxygenAwesomeDarkModeToggle.onSystemPreferenceChanged()
            }
        });
    }()

    /**
     * Creates the toggle element once the document is ready and appends it to
     * the parent of the element with id "MSearchBox".
     * Relies on a global `$` (presumably jQuery as shipped by Doxygen -- confirm).
     */
    static init() {
        $(function() {
            $(document).ready(function() {
                const toggleButton = document.createElement('doxygen-awesome-dark-mode-toggle')
                toggleButton.title = DoxygenAwesomeDarkModeToggle.title
                toggleButton.updateIcon()

                // Keep the icon in sync with the same two events the static
                // initializer reacts to.
                window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => {
                    toggleButton.updateIcon()
                })
                document.addEventListener("visibilitychange", visibilityState => {
                    if (document.visibilityState === 'visible') {
                        toggleButton.updateIcon()
                    }
                });

                $(document).ready(function(){
                    document.getElementById("MSearchBox").parentNode.appendChild(toggleButton)
                })
                // The button is appended again on every window resize.
                // NOTE(review): presumably because the search box is rebuilt
                // on resize -- confirm.
                $(window).resize(function(){
                    document.getElementById("MSearchBox").parentNode.appendChild(toggleButton)
                })
            })
        })
    }

    // Clicking the element toggles the mode.
    constructor() {
        super();
        this.onclick=this.toggleDarkMode
    }

    /**
     * @returns `true` for dark-mode, `false` for light-mode system preference
     */
    static get systemPreference() {
        return window.matchMedia('(prefers-color-scheme: dark)').matches
    }

    /**
     * @returns `true` for dark-mode, `false` for light-mode user preference
     *
     * Note: because of the `&&` / `||` chain the result is truthy/falsy rather
     * than a strict boolean (it can be the stored localStorage string or null).
     */
    static get userPreference() {
        return (!DoxygenAwesomeDarkModeToggle.systemPreference && localStorage.getItem(DoxygenAwesomeDarkModeToggle.prefersDarkModeInLightModeKey)) ||
        (DoxygenAwesomeDarkModeToggle.systemPreference && !localStorage.getItem(DoxygenAwesomeDarkModeToggle.prefersLightModeInDarkModeKey))
    }

    /**
     * Stores the user's choice and applies it.
     * A key is written only when the choice contradicts the system preference;
     * when the choice matches it, the key recording the opposite override is removed.
     */
    static set userPreference(userPreference) {
        DoxygenAwesomeDarkModeToggle.darkModeEnabled = userPreference
        if(!userPreference) {
            if(DoxygenAwesomeDarkModeToggle.systemPreference) {
                localStorage.setItem(DoxygenAwesomeDarkModeToggle.prefersLightModeInDarkModeKey, true)
            } else {
                localStorage.removeItem(DoxygenAwesomeDarkModeToggle.prefersDarkModeInLightModeKey)
            }
        } else {
            if(!DoxygenAwesomeDarkModeToggle.systemPreference) {
                localStorage.setItem(DoxygenAwesomeDarkModeToggle.prefersDarkModeInLightModeKey, true)
            } else {
                localStorage.removeItem(DoxygenAwesomeDarkModeToggle.prefersLightModeInDarkModeKey)
            }
        }
        DoxygenAwesomeDarkModeToggle.onUserPreferenceChanged()
    }

    /**
     * Records the mode in `darkModeEnabled` and swaps the "dark-mode" /
     * "light-mode" classes on the document element.
     */
    static enableDarkMode(enable) {
        if(enable) {
            DoxygenAwesomeDarkModeToggle.darkModeEnabled = true
            document.documentElement.classList.add("dark-mode")
            document.documentElement.classList.remove("light-mode")
        } else {
            DoxygenAwesomeDarkModeToggle.darkModeEnabled = false
            document.documentElement.classList.remove("dark-mode")
            document.documentElement.classList.add("light-mode")
        }
    }

    // Re-evaluates the effective preference and applies it.
    static onSystemPreferenceChanged() {
        DoxygenAwesomeDarkModeToggle.darkModeEnabled = DoxygenAwesomeDarkModeToggle.userPreference
        DoxygenAwesomeDarkModeToggle.enableDarkMode(DoxygenAwesomeDarkModeToggle.darkModeEnabled)
    }

    // Applies the value the userPreference setter just stored.
    static onUserPreferenceChanged() {
        DoxygenAwesomeDarkModeToggle.enableDarkMode(DoxygenAwesomeDarkModeToggle.darkModeEnabled)
    }

    // Click handler: flips the user preference and refreshes the icon.
    toggleDarkMode() {
        DoxygenAwesomeDarkModeToggle.userPreference = !DoxygenAwesomeDarkModeToggle.userPreference
        this.updateIcon()
    }

    // Shows the icon that matches the mode currently enabled.
    updateIcon() {
        if(DoxygenAwesomeDarkModeToggle.darkModeEnabled) {
            this.innerHTML = DoxygenAwesomeDarkModeToggle.darkModeIcon
        } else {
            this.innerHTML = DoxygenAwesomeDarkModeToggle.lightModeIcon
        }
    }
}

customElements.define("doxygen-awesome-dark-mode-toggle", DoxygenAwesomeDarkModeToggle);
-------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-fragment-copy-button.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 | Copyright (c) 2022 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 
// 27 | 28 | */ 29 | 30 |

/**
 * Custom element: a "copy to clipboard" button attached to code fragments.
 *
 * `init()` wraps every element with class "fragment" in a wrapper div and puts
 * a button right after it; clicking the button copies the fragment's text.
 */
class DoxygenAwesomeFragmentCopyButton extends HTMLElement {
    static title = "Copy to clipboard"
    // NOTE(review): both icon templates are empty in this copy; the SVG markup
    // was presumably lost -- confirm against upstream.
    static copyIcon = ``
    static successIcon = ``
    // Milliseconds the success state stays visible.
    static successDuration = 980

    constructor() {
        super();
        this.onclick = this.copyContent
    }

    /**
     * Adds a copy button to every ".fragment" element once the document is
     * ready. Does nothing when `navigator.clipboard` is unavailable.
     */
    static init() {
        $(function() {
            $(document).ready(function() {
                if(!navigator.clipboard) {
                    return
                }
                for(const codeBlock of document.getElementsByClassName("fragment")) {
                    const wrapper = document.createElement("div")
                    wrapper.className = "doxygen-awesome-fragment-wrapper"

                    const button = document.createElement("doxygen-awesome-fragment-copy-button")
                    button.innerHTML = DoxygenAwesomeFragmentCopyButton.copyIcon
                    button.title = DoxygenAwesomeFragmentCopyButton.title

                    // Put the wrapper where the fragment was, then move the
                    // fragment and the button into it (button last, so the
                    // fragment is the button's previous sibling).
                    codeBlock.parentNode.replaceChild(wrapper, codeBlock)
                    wrapper.appendChild(codeBlock)
                    wrapper.appendChild(button)
                }
            })
        })
    }

    /**
     * Click handler: copies the text of the preceding sibling (the fragment)
     * and shows the success icon for `successDuration` milliseconds.
     */
    copyContent() {
        const clone = this.previousSibling.cloneNode(true)
        // filter out line number from file listings
        for(const extra of clone.querySelectorAll(".lineno, .ttc")) {
            extra.remove()
        }
        // remove trailing newlines that appear in file listings
        const text = clone.textContent.replace(/\n+$/, "")
        navigator.clipboard.writeText(text);

        this.classList.add("success")
        this.innerHTML = DoxygenAwesomeFragmentCopyButton.successIcon
        const restore = () => {
            this.classList.remove("success")
            this.innerHTML = DoxygenAwesomeFragmentCopyButton.copyIcon
        }
        window.setTimeout(restore, DoxygenAwesomeFragmentCopyButton.successDuration);
    }
}

customElements.define("doxygen-awesome-fragment-copy-button", DoxygenAwesomeFragmentCopyButton)
// -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-interactive-toc.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 | Copyright (c) 2022 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE.
// 27 | 28 | */ 29 | 30 |

/**
 * Makes the page's table of contents (".contents > .toc") interactive:
 * the TOC entry of the section currently scrolled to gets the class "active",
 * entries above it get "aboveActive", and clicking the TOC heading opens or
 * closes the list.
 */
class DoxygenAwesomeInteractiveToc {
    // A header counts as reached once its top edge is above this many pixels.
    static topOffset = 38
    // When false, the TOC starts with the "open" class.
    static hideMobileMenu = true
    // One entry per TOC link: { node: <a> in the TOC, headerNode: target element or null }.
    static headers = []

    /**
     * On window "load": collects the TOC links, wires the click and scroll
     * handlers and highlights the initial entry. Does nothing if the page has
     * no TOC.
     */
    static init() {
        window.addEventListener("load", () => {
            let toc = document.querySelector(".contents > .toc")
            if(!toc) {
                return
            }
            toc.classList.add("interactive")
            if(!DoxygenAwesomeInteractiveToc.hideMobileMenu) {
                toc.classList.add("open")
            }
            document.querySelector(".contents > .toc > h3")?.addEventListener("click", () => {
                toc.classList.toggle("open")
            })

            document.querySelectorAll(".contents > .toc > ul a").forEach((node) => {
                // href is "#<id>"; a link without href yields "" and therefore no header.
                let id = (node.getAttribute("href") ?? "").substring(1)
                DoxygenAwesomeInteractiveToc.headers.push({
                    node: node,
                    headerNode: document.getElementById(id)
                })
            })

            // Registered once. Previously this sat inside the loop above, so
            // every scroll event ran update() once per TOC link.
            document.getElementById("doc-content")?.addEventListener("scroll", () => {
                DoxygenAwesomeInteractiveToc.update()
            })
            DoxygenAwesomeInteractiveToc.update()
        })
    }

    /**
     * Recomputes the "active" / "aboveActive" classes of all TOC links from the
     * current position of their headers. The first link is active until some
     * header has scrolled above `topOffset`. Links whose target element does
     * not exist are cleared and otherwise skipped (this used to throw).
     */
    static update() {
        let active = DoxygenAwesomeInteractiveToc.headers[0]?.node
        DoxygenAwesomeInteractiveToc.headers.forEach((header) => {
            header.node.classList.remove("active")
            header.node.classList.remove("aboveActive")
            if(!header.headerNode) {
                return
            }
            let position = header.headerNode.getBoundingClientRect().top
            if(position < DoxygenAwesomeInteractiveToc.topOffset) {
                active = header.node
                active?.classList.add("aboveActive")
            }
        })
        // The last header that was reached is the active one, not "above" it.
        active?.classList.add("active")
        active?.classList.remove("aboveActive")
    }
}
// -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-paragraph-link.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 |
Copyright (c) 2022 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 
// 27 | 28 | */ 29 | 30 |

/**
 * Appends a "Permanent Link" anchor next to every element matched by
 * ".contents a.anchor[id]" or ".contents .groupheader > a[id]".
 */
class DoxygenAwesomeParagraphLink {
    // Icon from https://fonts.google.com/icons
    // Licensed under the Apache 2.0 license:
    // https://www.apache.org/licenses/LICENSE-2.0.html
    // NOTE(review): the icon template is empty in this copy; the SVG markup
    // was presumably lost -- confirm against upstream.
    static icon = ``
    static title = "Permanent Link"

    /**
     * Once the document is ready, creates one link per matched anchor, pointing
     * at "#<id of the anchor>", and appends it to the anchor's parent element.
     * Relies on a global `$` (presumably jQuery as shipped by Doxygen -- confirm).
     */
    static init() {
        const selector = ".contents a.anchor[id], .contents .groupheader > a[id]"

        const addPermalink = (target) => {
            const link = document.createElement("a")
            link.setAttribute("href", "#" + target.getAttribute("id"))
            link.setAttribute("title", DoxygenAwesomeParagraphLink.title)
            link.classList.add("anchorlink")
            target.classList.add("anchor")
            link.innerHTML = DoxygenAwesomeParagraphLink.icon
            target.parentElement.appendChild(link)
        }

        $(function() {
            $(document).ready(function() {
                document.querySelectorAll(selector).forEach(addPermalink)
            })
        })
    }
}
// -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-sidebar-only-darkmode-toggle.css: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | 4 | Doxygen Awesome 5 | https://github.com/jothepro/doxygen-awesome-css 6 | 7 | MIT License 8 | 9 | Copyright (c) 2021 jothepro 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software.
20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 28 | 29 | */ 30 | 31 | @media screen and (min-width: 768px) { 32 | 33 | #MSearchBox { 34 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium)) - var(--searchbar-height) - 1px); 35 | } 36 | 37 | #MSearchField { 38 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium)) - 66px - var(--searchbar-height)); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-sidebar-only.css: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 | Copyright (c) 2021 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 
19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | 28 | */ 29 | 30 | html { 31 | /* side nav width. MUST be = `TREEVIEW_WIDTH`. 32 | * Make sure it is wide enough to contain the page title (logo + title + version) 33 | */ 34 | --side-nav-fixed-width: 335px; 35 | --menu-display: none; 36 | 37 | --top-height: 120px; 38 | --toc-sticky-top: -25px; 39 | --toc-max-height: calc(100vh - 2 * var(--spacing-medium) - 25px); 40 | } 41 | 42 | #projectname { 43 | white-space: nowrap; 44 | } 45 | 46 | 47 | @media screen and (min-width: 768px) { 48 | html { 49 | --searchbar-background: var(--page-background-color); 50 | } 51 | 52 | #side-nav { 53 | min-width: var(--side-nav-fixed-width); 54 | max-width: var(--side-nav-fixed-width); 55 | top: var(--top-height); 56 | overflow: visible; 57 | } 58 | 59 | #nav-tree, #side-nav { 60 | height: calc(100vh - var(--top-height)) !important; 61 | } 62 | 63 | #nav-tree { 64 | padding: 0; 65 | } 66 | 67 | #top { 68 | display: block; 69 | border-bottom: none; 70 | height: var(--top-height); 71 | margin-bottom: calc(0px - var(--top-height)); 72 | max-width: var(--side-nav-fixed-width); 73 | overflow: hidden; 74 | background: var(--side-nav-background); 75 | } 76 | #main-nav { 77 | float: left; 78 | padding-right: 0; 79 | } 80 | 81 | .ui-resizable-handle { 82 | cursor: default; 83 | width: 1px !important; 84 | box-shadow: 0 calc(-2 * var(--top-height)) 0 0 var(--separator-color); 85 | } 86 | 87 | #nav-path { 88 | position: fixed; 89 | right: 0; 90 | left: var(--side-nav-fixed-width); 91 | bottom: 0; 92 
| width: auto; 93 | } 94 | 95 | #doc-content { 96 | height: calc(100vh - 31px) !important; 97 | padding-bottom: calc(3 * var(--spacing-large)); 98 | padding-top: calc(var(--top-height) - 80px); 99 | box-sizing: border-box; 100 | margin-left: var(--side-nav-fixed-width) !important; 101 | } 102 | 103 | #MSearchBox { 104 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium))); 105 | } 106 | 107 | #MSearchField { 108 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium)) - 65px); 109 | } 110 | 111 | #MSearchResultsWindow { 112 | left: var(--spacing-medium) !important; 113 | right: auto; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-custom/custom.css: -------------------------------------------------------------------------------- 1 | .github-corner svg { 2 | fill: var(--primary-light-color); 3 | color: var(--page-background-color); 4 | width: 72px; 5 | height: 72px; 6 | } 7 | 8 | @media screen and (max-width: 767px) { 9 | .github-corner svg { 10 | width: 50px; 11 | height: 50px; 12 | } 13 | #projectnumber { 14 | margin-right: 22px; 15 | } 16 | } 17 | 18 | .alter-theme-button { 19 | display: inline-block; 20 | cursor: pointer; 21 | background: var(--primary-color); 22 | color: var(--page-background-color) !important; 23 | border-radius: var(--border-radius-medium); 24 | padding: var(--spacing-small) var(--spacing-medium); 25 | text-decoration: none; 26 | } 27 | 28 | .next_section_button { 29 | display: block; 30 | padding: var(--spacing-large) 0 var(--spacing-small) 0; 31 | color: var(--page-background-color); 32 | user-select: none; 33 | } 34 | 35 | .next_section_button::after { 36 | /* clearfix */ 37 | content: ""; 38 | clear: both; 39 | display: table; 40 | } 41 | 42 | .next_section_button a { 43 | overflow: hidden; 44 | float: right; 45 | border: 1px solid var(--separator-color); 46 | padding: var(--spacing-medium) 
calc(var(--spacing-large) / 2) var(--spacing-medium) var(--spacing-large); 47 | border-radius: var(--border-radius-medium); 48 | color: var(--page-secondary-foreground-color) !important; 49 | text-decoration: none; 50 | background-color: var(--page-background-color); 51 | transition: color .08s ease-in-out, background-color .1s ease-in-out; 52 | } 53 | 54 | .next_section_button a:hover { 55 | color: var(--page-foreground-color) !important; 56 | background-color: var(--odd-color); 57 | } 58 | 59 | .next_section_button a::after { 60 | content: '〉'; 61 | color: var(--page-secondary-foreground-color) !important; 62 | padding-left: var(--spacing-large); 63 | display: inline-block; 64 | transition: color .08s ease-in-out, transform .09s ease-in-out; 65 | } 66 | 67 | .next_section_button a:hover::after { 68 | color: var(--page-foreground-color) !important; 69 | transform: translateX(3px); 70 | } 71 | 72 | .alter-theme-button:hover { 73 | background: var(--primary-dark-color); 74 | } 75 | 76 | html.dark-mode .darkmode_inverted_image img, /* < doxygen 1.9.3 */ 77 | html.dark-mode .darkmode_inverted_image object[type="image/svg+xml"] /* doxygen 1.9.3 */ { 78 | filter: brightness(87%) hue-rotate(180deg) invert(); 79 | } 80 | 81 | .bordered_image { 82 | border-radius: var(--border-radius-small); 83 | border: 1px solid var(--separator-color); 84 | display: inline-block; 85 | overflow: hidden; 86 | } 87 | 88 | html.dark-mode .bordered_image img, /* < doxygen 1.9.3 */ 89 | html.dark-mode .bordered_image object[type="image/svg+xml"] /* doxygen 1.9.3 */ { 90 | border-radius: var(--border-radius-small); 91 | } 92 | 93 | .title_screenshot { 94 | filter: drop-shadow(0px 3px 10px rgba(0,0,0,0.22)); 95 | max-width: 500px; 96 | margin: var(--spacing-large) 0; 97 | } 98 | 99 | .title_screenshot .caption { 100 | display: none; 101 | } -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-custom/header.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | $title 17 | $title 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 32 | 51 | $treeview 52 | $search 53 | $mathjax 54 | 55 | $extrastylesheet 56 | 57 | 58 | 59 | 60 | 61 | 63 | 64 | 65 |
66 | 67 | 68 |
69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 82 | 83 | 84 | 85 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 |
77 |
$projectname 78 |  $projectnumber 79 |
80 |
$projectbrief
81 |
86 |
$projectbrief
87 |
$searchbox
98 |
99 | 100 | 101 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # html2md 2 | 3 | [TOC] 4 | 5 | ## What does it do 6 | 7 | html2md is a fast and reliable C++ library for converting HTML content into markdown. It offers support for a wide range of HTML tags, including those for formatting text, creating lists, and inserting images and links. In addition, html2md is the only HTML to markdown converter that offers support for table formatting, making it a valuable tool for users who need to convert HTML tables into markdown. 8 | 9 | ## How to use this library 10 | 11 | ### CMake 12 | 13 | Install html2md. Either use the pre-built packages found on [GitHub releases](https://github.com/tim-gromeyer/html2md/releases) or build and install it yourself. 14 | 15 | 16 | Afterwards: 17 | 18 | ```cmake 19 | find_package(html2md) 20 | target_link_libraries(your_target PRIVATE html2md) 21 | ``` 22 | 23 | ### Manually 24 | 25 | To use html2md, follow these steps: 26 | 27 | 1. Clone the library: `git clone https://github.com/tim-gromeyer/html2md` 28 | 2. Add the files `include/html2md.h` and `src/html2md.cpp` to your project 29 | 3. Include the `html2md.h` header in your code 30 | 4. Use the `html2md::Convert` function to convert your HTML content into markdown 31 | 32 | Here is an example of how to use the `html2md::Convert` function: 33 | 34 | ```cpp 35 | #include <html2md.h> 36 | 37 | //... 38 | 39 | std::cout << html2md::Convert("

foo

"); // # foo 40 | ``` 41 | 42 | ## Supported Tags 43 | 44 | html2md supports the following HTML tags: 45 | 46 | 47 | | Tag | Description | Comment | 48 | | ------------ | ------------------ | ------------------------------------------ | 49 | | `a` | Anchor or link | Supports the `href` and `name` attributes. | 50 | | `b` | Bold | | 51 | | `blockquote` | Indented paragraph | | 52 | | `br` | Line break | | 53 | | `cite` | Inline citation | Same as `i`. | 54 | | `code` | Code | | 55 | | `dd` | Definition data | | 56 | | `del` | Strikethrough | | 57 | | `dfn` | Definition | Same as `i`. | 58 | | `div` | Document division | | 59 | | `em` | Emphasized | Same as `i`. | 60 | | `h1` | Level 1 heading | | 61 | | `h2` | Level 2 heading | | 62 | | `h3` | Level 3 heading | | 63 | | `h4` | Level 4 heading | | 64 | | `h5` | Level 5 heading | | 65 | | `h6` | Level 6 heading | | 66 | | `head` | Document header | Ignored. | 67 | | `hr` | Horizontal line | | 68 | | `i` | Italic | | 69 | | `img` | Image | Supports the `src` and `alt` attributes. | 70 | | `li` | List item | | 71 | | `meta` | Meta-information | Ignored. | 72 | | `ol` | Ordered list | Don't use other lists in this list. | 73 | | `p` | Paragraph | | 74 | | `pre` | Preformatted text | Works only with `code`. | 75 | | `s` | Strikethrough | Same as `del`. | 76 | | `span` | Grouped elements | | 77 | | `strong` | Strong | Same as `b`. | 78 | | `table` | Table | | 79 | | `td` | Table data cell | Uses `align` from `th`. | 80 | | `th` | Table header cell | Supports the `align` attribute. | 81 | | `title` | Document title | Same as `h1`. | 82 | | `tr` | Table row | | 83 | | `u` | Underlined | Uses HTML. | 84 | | `ul` | Unordered list | | 85 | 86 | ## Bindings 87 | 88 | - [Python](../python/README.md) 89 | 90 | ## Requirements 91 | 92 | 1. A compiler with **c++11** support like *g++>=9* 93 | 94 | That's all! 
95 | 96 | ## License 97 | 98 | html2md is licensed under [The MIT License (MIT)](https://opensource.org/licenses/MIT) 99 | -------------------------------------------------------------------------------- /html2md.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=@CMAKE_INSTALL_PREFIX@ 3 | libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ 4 | includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@/html2md/ 5 | 6 | Name: @PROJECT_NAME@ 7 | Description: @PROJECT_DESCRIPTION@ 8 | Version: @PROJECT_VERSION@ 9 | 10 | Requires: 11 | Libs: -L${libdir} -lhtml2md 12 | Cflags: -I${includedir} 13 | -------------------------------------------------------------------------------- /html2mdConfig.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") 4 | 5 | set(html2md_FOUND TRUE) 6 | set(HTML2MD_FOUND TRUE) 7 | 8 | -------------------------------------------------------------------------------- /include/html2md.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tim Gromeyer 2 | // Licensed under the MIT License - https://opensource.org/licenses/MIT 3 | 4 | #ifndef HTML2MD_H 5 | #define HTML2MD_H 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /*! 13 | * \brief html2md namespace 14 | * 15 | * The html2md namespace provides: 16 | * 1. The Converter class 17 | * 2. Static wrapper around Converter class 18 | * 19 | * \note Do NOT try to convert HTML that contains a list in an ordered list or a 20 | * `blockquote` in a list!\n This will be a **total** mess! 21 | */ 22 | namespace html2md { 23 | 24 | /*! 25 | * \brief Options for the conversion from HTML to Markdown 26 | * \warning Make sure to pass valid options; otherwise, the output will be 27 | * invalid! 
28 | * 29 | * Example from `tests/main.cpp`: 30 | * 31 | * ```cpp 32 | * auto *options = new html2md::Options(); 33 | * options->splitLines = false; 34 | * 35 | * html2md::Converter c(html, options); 36 | * auto md = c.convert(); 37 | * ``` 38 | */ 39 | struct Options { 40 | /*! 41 | * \brief Add new line when a certain number of characters is reached 42 | * 43 | * \see softBreak 44 | * \see hardBreak 45 | */ 46 | bool splitLines = true; 47 | 48 | /*! 49 | * \brief softBreak Wrap after ... characters when the next space is reached 50 | * and as long as it's not in a list, table, image or anchor (link). 51 | */ 52 | int softBreak = 80; 53 | 54 | /*! 55 | * \brief hardBreak Force a break after ... characters in a line 56 | */ 57 | int hardBreak = 100; 58 | 59 | /*! 60 | * \brief The char used for unordered lists 61 | * 62 | * Valid: 63 | * - `-` 64 | * - `+` 65 | * - `*` 66 | * 67 | * Example: 68 | * 69 | * ```markdown 70 | * - List 71 | * + Also a list 72 | * * And this too 73 | * ``` 74 | */ 75 | char unorderedList = '-'; 76 | 77 | /*! 78 | * \brief The char used after the number of the item 79 | * 80 | * Valid: 81 | * - `.` 82 | * - `)` 83 | * 84 | * Example: 85 | * 86 | * ```markdown 87 | * 1. Hello 88 | * 2) World! 89 | * ``` 90 | */ 91 | char orderedList = '.'; 92 | 93 | /*! 94 | * \brief Whether title is added as h1 heading at the very beginning of the 95 | * markdown 96 | * 97 | * Whether title is added as h1 heading at the very beginning of the markdown. 98 | * Default is true. 99 | */ 100 | bool includeTitle = true; 101 | 102 | /*! 103 | * \brief Whether to format Markdown Tables 104 | * 105 | * Whether to format Markdown Tables. 106 | * Default is true. 
107 | */ 108 | bool formatTable = true; 109 | /*! \brief Field-wise equality; true only when every option, including formatTable, matches. */ 110 | inline bool operator==(html2md::Options o) const { 111 | return splitLines == o.splitLines && unorderedList == o.unorderedList && 112 | orderedList == o.orderedList && includeTitle == o.includeTitle && 113 | softBreak == o.softBreak && hardBreak == o.hardBreak && formatTable == o.formatTable; 114 | }; 115 | }; 116 | 117 | /*! 118 | * \brief Class for converting HTML to Markdown 119 | * 120 | * This class converts HTML to Markdown. 121 | * There is also a static wrapper for this class (see html2md::Convert). 122 | * 123 | * ## Usage example 124 | * 125 | * Option 1: Use the class: 126 | * 127 | * ```cpp 128 | * std::string html = "

example

"; 129 | * html2md::Converter c(html); 130 | * auto md = c.convert(); 131 | * 132 | * if (!c.ok()) std::cout << "There was something wrong in the HTML\n"; 133 | * std::cout << md; // # example 134 | * ``` 135 | * 136 | * Option 2: Use the static wrapper: 137 | * 138 | * ```cpp 139 | * std::string html = "

example

"; 140 | * 141 | * auto md = html2md::Convert(html); 142 | * std::cout << md; 143 | * ``` 144 | * 145 | * Advanced: use Options: 146 | * 147 | * ```cpp 148 | * std::string html = "

example

"; 149 | * 150 | * auto *options = new html2md::Options(); 151 | * options->splitLines = false; 152 | * options->unorderedList = '*'; 153 | * 154 | * html2md::Converter c(html, options); 155 | * auto md = c.convert(); 156 | * if (!c.ok()) std::cout << "There was something wrong in the HTML\n"; 157 | * std::cout << md; // # example 158 | * ``` 159 | */ 160 | class Converter { 161 | public: 162 | /*! 163 | * \brief Standard initializer, takes HTML as parameter. Also prepares 164 | * everything. \param html The HTML as std::string. \param options Options for 165 | * the Conversation. See html2md::Options() for more. 166 | * 167 | * \note Don't pass anything else than HTML, otherwise the output will be a 168 | * **mess**! 169 | * 170 | * This is the default initializer.
171 | * You can use appendToMd() to append something to the beginning of the 172 | * generated output. 173 | */ 174 | explicit inline Converter(const std::string &html, 175 | struct Options *options = nullptr) { 176 | *this = Converter(&html, options); 177 | } 178 | 179 | /*! 180 | * \brief Convert HTML into Markdown. 181 | * \return Returns the converted Markdown. 182 | * 183 | * This function actually converts the HTML into Markdown. 184 | * It also cleans up the Markdown so you don't have to do anything. 185 | */ 186 | [[nodiscard]] std::string convert(); 187 | 188 | /*! 189 | * \brief Append a char to the Markdown. 190 | * \param ch The char to append. 191 | * \return Returns a copy of the instance with the char appended. 192 | */ 193 | Converter *appendToMd(char ch); 194 | 195 | /*! 196 | * \brief Append a char* to the Markdown. 197 | * \param str The char* to append. 198 | * \return Returns a copy of the instance with the char* appended. 199 | */ 200 | Converter *appendToMd(const char *str); 201 | 202 | /*! 203 | * \brief Append a string to the Markdown. 204 | * \param s The string to append. 205 | * \return Returns a copy of the instance with the string appended. 206 | */ 207 | inline Converter *appendToMd(const std::string &s) { 208 | return appendToMd(s.c_str()); 209 | } 210 | 211 | /*! 212 | * \brief Appends a ' ' in certain cases. 213 | * \return Copy of the instance with(maybe) the appended space. 214 | * 215 | * This function appends ' ' if: 216 | * - md does not end with `*` 217 | * - md does not end with `\n` aka newline 218 | */ 219 | Converter *appendBlank(); 220 | 221 | /*! 222 | * \brief Add an HTML symbol conversion 223 | * \param htmlSymbol The HTML symbol to convert 224 | * \param replacement The replacement string 225 | * \note This is useful for converting HTML entities to their Markdown 226 | * equivalents. For example, you can add a conversion for " " to 227 | * " " (space) or "<" to "<" (less than). 
228 | * \note This is not a standard feature of the Converter class, but it can 229 | * be added to the class to allow for more flexibility in the conversion 230 | * process. You can use this feature to add custom conversions for any HTML 231 | * symbol that you want to convert to a specific Markdown representation. 232 | */ 233 | void addHtmlSymbolConversion(const std::string &htmlSymbol, 234 | const std::string &replacement) { 235 | htmlSymbolConversions_[htmlSymbol] = replacement; 236 | } 237 | 238 | /*! 239 | * \brief Remove an HTML symbol conversion 240 | * \param htmlSymbol The HTML symbol to remove 241 | * \note This is useful for removing custom conversions that you have added 242 | * previously. 243 | */ 244 | void removeHtmlSymbolConversion(const std::string &htmlSymbol) { 245 | htmlSymbolConversions_.erase(htmlSymbol); 246 | } 247 | 248 | /*! 249 | * \brief Clear all HTML symbol conversions 250 | * \note This is useful for clearing the conversion map (it's empty afterwards). 251 | */ 252 | void clearHtmlSymbolConversions() { htmlSymbolConversions_.clear(); } 253 | 254 | /*! 255 | * \brief Checks if everything was closed properly(in the HTML). 256 | * \return Returns false if there is a unclosed tag. 257 | * \note As long as you have not called convert(), it always returns true. 258 | */ 259 | [[nodiscard]] bool ok() const; 260 | 261 | /*! 262 | * \brief Reset the generated Markdown 263 | */ 264 | void reset(); 265 | 266 | /*! 267 | * \brief Checks if the HTML matches and the options are the same. 268 | * \param The Converter object to compare with 269 | * \return true if the HTML and options matches otherwise false 270 | */ 271 | inline bool operator==(const Converter *c) const { return *this == *c; } 272 | 273 | inline bool operator==(const Converter &c) const { 274 | return html_ == c.html_ && option == c.option; 275 | } 276 | 277 | /*! 278 | * \brief Returns ok(). 
279 | */ 280 | inline explicit operator bool() const { return ok(); }; 281 | 282 | private: 283 | // Attributes 284 | static constexpr const char *kAttributeHref = "href"; 285 | static constexpr const char *kAttributeAlt = "alt"; 286 | static constexpr const char *kAttributeTitle = "title"; 287 | static constexpr const char *kAttributeClass = "class"; 288 | static constexpr const char *kAttributeSrc = "src"; 289 | static constexpr const char *kAttrinuteAlign = "align"; 290 | 291 | static constexpr const char *kTagAnchor = "a"; 292 | static constexpr const char *kTagBreak = "br"; 293 | static constexpr const char *kTagCode = "code"; 294 | static constexpr const char *kTagDiv = "div"; 295 | static constexpr const char *kTagHead = "head"; 296 | static constexpr const char *kTagLink = "link"; 297 | static constexpr const char *kTagListItem = "li"; 298 | static constexpr const char *kTagMeta = "meta"; 299 | static constexpr const char *kTagNav = "nav"; 300 | static constexpr const char *kTagNoScript = "noscript"; 301 | static constexpr const char *kTagOption = "option"; 302 | static constexpr const char *kTagOrderedList = "ol"; 303 | static constexpr const char *kTagParagraph = "p"; 304 | static constexpr const char *kTagPre = "pre"; 305 | static constexpr const char *kTagScript = "script"; 306 | static constexpr const char *kTagSpan = "span"; 307 | static constexpr const char *kTagStyle = "style"; 308 | static constexpr const char *kTagTemplate = "template"; 309 | static constexpr const char *kTagTitle = "title"; 310 | static constexpr const char *kTagUnorderedList = "ul"; 311 | static constexpr const char *kTagImg = "img"; 312 | static constexpr const char *kTagSeperator = "hr"; 313 | 314 | // Text format 315 | static constexpr const char *kTagBold = "b"; 316 | static constexpr const char *kTagStrong = "strong"; 317 | static constexpr const char *kTagItalic = "em"; 318 | static constexpr const char *kTagItalic2 = "i"; 319 | static constexpr const char *kTagCitation = 
"cite"; 320 | static constexpr const char *kTagDefinition = "dfn"; 321 | static constexpr const char *kTagUnderline = "u"; 322 | static constexpr const char *kTagStrighthrought = "del"; 323 | static constexpr const char *kTagStrighthrought2 = "s"; 324 | 325 | static constexpr const char *kTagBlockquote = "blockquote"; 326 | 327 | // Header 328 | static constexpr const char *kTagHeader1 = "h1"; 329 | static constexpr const char *kTagHeader2 = "h2"; 330 | static constexpr const char *kTagHeader3 = "h3"; 331 | static constexpr const char *kTagHeader4 = "h4"; 332 | static constexpr const char *kTagHeader5 = "h5"; 333 | static constexpr const char *kTagHeader6 = "h6"; 334 | 335 | // Table 336 | static constexpr const char *kTagTable = "table"; 337 | static constexpr const char *kTagTableRow = "tr"; 338 | static constexpr const char *kTagTableHeader = "th"; 339 | static constexpr const char *kTagTableData = "td"; 340 | 341 | size_t index_ch_in_html_ = 0; 342 | 343 | bool is_closing_tag_ = false; 344 | bool is_in_attribute_value_ = false; 345 | bool is_in_code_ = false; 346 | bool is_in_list_ = false; 347 | bool is_in_p_ = false; 348 | bool is_in_pre_ = false; 349 | bool is_in_table_ = false; 350 | bool is_in_table_row_ = false; 351 | bool is_in_tag_ = false; 352 | bool is_self_closing_tag_ = false; 353 | 354 | // relevant for
  • only, false = is in unordered list 355 | bool is_in_ordered_list_ = false; 356 | uint8_t index_ol = 0; 357 | 358 | // store the table start 359 | size_t table_start = 0; 360 | 361 | // number of lists 362 | uint8_t index_li = 0; 363 | 364 | uint8_t index_blockquote = 0; 365 | 366 | char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0; 367 | char prev_ch_in_html_ = 'x'; 368 | 369 | std::string html_; 370 | 371 | uint16_t offset_lt_ = 0; 372 | std::string current_tag_; 373 | std::string prev_tag_; 374 | 375 | // Line which separates header from data 376 | std::string tableLine; 377 | 378 | size_t chars_in_curr_line_ = 0; 379 | 380 | std::string md_; 381 | 382 | Options option; 383 | 384 | std::unordered_map htmlSymbolConversions_ = { 385 | {""", "\""}, {"<", "<"}, {">", ">"}, 386 | {"&", "&"}, {" ", " "}, {"→", "→"}}; 387 | 388 | // Tag: base class for tag types 389 | struct Tag { 390 | virtual void OnHasLeftOpeningTag(Converter *c) = 0; 391 | virtual void OnHasLeftClosingTag(Converter *c) = 0; 392 | }; 393 | 394 | // Tag types 395 | 396 | // tags that are not printed (nav, script, noscript, ...) 
397 | struct TagIgnored : Tag { 398 | void OnHasLeftOpeningTag(Converter *c) override {}; 399 | void OnHasLeftClosingTag(Converter *c) override {}; 400 | }; 401 | 402 | struct TagAnchor : Tag { 403 | void OnHasLeftOpeningTag(Converter *c) override; 404 | void OnHasLeftClosingTag(Converter *c) override; 405 | 406 | std::string current_href_; 407 | std::string current_title_; 408 | }; 409 | 410 | struct TagBold : Tag { 411 | void OnHasLeftOpeningTag(Converter *c) override; 412 | void OnHasLeftClosingTag(Converter *c) override; 413 | }; 414 | 415 | struct TagItalic : Tag { 416 | void OnHasLeftOpeningTag(Converter *c) override; 417 | void OnHasLeftClosingTag(Converter *c) override; 418 | }; 419 | 420 | struct TagUnderline : Tag { 421 | void OnHasLeftOpeningTag(Converter *c) override; 422 | void OnHasLeftClosingTag(Converter *c) override; 423 | }; 424 | 425 | struct TagStrikethrought : Tag { 426 | void OnHasLeftOpeningTag(Converter *c) override; 427 | void OnHasLeftClosingTag(Converter *c) override; 428 | }; 429 | 430 | struct TagBreak : Tag { 431 | void OnHasLeftOpeningTag(Converter *c) override; 432 | void OnHasLeftClosingTag(Converter *c) override; 433 | }; 434 | 435 | struct TagDiv : Tag { 436 | void OnHasLeftOpeningTag(Converter *c) override; 437 | void OnHasLeftClosingTag(Converter *c) override; 438 | }; 439 | 440 | struct TagHeader1 : Tag { 441 | void OnHasLeftOpeningTag(Converter *c) override; 442 | void OnHasLeftClosingTag(Converter *c) override; 443 | }; 444 | 445 | struct TagHeader2 : Tag { 446 | void OnHasLeftOpeningTag(Converter *c) override; 447 | void OnHasLeftClosingTag(Converter *c) override; 448 | }; 449 | 450 | struct TagHeader3 : Tag { 451 | void OnHasLeftOpeningTag(Converter *c) override; 452 | void OnHasLeftClosingTag(Converter *c) override; 453 | }; 454 | 455 | struct TagHeader4 : Tag { 456 | void OnHasLeftOpeningTag(Converter *c) override; 457 | void OnHasLeftClosingTag(Converter *c) override; 458 | }; 459 | 460 | struct TagHeader5 : Tag { 461 | 
void OnHasLeftOpeningTag(Converter *c) override; 462 | void OnHasLeftClosingTag(Converter *c) override; 463 | }; 464 | 465 | struct TagHeader6 : Tag { 466 | void OnHasLeftOpeningTag(Converter *c) override; 467 | void OnHasLeftClosingTag(Converter *c) override; 468 | }; 469 | 470 | struct TagListItem : Tag { 471 | void OnHasLeftOpeningTag(Converter *c) override; 472 | void OnHasLeftClosingTag(Converter *c) override; 473 | }; 474 | 475 | struct TagOption : Tag { 476 | void OnHasLeftOpeningTag(Converter *c) override; 477 | void OnHasLeftClosingTag(Converter *c) override; 478 | }; 479 | 480 | struct TagOrderedList : Tag { 481 | void OnHasLeftOpeningTag(Converter *c) override; 482 | void OnHasLeftClosingTag(Converter *c) override; 483 | }; 484 | 485 | struct TagParagraph : Tag { 486 | void OnHasLeftOpeningTag(Converter *c) override; 487 | void OnHasLeftClosingTag(Converter *c) override; 488 | }; 489 | 490 | struct TagPre : Tag { 491 | void OnHasLeftOpeningTag(Converter *c) override; 492 | void OnHasLeftClosingTag(Converter *c) override; 493 | }; 494 | 495 | struct TagCode : Tag { 496 | void OnHasLeftOpeningTag(Converter *c) override; 497 | void OnHasLeftClosingTag(Converter *c) override; 498 | }; 499 | 500 | struct TagSpan : Tag { 501 | void OnHasLeftOpeningTag(Converter *c) override; 502 | void OnHasLeftClosingTag(Converter *c) override; 503 | }; 504 | 505 | struct TagTitle : Tag { 506 | void OnHasLeftOpeningTag(Converter *c) override; 507 | void OnHasLeftClosingTag(Converter *c) override; 508 | }; 509 | 510 | struct TagUnorderedList : Tag { 511 | void OnHasLeftOpeningTag(Converter *c) override; 512 | void OnHasLeftClosingTag(Converter *c) override; 513 | }; 514 | 515 | struct TagImage : Tag { 516 | void OnHasLeftOpeningTag(Converter *c) override; 517 | void OnHasLeftClosingTag(Converter *c) override; 518 | }; 519 | 520 | struct TagSeperator : Tag { 521 | void OnHasLeftOpeningTag(Converter *c) override; 522 | void OnHasLeftClosingTag(Converter *c) override; 523 | }; 
// Handlers for table markup. Like the other handlers they override the two
// Tag callbacks and are only declared here.
struct TagTable : Tag {
  void OnHasLeftOpeningTag(Converter *c) override;
  void OnHasLeftClosingTag(Converter *c) override;
};

// Table row handler.
struct TagTableRow : Tag {
  void OnHasLeftOpeningTag(Converter *c) override;
  void OnHasLeftClosingTag(Converter *c) override;
};

// Table header cell handler.
struct TagTableHeader : Tag {
  void OnHasLeftOpeningTag(Converter *c) override;
  void OnHasLeftClosingTag(Converter *c) override;
};

// Table data cell handler.
struct TagTableData : Tag {
  void OnHasLeftOpeningTag(Converter *c) override;
  void OnHasLeftClosingTag(Converter *c) override;
};

// Blockquote handler.
struct TagBlockquote : Tag {
  void OnHasLeftOpeningTag(Converter *c) override;
  void OnHasLeftClosingTag(Converter *c) override;
};
/**
 * Check whether the raw text of a tag contains a marker that hides the
 * element from view.
 *
 * This is a plain, case-sensitive substring search over the tag text; no
 * attribute parsing is performed.
 *
 * @param tag pointer to the tag text to inspect (must not be null)
 * @return true if any of the known "hidden" markers occurs in the tag
 */
inline static bool TagContainsAttributesToHide(std::string *tag) {
  using std::string;

  const char *const hidden_markers[] = {
      // The ARIA state is spelled aria-hidden="true"; the check below for
      // aria="hidden" can never match real markup and is kept only for
      // backward compatibility.
      " aria-hidden=\"true\"",
      " aria=\"hidden\"",
      "display:none",
      "visibility:hidden",
      "opacity:0",
      "Details-content--hidden-not-important",
  };

  for (const char *marker : hidden_markers) {
    if (tag->find(marker) != string::npos)
      return true;
  }

  return false;
}
630 | * \brief Static wrapper around the Converter class 631 | * \param html The HTML passed to Converter 632 | * \param ok Optional: Pass a reference to a local bool to store the output of 633 | * Converter::ok() \return Returns the by Converter generated Markdown 634 | */ 635 | inline std::string Convert(const std::string &html, bool *ok = nullptr) { 636 | Converter c(html); 637 | auto md = c.convert(); 638 | if (ok != nullptr) 639 | *ok = c.ok(); 640 | return md; 641 | } 642 | 643 | #ifndef PYTHON_BINDINGS 644 | inline std::string Convert(const std::string &&html, bool *ok = nullptr) { 645 | return Convert(html, ok); 646 | } 647 | #endif 648 | 649 | } // namespace html2md 650 | 651 | #endif // HTML2MD_H 652 | -------------------------------------------------------------------------------- /include/table.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tim Gromeyer 2 | // Licensed under the MIT License - https://opensource.org/licenses/MIT 3 | 4 | #ifndef TABLE_H 5 | #define TABLE_H 6 | 7 | #include 8 | 9 | [[nodiscard]] std::string formatMarkdownTable(const std::string &inputTable); 10 | 11 | #endif // TABLE_H 12 | -------------------------------------------------------------------------------- /js/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "html2md.h" 3 | 4 | using namespace emscripten; 5 | 6 | EMSCRIPTEN_BINDINGS(html2md) { 7 | class_("Options") 8 | .constructor<>() 9 | .property("splitLines", &html2md::options::splitLines, &html2md::options::splitLines) 10 | .property("unorderedList", &html2md::options::unorderedList, &html2md::options::unorderedList) 11 | .property("orderedList", &html2md::options::orderedList, &html2md::options::orderedList) 12 | .property("includeTitle", &html2md::options::includeTitle, &html2md::options::includeTitle); 13 | 14 | class_("Converter") 15 | .constructor() 16 | .function("convert2Md", 
@implementation HTML2MD

/// Converts an HTML string to Markdown using html2md::Convert.
///
/// @param html The HTML to convert. A nil value yields an empty string.
/// @return The generated Markdown.
+ (NSString *)convertHTMLToMarkdown:(NSString *)html {
  // Messaging nil returns NULL from -UTF8String, and constructing a
  // std::string from a null pointer is undefined behaviour, so bail out early.
  const char *htmlStr = [html UTF8String];
  if (htmlStr == NULL) {
    return @"";
  }

  std::string outputMarkdown = html2md::Convert(htmlStr);
  NSString *markdownStr = [NSString stringWithUTF8String:outputMarkdown.c_str()];
  return markdownStr;
}

@end
11 | readme = "python/README.md" 12 | requires-python = ">=3.7" 13 | license = { text = "MIT" } 14 | version = "1.7.0" 15 | classifiers = [ 16 | "Intended Audience :: Developers", 17 | "License :: OSI Approved :: MIT License", 18 | "Programming Language :: C++", 19 | "Programming Language :: Python", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3 :: Only", 22 | "Programming Language :: Python :: 3.7", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Topic :: File Formats", 26 | "Topic :: Text Processing :: Markup :: Markdown", 27 | "Topic :: Text Processing :: Markup :: HTML", 28 | ] 29 | keywords = [ 30 | "html", "markdown", "html-to-markdown", 31 | "python3", "cpp17", "cpp-library", 32 | "html2markdown", "html2md" 33 | ] 34 | 35 | [project.urls] 36 | Repository = "https://github.com/tim-gromeyer/html2md" 37 | 38 | [project.optional-dependencies] 39 | test = ["pytest>=6.0"] 40 | 41 | [tool.scikit-build] 42 | cmake.verbose = true 43 | logging.level = "INFO" 44 | minimum-version = "0.8" 45 | # TODO: Figure out when CMake added FindPython 46 | cmake.version = ">=3.12" 47 | 48 | [tool.scikit-build.cmake.define] 49 | PYTHON_BINDINGS = "ON" 50 | PYBIND11_FINDPYTHON = "ON" 51 | 52 | [tool.isort] 53 | profile = "black" 54 | 55 | [tool.mypy] 56 | files = "setup.py" 57 | python_version = "3.7" 58 | strict = true 59 | show_error_codes = true 60 | enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] 61 | warn_unreachable = true 62 | 63 | [[tool.mypy.overrides]] 64 | module = ["ninja"] 65 | ignore_missing_imports = true 66 | 67 | 68 | [tool.pytest.ini_options] 69 | minversion = "6.0" 70 | addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] 71 | xfail_strict = true 72 | filterwarnings = ["error"] 73 | testpaths = ["tests"] 74 | 75 | [tool.cibuildwheel] 76 | test-command = "pytest {project}/tests/python" 77 | test-extras = ["test"] 78 | test-skip = 
["*universal2:arm64"] 79 | # Setuptools bug causes collision between pypy and cpython artifacts 80 | before-build = "rm -rf {project}/build" 81 | 82 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # pyhtml2md 2 | 3 | pyhtml2md provides a way to use the html2md C++ library in Python. html2md is a fast and reliable library for converting HTML content into markdown. 4 | 5 |
    6 | 7 | - [Installation](#installation) 8 | - [Basic usage](#basic-usage) 9 | - [Advanced usage](#advanced-usage) 10 | - [Supported Tags](#supported-tags) 11 | - [License](#license) 12 | 13 |
    14 | 15 | 20 | 21 | 22 | ## Installation 23 | 24 | You can install it using pip: 25 | 26 | ```bash 27 | pip3 install pyhtml2md 28 | ``` 29 | 30 | ## Basic usage 31 | 32 | Here is an example of how to use pyhtml2md to convert HTML to markdown: 33 | 34 | ```python 35 | import pyhtml2md 36 | 37 | markdown = pyhtml2md.convert("

    Hello, world!

    ") 38 | print(markdown) 39 | ``` 40 | 41 | The `convert` function takes an HTML string as input and returns a markdown string. 42 | 43 | ## Advanced usage 44 | 45 | pyhtml2md provides an `Options` class to customize the generation process. 46 | You can find all the details in the C++ [documentation](https://tim-gromeyer.github.io/html2md/index.html). 47 | 48 | Here is an example: 49 | 50 | ```python 51 | import pyhtml2md 52 | 53 | options = pyhtml2md.Options() 54 | options.splitLines = False 55 | 56 | converter = pyhtml2md.Converter("

    Hello Python!

    ", options) 57 | markdown = converter.convert() 58 | print(markdown) 59 | print(converter.ok()) 60 | ``` 61 | 62 | ## Supported Tags 63 | 64 | pyhtml2md supports the following HTML tags: 65 | 66 | | Tag | Description | Comment | 67 | |--------------|--------------------|-----------------------------------------------------| 68 | | `a` | Anchor or link | Supports the `href`, `name` and `title` attributes. | 69 | | `b` | Bold | | 70 | | `blockquote` | Indented paragraph | | 71 | | `br` | Line break | | 72 | | `cite` | Inline citation | Same as `i`. | 73 | | `code` | Code | | 74 | | `dd` | Definition data | | 75 | | `del` | Strikethrough | | 76 | | `dfn` | Definition | Same as `i`. | 77 | | `div` | Document division | | 78 | | `em` | Emphasized | Same as `i`. | 79 | | `h1` | Level 1 heading | | 80 | | `h2` | Level 2 heading | | 81 | | `h3` | Level 3 heading | | 82 | | `h4` | Level 4 heading | | 83 | | `h5` | Level 5 heading | | 84 | | `h6` | Level 6 heading | | 85 | | `head` | Document header | Ignored. | 86 | | `hr` | Horizontal line | | 87 | | `i` | Italic | | 88 | | `img` | Image | Supports `src`, `alt`, `title` attributes. | 89 | | `li` | List item | | 90 | | `meta` | Meta-information | Ignored. | 91 | | `ol` | Ordered list | | 92 | | `p` | Paragraph | | 93 | | `pre` | Preformatted text | Works only with `code`. | 94 | | `s` | Strikethrough | Same as `del`. | 95 | | `span` | Grouped elements | Does nothing. | 96 | | `strong` | Strong | Same as `b`. | 97 | | `table` | Table | Tables are formatted! | 98 | | `tbody` | Table body | Does nothing. | 99 | | `td` | Table data cell | Uses `align` from `th`. | 100 | | `tfoot` | Table footer | Does nothing. | 101 | | `th` | Table header cell | Supports the `align` attribute. | 102 | | `thead` | Table header | Does nothing. | 103 | | `title` | Document title | Same as `h1`. | 104 | | `tr` | Table row | | 105 | | `u` | Underlined | Uses HTML. 
| 106 | | `ul` | Unordered list | | 107 | 108 | ## License 109 | 110 | pyhtml2md is licensed under [The MIT License (MIT)](https://opensource.org/licenses/MIT) 111 | -------------------------------------------------------------------------------- /python/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | namespace py = pybind11; 4 | 5 | PYBIND11_MODULE(pyhtml2md, m) { 6 | m.doc() = "Python bindings for html2md"; // optional module docstring 7 | 8 | // Options class bindings 9 | py::class_(m, "Options") 10 | .def(py::init<>()) 11 | .def_readwrite( 12 | "splitLines", &html2md::Options::splitLines, 13 | "Add new line when a certain number of characters is reached") 14 | .def_readwrite("softBreak", &html2md::Options::softBreak, 15 | "Wrap after ... characters when the next space is reached") 16 | .def_readwrite("hardBreak", &html2md::Options::hardBreak, 17 | "Force a break after ... characters in a line") 18 | .def_readwrite("unorderedList", &html2md::Options::unorderedList, 19 | "The char used for unordered lists") 20 | .def_readwrite("orderedList", &html2md::Options::orderedList, 21 | "The char used after the number of the item") 22 | .def_readwrite("includeTitle", &html2md::Options::includeTitle, 23 | "Whether title is added as h1 heading at the very " 24 | "beginning of the markdown") 25 | .def_readwrite("formatTable", &html2md::Options::formatTable, 26 | "Whether to format Markdown Tables") 27 | .def("__eq__", &html2md::Options::operator==); 28 | 29 | py::class_(m, "Converter") 30 | .def(py::init(), 31 | "Class for converting HTML to Markdown", py::arg("html"), 32 | py::arg("options") = py::none()) 33 | .def("convert", &html2md::Converter::convert, 34 | "This function actually converts the HTML into Markdown.") 35 | .def("ok", &html2md::Converter::ok, 36 | "Checks if everything was closed properly(in the HTML).") 37 | .def("add_html_symbol_conversion", 38 | 
namespace {
// Returns true if `str` begins with `prefix`.
bool startsWith(const std::string &str, const std::string &prefix) {
  return str.size() >= prefix.size() &&
         0 == str.compare(0, prefix.size(), prefix);
}

// Returns true if `str` ends with `suffix`.
bool endsWith(const std::string &str, const std::string &suffix) {
  return str.size() >= suffix.size() &&
         0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}

// Replaces every occurrence of `needle` in `*haystack` with `replacement`,
// scanning left to right and never re-scanning inserted text.
// Returns the number of replacements performed.
size_t ReplaceAll(std::string *haystack, const std::string &needle,
                  const std::string &replacement) {
  // An empty needle matches at every position, so the loop below would never
  // terminate. There is nothing meaningful to replace in that case.
  if (needle.empty())
    return 0;

  size_t amount_replaced = 0;

  // Get first occurrence
  size_t pos = haystack->find(needle);

  // Repeat until end is reached
  while (pos != std::string::npos) {
    // Replace this occurrence of sub string
    haystack->replace(pos, needle.size(), replacement);

    // Continue behind the inserted replacement
    pos = haystack->find(needle, pos + replacement.size());

    ++amount_replaced;
  }

  return amount_replaced;
}

// Overload replacing every occurrence of `needle` with the single char `c`.
size_t ReplaceAll(std::string *haystack, const std::string &needle,
                  const char c) {
  return ReplaceAll(haystack, needle, std::string(1, c));
}

// Split given string by given character delimiter into vector of strings
std::vector<std::string> Split(std::string const &str, char delimiter) {
  std::vector<std::string> result;
  std::stringstream iss(str);

  for (std::string token; std::getline(iss, token, delimiter);)
    result.push_back(token);

  return result;
}

// Returns `str` concatenated `amount` times ("" for amount == 0).
std::string Repeat(const std::string &str, size_t amount) {
  if (amount == 0)
    return "";
  else if (amount == 1)
    return str;

  std::string out;
  out.reserve(str.size() * amount); // single allocation instead of regrowth

  for (size_t i = 0; i < amount; ++i)
    out.append(str);

  return out;
}

// Returns a copy of `str` with every character passed through tolower().
std::string toLower(const std::string &str) {
  std::string lower;
  lower.reserve(str.size());
  for (char ch : str) {
    // tolower() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char (e.g. a UTF-8 byte) is undefined.
    lower += static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return lower;
}

} // namespace
make_shared(); 110 | tags_[kTagBreak] = make_shared(); 111 | tags_[kTagDiv] = make_shared(); 112 | tags_[kTagHeader1] = make_shared(); 113 | tags_[kTagHeader2] = make_shared(); 114 | tags_[kTagHeader3] = make_shared(); 115 | tags_[kTagHeader4] = make_shared(); 116 | tags_[kTagHeader5] = make_shared(); 117 | tags_[kTagHeader6] = make_shared(); 118 | tags_[kTagListItem] = make_shared(); 119 | tags_[kTagOption] = make_shared(); 120 | tags_[kTagOrderedList] = make_shared(); 121 | tags_[kTagPre] = make_shared(); 122 | tags_[kTagCode] = make_shared(); 123 | tags_[kTagParagraph] = make_shared(); 124 | tags_[kTagSpan] = make_shared(); 125 | tags_[kTagUnorderedList] = make_shared(); 126 | tags_[kTagTitle] = make_shared(); 127 | tags_[kTagImg] = make_shared(); 128 | tags_[kTagSeperator] = make_shared(); 129 | 130 | // Text formatting 131 | auto tagBold = make_shared(); 132 | tags_[kTagBold] = tagBold; 133 | tags_[kTagStrong] = tagBold; 134 | 135 | auto tagItalic = make_shared(); 136 | tags_[kTagItalic] = tagItalic; 137 | tags_[kTagItalic2] = tagItalic; 138 | tags_[kTagDefinition] = tagItalic; 139 | tags_[kTagCitation] = tagItalic; 140 | 141 | tags_[kTagUnderline] = make_shared(); 142 | 143 | auto tagStrighthrought = make_shared(); 144 | tags_[kTagStrighthrought] = tagStrighthrought; 145 | tags_[kTagStrighthrought2] = tagStrighthrought; 146 | 147 | tags_[kTagBlockquote] = make_shared(); 148 | 149 | // Tables 150 | tags_[kTagTable] = make_shared(); 151 | tags_[kTagTableRow] = make_shared(); 152 | tags_[kTagTableHeader] = make_shared(); 153 | tags_[kTagTableData] = make_shared(); 154 | } 155 | 156 | void Converter::CleanUpMarkdown() { 157 | TidyAllLines(&md_); 158 | std::string buffer; 159 | buffer.reserve(md_.size()); 160 | 161 | // Replace HTML symbols during the initial pass 162 | for (size_t i = 0; i < md_.size();) { 163 | bool replaced = false; 164 | 165 | // C++11 compatible iteration over htmlSymbolConversions_ 166 | for (const auto &symbol_replacement : 
htmlSymbolConversions_) { 167 | const std::string &symbol = symbol_replacement.first; 168 | const std::string &replacement = symbol_replacement.second; 169 | 170 | if (md_.compare(i, symbol.size(), symbol) == 0) { 171 | buffer.append(replacement); 172 | i += symbol.size(); 173 | replaced = true; 174 | break; 175 | } 176 | } 177 | 178 | if (!replaced) { 179 | buffer.push_back(md_[i++]); 180 | } 181 | } 182 | 183 | // Use swap instead of move assignment for better pre-C++11 compatibility 184 | md_.swap(buffer); 185 | 186 | // Optimized replacement sequence 187 | const char *replacements[][2] = { 188 | {" , ", ", "}, {"\n.\n", ".\n"}, {"\n↵\n", " ↵\n"}, {"\n*\n", "\n"}, 189 | {"\n. ", ".\n"}, {"\t\t ", "\t\t"}, 190 | }; 191 | 192 | for (const auto &replacement : replacements) { 193 | ReplaceAll(&md_, replacement[0], replacement[1]); 194 | } 195 | } 196 | 197 | Converter *Converter::appendToMd(char ch) { 198 | if (IsInIgnoredTag()) 199 | return this; 200 | 201 | if (index_blockquote != 0 && ch == '\n') { 202 | if (is_in_pre_) { 203 | md_ += ch; 204 | chars_in_curr_line_ = 0; 205 | appendToMd(Repeat("> ", index_blockquote)); 206 | } 207 | 208 | return this; 209 | } 210 | 211 | md_ += ch; 212 | 213 | if (ch == '\n') 214 | chars_in_curr_line_ = 0; 215 | else 216 | ++chars_in_curr_line_; 217 | 218 | return this; 219 | } 220 | 221 | Converter *Converter::appendToMd(const char *str) { 222 | if (IsInIgnoredTag()) 223 | return this; 224 | 225 | md_ += str; 226 | 227 | auto str_len = strlen(str); 228 | 229 | for (auto i = 0; i < str_len; ++i) { 230 | if (str[i] == '\n') 231 | chars_in_curr_line_ = 0; 232 | else 233 | ++chars_in_curr_line_; 234 | } 235 | 236 | return this; 237 | } 238 | 239 | Converter *Converter::appendBlank() { 240 | UpdatePrevChFromMd(); 241 | 242 | if (prev_ch_in_md_ == '\n' || 243 | (prev_ch_in_md_ == '*' && prev_prev_ch_in_md_ == '*')) 244 | return this; 245 | 246 | return appendToMd(' '); 247 | } 248 | 249 | bool Converter::ok() const { 250 | return 
!is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ && 251 | !is_in_tag_ && index_blockquote == 0 && index_li == 0; 252 | } 253 | 254 | void Converter::LTrim(string *s) { 255 | (*s).erase((*s).begin(), 256 | find_if((*s).begin(), (*s).end(), 257 | [](unsigned char ch) { return !std::isspace(ch); })); 258 | } 259 | 260 | Converter *Converter::RTrim(string *s, bool trim_only_blank) { 261 | (*s).erase(find_if((*s).rbegin(), (*s).rend(), 262 | [trim_only_blank](unsigned char ch) { 263 | if (trim_only_blank) 264 | return !isblank(ch); 265 | 266 | return !isspace(ch); 267 | }) 268 | .base(), 269 | (*s).end()); 270 | 271 | return this; 272 | } 273 | 274 | // NOTE: Pay attention when changing one of the trim functions. It can break the 275 | // output! 276 | Converter *Converter::Trim(string *s) { 277 | if (!startsWith(*s, "\t")) 278 | LTrim(s); 279 | 280 | if (!(startsWith(*s, " "), endsWith(*s, " "))) 281 | RTrim(s); 282 | 283 | return this; 284 | } 285 | 286 | void Converter::TidyAllLines(string *str) { 287 | auto lines = Split(*str, '\n'); 288 | string res; 289 | 290 | uint8_t amount_newlines = 0; 291 | bool in_code_block = false; 292 | 293 | for (auto line : lines) { 294 | if (startsWith(line, "```") || startsWith(line, "~~~")) 295 | in_code_block = !in_code_block; 296 | if (in_code_block) { 297 | res += line + '\n'; 298 | continue; 299 | } 300 | 301 | Trim(&line); 302 | 303 | if (line.empty()) { 304 | if (amount_newlines < 2 && !res.empty()) { 305 | res += '\n'; 306 | amount_newlines++; 307 | } 308 | } else { 309 | amount_newlines = 0; 310 | 311 | res += line + '\n'; 312 | } 313 | } 314 | 315 | *str = res; 316 | } 317 | 318 | string Converter::ExtractAttributeFromTagLeftOf(const string &attr) { 319 | // Extract the whole tag from current offset, e.g. 
from '>', backwards 320 | auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_); 321 | string lowerTag = toLower(tag); // Convert tag to lowercase for comparison 322 | 323 | // locate given attribute (case-insensitive) 324 | auto offset_attr = lowerTag.find(attr); 325 | 326 | if (offset_attr == string::npos) 327 | return ""; 328 | 329 | // locate attribute-value pair's '=' 330 | auto offset_equals = tag.find('=', offset_attr); 331 | 332 | if (offset_equals == string::npos) 333 | return ""; 334 | 335 | // locate value's surrounding quotes 336 | auto offset_double_quote = tag.find('"', offset_equals); 337 | auto offset_single_quote = tag.find('\'', offset_equals); 338 | 339 | bool has_double_quote = offset_double_quote != string::npos; 340 | bool has_single_quote = offset_single_quote != string::npos; 341 | 342 | if (!has_double_quote && !has_single_quote) 343 | return ""; 344 | 345 | char wrapping_quote = 0; 346 | 347 | size_t offset_opening_quote = 0; 348 | size_t offset_closing_quote = 0; 349 | 350 | if (has_double_quote) { 351 | if (!has_single_quote) { 352 | wrapping_quote = '"'; 353 | offset_opening_quote = offset_double_quote; 354 | } else { 355 | if (offset_double_quote < offset_single_quote) { 356 | wrapping_quote = '"'; 357 | offset_opening_quote = offset_double_quote; 358 | } else { 359 | wrapping_quote = '\''; 360 | offset_opening_quote = offset_single_quote; 361 | } 362 | } 363 | } else { 364 | // has only single quote 365 | wrapping_quote = '\''; 366 | offset_opening_quote = offset_single_quote; 367 | } 368 | 369 | if (offset_opening_quote == string::npos) 370 | return ""; 371 | 372 | offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1); 373 | 374 | if (offset_closing_quote == string::npos) 375 | return ""; 376 | 377 | return tag.substr(offset_opening_quote + 1, 378 | offset_closing_quote - 1 - offset_opening_quote); 379 | } 380 | 381 | void Converter::TurnLineIntoHeader1() { 382 | appendToMd('\n' + Repeat("=", 
chars_in_curr_line_) + "\n\n"); 383 | 384 | chars_in_curr_line_ = 0; 385 | } 386 | 387 | void Converter::TurnLineIntoHeader2() { 388 | appendToMd('\n' + Repeat("-", chars_in_curr_line_) + "\n\n"); 389 | 390 | chars_in_curr_line_ = 0; 391 | } 392 | 393 | string Converter::convert() { 394 | // We already converted 395 | if (index_ch_in_html_ == html_.size()) 396 | return md_; 397 | 398 | reset(); 399 | 400 | for (char ch : html_) { 401 | ++index_ch_in_html_; 402 | 403 | if (!is_in_tag_ && ch == '<') { 404 | OnHasEnteredTag(); 405 | 406 | continue; 407 | } 408 | 409 | if (is_in_tag_) 410 | ParseCharInTag(ch); 411 | else 412 | ParseCharInTagContent(ch); 413 | } 414 | 415 | CleanUpMarkdown(); 416 | 417 | return md_; 418 | } 419 | 420 | void Converter::OnHasEnteredTag() { 421 | offset_lt_ = index_ch_in_html_; 422 | is_in_tag_ = true; 423 | is_closing_tag_ = false; 424 | prev_tag_ = current_tag_; 425 | current_tag_ = ""; 426 | 427 | if (!md_.empty()) { 428 | UpdatePrevChFromMd(); 429 | } 430 | } 431 | 432 | Converter *Converter::UpdatePrevChFromMd() { 433 | if (!md_.empty()) { 434 | prev_ch_in_md_ = md_[md_.length() - 1]; 435 | 436 | if (md_.length() > 1) 437 | prev_prev_ch_in_md_ = md_[md_.length() - 2]; 438 | } 439 | 440 | return this; 441 | } 442 | 443 | bool Converter::ParseCharInTag(char ch) { 444 | static bool skipping_leading_whitespace = true; 445 | 446 | if (ch == '/' && !is_in_attribute_value_) { 447 | is_closing_tag_ = current_tag_.empty(); 448 | is_self_closing_tag_ = !is_closing_tag_; 449 | skipping_leading_whitespace = true; // Reset for next tag 450 | return true; 451 | } 452 | 453 | if (ch == '>') { 454 | // Trim trailing whitespace by removing characters from current_tag_ 455 | while (!current_tag_.empty() && std::isspace(current_tag_.back())) { 456 | current_tag_.pop_back(); 457 | } 458 | skipping_leading_whitespace = true; // Reset for next tag 459 | if (!is_self_closing_tag_) 460 | return OnHasLeftTag(); 461 | else { 462 | OnHasLeftTag(); 463 | 
is_self_closing_tag_ = false; 464 | is_closing_tag_ = true; 465 | return OnHasLeftTag(); 466 | } 467 | } 468 | 469 | if (ch == '"') { 470 | if (is_in_attribute_value_) { 471 | is_in_attribute_value_ = false; 472 | } else { 473 | size_t pos = current_tag_.length(); 474 | while (pos > 0 && isspace(current_tag_[pos - 1])) { 475 | pos--; 476 | } 477 | if (pos > 0 && current_tag_[pos - 1] == '=') { 478 | is_in_attribute_value_ = true; 479 | } 480 | } 481 | skipping_leading_whitespace = false; // Stop skipping after attribute 482 | return true; 483 | } 484 | 485 | // Handle whitespace: skip leading whitespace, keep others 486 | if (isspace(ch) && skipping_leading_whitespace) { 487 | return true; // Ignore leading whitespace 488 | } 489 | 490 | // Once we encounter a non-whitespace character, stop skipping 491 | skipping_leading_whitespace = false; 492 | current_tag_ += tolower(ch); 493 | return false; 494 | } 495 | 496 | bool Converter::OnHasLeftTag() { 497 | is_in_tag_ = false; 498 | 499 | UpdatePrevChFromMd(); 500 | 501 | if (!is_closing_tag_) 502 | if (TagContainsAttributesToHide(¤t_tag_)) 503 | return true; 504 | 505 | auto cut_tags = Split(current_tag_, ' '); 506 | if (cut_tags.empty()) 507 | return true; 508 | 509 | current_tag_ = cut_tags[0]; 510 | 511 | auto tag = tags_[current_tag_]; 512 | 513 | if (!tag) 514 | return true; 515 | 516 | if (!is_closing_tag_) { 517 | tag->OnHasLeftOpeningTag(this); 518 | } 519 | else { 520 | is_closing_tag_ = false; 521 | 522 | tag->OnHasLeftClosingTag(this); 523 | } 524 | 525 | return true; 526 | } 527 | 528 | Converter *Converter::ShortenMarkdown(size_t chars) { 529 | md_ = md_.substr(0, md_.length() - chars); 530 | 531 | if (chars > chars_in_curr_line_) 532 | chars_in_curr_line_ = 0; 533 | else 534 | chars_in_curr_line_ = chars_in_curr_line_ - chars; 535 | 536 | return this->UpdatePrevChFromMd(); 537 | } 538 | 539 | bool Converter::ParseCharInTagContent(char ch) { 540 | if (is_in_code_) { 541 | md_ += ch; 542 | 543 | if 
(index_blockquote != 0 && ch == '\n') 544 | appendToMd(Repeat("> ", index_blockquote)); 545 | 546 | return true; 547 | } 548 | 549 | if (IsInIgnoredTag() || current_tag_ == kTagLink) { 550 | prev_ch_in_html_ = ch; 551 | 552 | return true; 553 | } 554 | 555 | if (ch == '\n') { 556 | if (index_blockquote != 0) { 557 | md_ += '\n'; 558 | chars_in_curr_line_ = 0; 559 | appendToMd(Repeat("> ", index_blockquote)); 560 | } 561 | 562 | return true; 563 | } 564 | 565 | switch (ch) { 566 | case '*': 567 | appendToMd("\\*"); 568 | break; 569 | case '`': 570 | appendToMd("\\`"); 571 | break; 572 | case '\\': 573 | appendToMd("\\\\"); 574 | break; 575 | default: 576 | md_ += ch; 577 | ++chars_in_curr_line_; 578 | break; 579 | } 580 | 581 | if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ && 582 | current_tag_ != kTagImg && current_tag_ != kTagAnchor && 583 | option.splitLines) { 584 | if (ch == ' ') { // If the next char is - it will become a list 585 | md_ += '\n'; 586 | chars_in_curr_line_ = 0; 587 | } else if (chars_in_curr_line_ > option.hardBreak) { 588 | ReplacePreviousSpaceInLineByNewline(); 589 | } 590 | } 591 | 592 | return false; 593 | } 594 | 595 | bool Converter::ReplacePreviousSpaceInLineByNewline() { 596 | if (current_tag_ == kTagParagraph || 597 | is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre)) 598 | return false; 599 | 600 | auto offset = md_.length() - 1; 601 | 602 | if (md_.length() == 0) 603 | return true; 604 | 605 | do { 606 | if (md_[offset] == '\n') 607 | return false; 608 | 609 | if (md_[offset] == ' ') { 610 | md_[offset] = '\n'; 611 | chars_in_curr_line_ = md_.length() - offset; 612 | 613 | return true; 614 | } 615 | 616 | --offset; 617 | } while (offset > 0); 618 | 619 | return false; 620 | } 621 | 622 | void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) { 623 | if (c->prev_tag_ == kTagImg) 624 | c->appendToMd('\n'); 625 | 626 | current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle); 
627 | 628 | c->appendToMd('['); 629 | current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref); 630 | } 631 | 632 | void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) { 633 | if (!c->shortIfPrevCh('[')) { 634 | c->appendToMd("](")->appendToMd(current_href_); 635 | 636 | // If title is set append it 637 | if (!current_title_.empty()) { 638 | c->appendToMd(" \"")->appendToMd(current_title_)->appendToMd('"'); 639 | current_title_.clear(); 640 | } 641 | 642 | c->appendToMd(')'); 643 | 644 | if (c->prev_tag_ == kTagImg) 645 | c->appendToMd('\n'); 646 | } 647 | } 648 | 649 | void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) { 650 | c->appendToMd("**"); 651 | } 652 | 653 | void Converter::TagBold::OnHasLeftClosingTag(Converter *c) { 654 | c->appendToMd("**"); 655 | } 656 | 657 | void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) { 658 | c->appendToMd('*'); 659 | } 660 | 661 | void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) { 662 | c->appendToMd('*'); 663 | } 664 | 665 | void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) { 666 | c->appendToMd(""); 667 | } 668 | 669 | void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) { 670 | c->appendToMd(""); 671 | } 672 | 673 | void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) { 674 | c->appendToMd('~'); 675 | } 676 | 677 | void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) { 678 | c->appendToMd('~'); 679 | } 680 | 681 | void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) { 682 | if (c->is_in_list_) { // When it's in a list, it's not in a paragraph 683 | c->appendToMd(" \n"); 684 | c->appendToMd(Repeat(" ", c->index_li)); 685 | } else if (c->is_in_table_) { 686 | c->appendToMd("
    "); 687 | } else if (!c->md_.empty()) 688 | c->appendToMd(" \n"); 689 | } 690 | 691 | void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {} 692 | 693 | void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) { 694 | if (c->prev_ch_in_md_ != '\n') 695 | c->appendToMd('\n'); 696 | 697 | if (c->prev_prev_ch_in_md_ != '\n') 698 | c->appendToMd('\n'); 699 | } 700 | 701 | void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {} 702 | 703 | void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) { 704 | c->appendToMd("\n# "); 705 | } 706 | 707 | void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) { 708 | if (c->prev_prev_ch_in_md_ != ' ') 709 | c->appendToMd('\n'); 710 | } 711 | 712 | void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) { 713 | c->appendToMd("\n## "); 714 | } 715 | 716 | void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) { 717 | if (c->prev_prev_ch_in_md_ != ' ') 718 | c->appendToMd('\n'); 719 | } 720 | 721 | void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) { 722 | c->appendToMd("\n### "); 723 | } 724 | 725 | void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) { 726 | if (c->prev_prev_ch_in_md_ != ' ') 727 | c->appendToMd('\n'); 728 | } 729 | 730 | void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) { 731 | c->appendToMd("\n#### "); 732 | } 733 | 734 | void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) { 735 | if (c->prev_prev_ch_in_md_ != ' ') 736 | c->appendToMd('\n'); 737 | } 738 | 739 | void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) { 740 | c->appendToMd("\n##### "); 741 | } 742 | 743 | void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) { 744 | if (c->prev_prev_ch_in_md_ != ' ') 745 | c->appendToMd('\n'); 746 | } 747 | 748 | void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) { 749 | c->appendToMd("\n###### "); 750 | } 751 | 752 | void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) { 753 | if 
(c->prev_prev_ch_in_md_ != ' ') 754 | c->appendToMd('\n'); 755 | } 756 | 757 | void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) { 758 | if (c->is_in_table_) 759 | return; 760 | 761 | if (!c->is_in_ordered_list_) { 762 | c->appendToMd(string({c->option.unorderedList, ' '})); 763 | return; 764 | } 765 | 766 | ++c->index_ol; 767 | 768 | string num = std::to_string(c->index_ol); 769 | num.append({c->option.orderedList, ' '}); 770 | c->appendToMd(num); 771 | } 772 | 773 | void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) { 774 | if (c->is_in_table_) 775 | return; 776 | 777 | if (c->prev_ch_in_md_ != '\n') 778 | c->appendToMd('\n'); 779 | } 780 | 781 | void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {} 782 | 783 | void Converter::TagOption::OnHasLeftClosingTag(Converter *c) { 784 | if (c->md_.length() > 0) 785 | c->appendToMd(" \n"); 786 | } 787 | 788 | void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) { 789 | if (c->is_in_table_) 790 | return; 791 | 792 | c->is_in_list_ = true; 793 | c->is_in_ordered_list_ = true; 794 | c->index_ol = 0; 795 | 796 | ++c->index_li; 797 | 798 | c->ReplacePreviousSpaceInLineByNewline(); 799 | 800 | c->appendToMd('\n'); 801 | } 802 | 803 | void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) { 804 | if (c->is_in_table_) 805 | return; 806 | 807 | c->is_in_ordered_list_ = false; 808 | 809 | if (c->index_li != 0) 810 | --c->index_li; 811 | 812 | c->is_in_list_ = c->index_li != 0; 813 | 814 | c->appendToMd('\n'); 815 | } 816 | 817 | void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) { 818 | c->is_in_p_ = true; 819 | 820 | if (c->is_in_list_ && c->prev_tag_ == kTagParagraph) 821 | c->appendToMd("\n\t"); 822 | else if (!c->is_in_list_) 823 | c->appendToMd('\n'); 824 | } 825 | 826 | void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) { 827 | c->is_in_p_ = false; 828 | 829 | if (!c->md_.empty()) 830 | c->appendToMd("\n"); // Workaround \n restriction for 
blockquotes 831 | 832 | if (c->index_blockquote != 0) 833 | c->appendToMd(Repeat("> ", c->index_blockquote)); 834 | } 835 | 836 | void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) { 837 | c->is_in_pre_ = true; 838 | 839 | if (c->prev_ch_in_md_ != '\n') 840 | c->appendToMd('\n'); 841 | 842 | if (c->prev_prev_ch_in_md_ != '\n') 843 | c->appendToMd('\n'); 844 | 845 | if (c->is_in_list_ && c->prev_tag_ != kTagParagraph) 846 | c->ShortenMarkdown(2); 847 | 848 | if (c->is_in_list_) 849 | c->appendToMd("\t\t"); 850 | else 851 | c->appendToMd("```"); 852 | } 853 | 854 | void Converter::TagPre::OnHasLeftClosingTag(Converter *c) { 855 | c->is_in_pre_ = false; 856 | 857 | if (c->is_in_list_) 858 | return; 859 | 860 | c->appendToMd("```"); 861 | c->appendToMd('\n'); // Don't combine because of blockquote 862 | } 863 | 864 | void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) { 865 | c->is_in_code_ = true; 866 | 867 | if (c->is_in_pre_) { 868 | if (c->is_in_list_) 869 | return; 870 | 871 | auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass); 872 | if (!code.empty()) { 873 | if (startsWith(code, "language-")) 874 | code.erase(0, 9); // remove language- 875 | c->appendToMd(code); 876 | } 877 | c->appendToMd('\n'); 878 | } else 879 | c->appendToMd('`'); 880 | } 881 | 882 | void Converter::TagCode::OnHasLeftClosingTag(Converter *c) { 883 | c->is_in_code_ = false; 884 | 885 | if (c->is_in_pre_) 886 | return; 887 | 888 | c->appendToMd('`'); 889 | } 890 | 891 | void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {} 892 | 893 | void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {} 894 | 895 | void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {} 896 | 897 | void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) { 898 | c->TurnLineIntoHeader1(); 899 | } 900 | 901 | void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) { 902 | if (c->is_in_list_ || c->is_in_table_) 903 | return; 904 | 905 | c->is_in_list_ = true; 906 | 
907 | ++c->index_li; 908 | 909 | c->appendToMd('\n'); 910 | } 911 | 912 | void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) { 913 | if (c->is_in_table_) 914 | return; 915 | 916 | if (c->index_li != 0) 917 | --c->index_li; 918 | 919 | c->is_in_list_ = c->index_li != 0; 920 | 921 | if (c->prev_prev_ch_in_md_ == '\n' && c->prev_ch_in_md_ == '\n') 922 | c->ShortenMarkdown(); 923 | else if (c->prev_ch_in_md_ != '\n') 924 | c->appendToMd('\n'); 925 | } 926 | 927 | void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) { 928 | if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ != '\n') 929 | c->appendToMd('\n'); 930 | 931 | c->appendToMd("![") 932 | ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt)) 933 | ->appendToMd("](") 934 | ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc)); 935 | 936 | auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle); 937 | if (!title.empty()) { 938 | c->appendToMd(" \"")->appendToMd(title)->appendToMd('"'); 939 | } 940 | 941 | c->appendToMd(")"); 942 | } 943 | 944 | void Converter::TagImage::OnHasLeftClosingTag(Converter *c) { 945 | if (c->prev_tag_ == kTagAnchor) 946 | c->appendToMd('\n'); 947 | } 948 | 949 | void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) { 950 | c->appendToMd("\n---\n"); // NOTE: We can make this an option 951 | } 952 | 953 | void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {} 954 | 955 | void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) { 956 | c->is_in_table_ = true; 957 | c->appendToMd('\n'); 958 | c->table_start = c->md_.length(); 959 | } 960 | 961 | void Converter::TagTable::OnHasLeftClosingTag(Converter *c) { 962 | c->is_in_table_ = false; 963 | c->appendToMd('\n'); 964 | 965 | if (!c->option.formatTable) 966 | return; 967 | 968 | string table = c->md_.substr(c->table_start); 969 | table = formatMarkdownTable(table); 970 | c->ShortenMarkdown(c->md_.size() - c->table_start); 971 | c->appendToMd(table); 972 | } 973 | 974 | 
void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) { 975 | c->appendToMd('\n'); 976 | } 977 | 978 | void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) { 979 | c->UpdatePrevChFromMd(); 980 | if (c->prev_ch_in_md_ == '|') 981 | c->appendToMd('\n'); // There's a bug 982 | else 983 | c->appendToMd('|'); 984 | 985 | if (!c->tableLine.empty()) { 986 | if (c->prev_ch_in_md_ != '\n') 987 | c->appendToMd('\n'); 988 | 989 | c->tableLine.append("|\n"); 990 | c->appendToMd(c->tableLine); 991 | c->tableLine.clear(); 992 | } 993 | } 994 | 995 | void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) { 996 | auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign); 997 | 998 | string line = "| "; 999 | 1000 | if (align == "left" || align == "center") 1001 | line += ':'; 1002 | 1003 | line += '-'; 1004 | 1005 | if (align == "right" || align == "center") 1006 | line += ": "; 1007 | else 1008 | line += ' '; 1009 | 1010 | c->tableLine.append(line); 1011 | 1012 | c->appendToMd("| "); 1013 | } 1014 | 1015 | void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {} 1016 | 1017 | void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) { 1018 | if (c->prev_prev_ch_in_md_ != '|') 1019 | c->appendToMd("| "); 1020 | } 1021 | 1022 | void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {} 1023 | 1024 | void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) { 1025 | ++c->index_blockquote; 1026 | c->appendToMd("\n"); 1027 | c->appendToMd(Repeat("> ", c->index_blockquote)); 1028 | } 1029 | 1030 | void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) { 1031 | --c->index_blockquote; 1032 | // Only shorten if a "> " was added (i.e., a newline was processed in the blockquote) 1033 | if (!c->md_.empty() && c->md_.length() >= 2 && 1034 | c->md_.substr(c->md_.length() - 2) == "> ") { 1035 | c->ShortenMarkdown(2); // Remove the '> ' only if it exists 1036 | } 1037 | } 1038 | 1039 | void Converter::reset() { 1040 | 
md_.clear(); 1041 | prev_ch_in_md_ = 0; 1042 | prev_prev_ch_in_md_ = 0; 1043 | index_ch_in_html_ = 0; 1044 | } 1045 | 1046 | bool Converter::IsInIgnoredTag() const { 1047 | if (current_tag_ == kTagTitle && !option.includeTitle) 1048 | return true; 1049 | 1050 | return IsIgnoredTag(current_tag_); 1051 | } 1052 | } // namespace html2md 1053 | -------------------------------------------------------------------------------- /src/table.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tim Gromeyer 2 | // Licensed under the MIT License - https://opensource.org/licenses/MIT 3 | 4 | #include "table.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using std::string; 12 | using std::vector; 13 | 14 | const size_t MIN_LINE_LENGTH = 3; // Minimum length of line 15 | 16 | void removeLeadingTrailingSpaces(string &str) { 17 | size_t firstNonSpace = str.find_first_not_of(' '); 18 | if (firstNonSpace == string::npos) { 19 | str.clear(); // Entire string is spaces 20 | return; 21 | } 22 | 23 | size_t lastNonSpace = str.find_last_not_of(' '); 24 | str = str.substr(firstNonSpace, lastNonSpace - firstNonSpace + 1); 25 | } 26 | 27 | string enlargeTableHeaderLine(const string &str, size_t length) { 28 | if (str.empty() || length < MIN_LINE_LENGTH) 29 | return ""; 30 | 31 | size_t first = str.find_first_of(':'); 32 | size_t last = str.find_last_of(':'); 33 | 34 | if (first == 0 && first == last) 35 | last = string::npos; 36 | 37 | string line = string(length, '-'); 38 | 39 | if (first == 0) 40 | line[0] = ':'; 41 | if (last == str.length() - 1) 42 | line[length - 1] = ':'; 43 | 44 | return line; 45 | } 46 | 47 | string formatMarkdownTable(const string &inputTable) { 48 | std::istringstream iss(inputTable); 49 | string line; 50 | vector> tableData; 51 | 52 | // Parse the input table into a 2D vector 53 | while (std::getline(iss, line)) { 54 | std::istringstream lineStream(line); 55 | string cell; 56 | vector 
rowData; 57 | 58 | while (std::getline(lineStream, cell, '|')) { 59 | if (!cell.empty()) { 60 | removeLeadingTrailingSpaces(cell); // Use the trim function 61 | rowData.push_back(cell); 62 | } 63 | } 64 | 65 | if (!rowData.empty()) { 66 | tableData.push_back(std::move(rowData)); // Move rowData to avoid copying 67 | } 68 | } 69 | 70 | if (tableData.empty()) { 71 | return ""; 72 | } 73 | 74 | // Determine maximum width of each column 75 | vector columnWidths(tableData[0].size(), 0); 76 | for (const auto &row : tableData) { 77 | if (columnWidths.size() < row.size()) { 78 | columnWidths.resize(row.size(), 0); 79 | } 80 | 81 | for (size_t i = 0; i < row.size(); ++i) { 82 | columnWidths[i] = std::max(columnWidths[i], row[i].size()); 83 | } 84 | } 85 | 86 | // Build the formatted table 87 | std::ostringstream formattedTable; 88 | for (size_t rowNumber = 0; rowNumber < tableData.size(); ++rowNumber) { 89 | const auto &row = tableData[rowNumber]; 90 | 91 | formattedTable << "|"; 92 | 93 | for (size_t i = 0; i < row.size(); ++i) { 94 | if (rowNumber == 1) { 95 | formattedTable << enlargeTableHeaderLine(row[i], columnWidths[i] + 2) 96 | << "|"; 97 | continue; 98 | } 99 | formattedTable << " " << std::setw(columnWidths[i]) << std::left << row[i] 100 | << " |"; 101 | } 102 | formattedTable << "\n"; 103 | } 104 | 105 | return formattedTable.str(); 106 | } 107 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(tests LANGUAGES C CXX) 2 | 3 | if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/md4c/src/) 4 | include(FindGit) 5 | if(NOT GIT_FOUND) 6 | message(WARNING "git not found. 
Please download md4c manually or disable tests.") 7 | return() 8 | endif() 9 | get_directory_property(dir PARENT_DIRECTORY) 10 | execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --depth=1 11 | WORKING_DIRECTORY ${dir}) 12 | endif() 13 | 14 | set(MD4C_FILES 15 | md4c/src/entity.c 16 | md4c/src/entity.h 17 | md4c/src/md4c-html.c 18 | md4c/src/md4c-html.h 19 | md4c/src/md4c.c 20 | md4c/src/md4c.h 21 | ) 22 | 23 | add_library(md4c-html STATIC ${MD4C_FILES}) 24 | target_include_directories(md4c-html PUBLIC md4c/src) 25 | 26 | # Existing test executable 27 | add_executable(test-exe main.cpp) 28 | target_link_libraries(test-exe md4c-html html2md-static) 29 | target_compile_definitions(test-exe PUBLIC DIR="${CMAKE_CURRENT_LIST_DIR}") 30 | set_target_properties(test-exe PROPERTIES OUTPUT_NAME "tests") 31 | target_compile_features(test-exe PUBLIC cxx_std_17) 32 | 33 | # New benchmark executable 34 | add_executable(benchmark-exe benchmark.cpp) 35 | target_link_libraries(benchmark-exe md4c-html html2md-static) 36 | target_compile_definitions(benchmark-exe PUBLIC DIR="${CMAKE_CURRENT_LIST_DIR}") 37 | set_target_properties(benchmark-exe PROPERTIES OUTPUT_NAME "benchmarks") 38 | target_compile_features(benchmark-exe PUBLIC cxx_std_17) 39 | 40 | if (CMAKE_VERSION VERSION_LESS 3.11.0) 41 | return() 42 | endif() 43 | 44 | add_custom_target(test 45 | COMMAND $ 46 | COMMENT Running tests.. 47 | DEPENDS test-exe 48 | ) 49 | 50 | add_custom_target(benchmark 51 | COMMAND $ 52 | COMMENT Running benchmarks.. 53 | DEPENDS benchmark-exe 54 | ) -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## How does the test work? 2 | 3 | Well, the program searches(in this dir) for files ending with `.md`. 4 | 5 | 1. It then converts the Markdown to HTML using [md4c](https://github.com/tim-gromeyer/MarkdownEdit_md4c). 6 | 2. 
Afterwards it converts the HTML back to Markdown. 7 | 3. The generated Markdown gets converted back to HTML 8 | 4. It compares the HTML generated from the original Markdown 9 | and the HTML generated from the converted Markdown. 10 | -------------------------------------------------------------------------------- /tests/benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "html2md.h" 13 | #include "md4c-html.h" 14 | #include "table.h" 15 | 16 | using std::cerr; 17 | using std::cout; 18 | using std::ifstream; 19 | using std::string; 20 | using std::stringstream; 21 | using std::vector; 22 | using std::chrono::duration; 23 | using std::chrono::high_resolution_clock; 24 | using std::chrono::microseconds; 25 | namespace fs = std::filesystem; 26 | 27 | // Markdown and HTML utility functions 28 | namespace markdown { 29 | void captureHtmlFragment(const MD_CHAR *data, const MD_SIZE data_size, 30 | void *userData) { 31 | auto *str = static_cast(userData); 32 | str->write(data, data_size); 33 | } 34 | 35 | string toHTML(const string &md) { 36 | stringstream html; 37 | static MD_TOC_OPTIONS options; 38 | md_html(md.c_str(), md.size(), &captureHtmlFragment, &html, MD_DIALECT_GITHUB, 39 | MD_HTML_FLAG_SKIP_UTF8_BOM, &options); 40 | return html.str(); 41 | } 42 | 43 | string fromHTML(string &html) { 44 | static html2md::Options options; 45 | options.splitLines = false; 46 | html2md::Converter c(html, &options); 47 | return c.convert(); 48 | } 49 | } // namespace markdown 50 | 51 | // Benchmark result structure 52 | struct BenchmarkResult { 53 | string test_name; 54 | double avg_time_us; // Average time in microseconds 55 | double std_dev_us; // Standard deviation in microseconds 56 | size_t input_size; // Input size in bytes 57 | double throughput_mbps; // Throughput in megabytes per 
second 58 | }; 59 | 60 | // Benchmark runner class 61 | class BenchmarkRunner { 62 | public: 63 | void addTest(const string &name, const string &input, bool is_markdown) { 64 | tests_.push_back({name, input, is_markdown}); 65 | } 66 | 67 | void run(int iterations) { 68 | auto start_total = high_resolution_clock::now(); // Start total timer 69 | for (const auto &test : tests_) { 70 | runTest(test, iterations); 71 | } 72 | auto end_total = high_resolution_clock::now(); // End total timer 73 | total_duration_ms_ = 74 | duration(end_total - start_total).count(); 75 | printSummary(); 76 | } 77 | 78 | private: 79 | struct Test { 80 | string name; 81 | string input; 82 | bool is_markdown; // false for HTML-to-Markdown 83 | }; 84 | 85 | vector tests_; 86 | vector results_; 87 | double total_duration_ms_ = 0.0; // Total duration in milliseconds 88 | 89 | void runTest(const Test &test, int iterations) { 90 | vector times_us(iterations); 91 | size_t input_size = test.input.size(); 92 | 93 | for (int i = 0; i < iterations; ++i) { 94 | auto start = high_resolution_clock::now(); 95 | if (test.is_markdown) { 96 | markdown::toHTML(test.input); 97 | } else { 98 | string input_copy = test.input; // fromHTML modifies input 99 | markdown::fromHTML(input_copy); 100 | } 101 | auto end = high_resolution_clock::now(); 102 | times_us[i] = duration(end - start).count(); 103 | } 104 | 105 | // Calculate average and standard deviation 106 | double sum = 0.0; 107 | for (double t : times_us) 108 | sum += t; 109 | double avg_time_us = sum / iterations; 110 | 111 | double variance_sum = 0.0; 112 | for (double t : times_us) { 113 | variance_sum += (t - avg_time_us) * (t - avg_time_us); 114 | } 115 | double std_dev_us = std::sqrt(variance_sum / iterations); 116 | 117 | // Calculate throughput (MB/s) 118 | double avg_time_s = avg_time_us / 1e6; 119 | double throughput_mbps = (input_size / (1024.0 * 1024.0)) / avg_time_s; 120 | 121 | results_.push_back( 122 | {test.name, avg_time_us, std_dev_us, 
input_size, throughput_mbps}); 123 | } 124 | 125 | void printSummary() { 126 | cout << "\n=== Benchmark Summary ===\n"; 127 | cout << std::left << std::setw(30) << "Test Name" << std::setw(15) 128 | << "Avg Time (us)" << std::setw(15) << "Std Dev (us)" << std::setw(15) 129 | << "Input Size (B)" << std::setw(15) << "Throughput (MB/s)\n"; 130 | cout << std::string(90, '-') << "\n"; 131 | 132 | for (const auto &result : results_) { 133 | cout << std::left << std::setw(30) << result.test_name << std::fixed 134 | << std::setprecision(2) << std::setw(15) << result.avg_time_us 135 | << std::setw(15) << result.std_dev_us << std::setw(15) 136 | << result.input_size << std::setw(15) << result.throughput_mbps 137 | << "\n"; 138 | } 139 | 140 | cout << "\nTotal Benchmark Duration: " << std::fixed << std::setprecision(2) 141 | << total_duration_ms_ << " ms\n"; 142 | } 143 | }; 144 | 145 | namespace file { 146 | string readAll(const string &name) { 147 | ifstream in(name); 148 | stringstream buffer; 149 | buffer << in.rdbuf(); 150 | return buffer.str(); 151 | } 152 | } // namespace file 153 | 154 | int main() { 155 | using namespace markdown; 156 | BenchmarkRunner runner; 157 | const int iterations = 10000; // Number of iterations per test 158 | 159 | // Add tests for Markdown files in the directory 160 | vector files; 161 | static vector markdownExtensions = {".md", ".markdown", ".mkd"}; 162 | for (const auto &p : fs::recursive_directory_iterator(DIR)) { 163 | if (std::find(markdownExtensions.begin(), markdownExtensions.end(), 164 | p.path().extension()) != markdownExtensions.end() && 165 | p.path().parent_path() == DIR) { 166 | files.emplace_back(p.path().string()); 167 | } 168 | } 169 | std::sort(files.begin(), files.end()); 170 | 171 | for (const auto &file : files) { 172 | string md = file::readAll(file); 173 | string html = markdown::toHTML(md); 174 | string filename = fs::path(file).filename().string(); 175 | runner.addTest(filename, html, false); 176 | } 177 | 178 | // 
Run benchmarks 179 | cout << "Running benchmarks with " << iterations 180 | << " iterations per test...\n"; 181 | runner.run(iterations); 182 | 183 | return 0; 184 | } -------------------------------------------------------------------------------- /tests/blockquote.md: -------------------------------------------------------------------------------- 1 | # Blockquote Demonstration 2 | 3 | Blockquotes can contain various Markdown elements, including code blocks and other formatting. 4 | 5 | ## Syntax 6 | 7 | You can create a blockquote by adding a `>` character before the quoted text. You can also nest blockquotes by using multiple `>` characters. 8 | 9 | ### Example 10 | 11 | > This is a simple blockquote. 12 | > It can span multiple lines. 13 | 14 | You can also include other Markdown elements within blockquotes: 15 | 16 | > Here's a list: 17 | > - Item 1 18 | > - Item 2 19 | > - Item 3 20 | 21 | And you can nest blockquotes as well: 22 | 23 | > This is a level 1 blockquote. 24 | > 25 | > > This is a nested level 2 blockquote. 26 | > > 27 | > > > This is a nested level 3 blockquote. 28 | 29 | ## Code Blocks 30 | 31 | You can include code blocks within blockquotes: 32 | 33 | > Here's an example of a code block: 34 | > 35 | > ``` 36 | > def greet(name): 37 | > print(f"Hello, {name}!") 38 | > ``` 39 | > 40 | > And here's inline code: `print("Markdown is great!")` 41 | 42 | ## Links and Images 43 | 44 | Links and images can also be included in blockquotes: 45 | 46 | > Check out the [Markdown Guide](https://www.markdownguide.org/) for more information. 47 | > 48 | > ![Markdown Logo](https://markdown-here.com/img/icon256.png) 49 | 50 | ## Conclusion 51 | 52 | Blockquotes are a versatile tool in Markdown that allow you to emphasize and format various types of content within a quoted context. 
53 | -------------------------------------------------------------------------------- /tests/breaks.md: -------------------------------------------------------------------------------- 1 | # Line Breaks Demo 2 | 3 | ## Double Space Method 4 | 5 | This is the first line. 6 | This line has a line break after it. 7 | 8 | This is another paragraph. 9 | And this line has a line break too. 10 | 11 | ## `<br>` Tag Method 12 | 13 | This line will be followed by a line break.<br>
14 | And this line will be on the next line. 15 | 16 | You can also use the `<br>` tag without closing it:<br>
    17 | This will continue on the same line, but with a space after. 18 | 19 | -------------------------------------------------------------------------------- /tests/code.md: -------------------------------------------------------------------------------- 1 | # Code Example Markdown 2 | 3 | ## Python Code 4 | 5 | You can include Python code blocks like this: 6 | 7 | ```python 8 | def factorial(n): 9 | if n == 0: 10 | return 1 11 | else: 12 | return n * factorial(n - 1) 13 | 14 | result = factorial(5) 15 | print("Factorial of 5:", result) 16 | ``` 17 | 18 | ## JavaScript Code 19 | 20 | JavaScript code can be included like this: 21 | 22 | ```javascript 23 | function fibonacci(n) { 24 | if (n <= 1) { 25 | return n; 26 | } else { 27 | return fibonacci(n - 1) + fibonacci(n - 2); 28 | } 29 | } 30 | 31 | const fibResult = fibonacci(6); 32 | console.log(`Fibonacci of 6: ${fibResult}`); 33 | ``` 34 | 35 | ## Inline Code 36 | 37 | You can also include inline code using backticks. For example, `print("Hello, World!")` is a simple Python print statement. 38 | 39 | ## Syntax Highlighting 40 | 41 | Markdown supports syntax highlighting for various programming languages, making your code more readable. For instance, you can specify the language after the triple backticks: 42 | 43 | ```java 44 | public class HelloWorld { 45 | public static void main(String[] args) { 46 | System.out.println("Hello, World!"); 47 | } 48 | } 49 | ``` 50 | 51 | Enjoy using code snippets in your Markdown files! 52 | -------------------------------------------------------------------------------- /tests/comment.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | HTML Comment Example 5 | 6 | 7 |

    Welcome to My Website

    8 | 9 |

    This is some content on my page.

    10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /tests/escaping.md: -------------------------------------------------------------------------------- 1 | # Escaping Special Symbols Demo 2 | 3 | This is a demonstration of how special symbols like `*` are escaped in Markdown. 4 | 5 | ## Asterisks 6 | 7 | To display an asterisk (\*) without triggering Markdown formatting, you can use a backslash: `\*`. 8 | 9 | ## Code Blocks 10 | 11 | You can also display code blocks inline using backticks (\`). For example, `var x = 5;`. 12 | 13 | ## Backslashes 14 | 15 | To display a backslash (\\) itself, you need to escape it with another backslash: \\\\. 16 | -------------------------------------------------------------------------------- /tests/formating.md: -------------------------------------------------------------------------------- 1 | # Formatting Demo 2 | 3 | This is a demonstration of various formatting options available in Markdown. 4 | 5 | ## Bold 6 | 7 | **This text is bold.** 8 | 9 | ## Italic 10 | 11 | *This text is italic.* 12 | 13 | ## Strikethrough 14 | 15 | ~~This text is strikethrough.~~ 16 | 17 | ## Underline 18 | 19 | This text is underlined using HTML inline styling. 20 | 21 | ## Combination 22 | 23 | You can also combine formatting options. For example, ***this text is bold and italic.*** 24 | 25 | ## Nested Formatting 26 | 27 | You can also nest formatting, such as combining **bold and *italic*** or ***bold and italic together.*** 28 | 29 | -------------------------------------------------------------------------------- /tests/links.md: -------------------------------------------------------------------------------- 1 | # Markdown Links and Images 2 | 3 | ## Regular Links 4 | 5 | - [Google](https://www.google.com) 6 | - [OpenAI](https://www.openai.com) 7 | - [GitHub](https://www.github.com) 8 | 9 | ## Inline Links 10 | 11 | Check out this [awesome website](https://www.example.com)! 
12 | 13 | ## Link with 14 | 15 | [gaminginlinux](gamingonlinux.com "Gaming rocks!") 16 | 17 | ## Reference Links 18 | 19 | - [Markdown Syntax][markdown] 20 | - [Markdown Cheatsheet][cheatsheet] 21 | 22 | [markdown]: https://www.markdownguide.org/basic-syntax/ 23 | [cheatsheet]: https://www.markdownguide.org/cheat-sheet/ 24 | 25 | ## Images 26 | 27 | ![Nature](https://www.example.com/images/nature.jpg) 28 | ![Space](https://www.example.com/images/space.jpg) 29 | 30 | ## Images with Alt Text 31 | 32 | ![Mountains](https://www.example.com/images/mountains.jpg "Beautiful Mountains") 33 | ![Beach](https://www.example.com/images/beach.jpg "Sunny Beach") 34 | 35 | ## Images with Links 36 | 37 | [![Sunset](https://www.example.com/images/sunset.jpg)](https://www.example.com) 38 | 39 | ## Images with References 40 | 41 | [![Forest][forest-image]][forest-link] 42 | 43 | [forest-image]: https://www.example.com/images/forest.jpg 44 | [forest-link]: https://www.example.com/nature/forest 45 | 46 | -------------------------------------------------------------------------------- /tests/lists.md: -------------------------------------------------------------------------------- 1 | - foo 2 | - - bar 3 | 4 | 1. foo 5 | 2. 
bar 6 | 7 | - list entry with 8 | break 9 | - - Another 10 | break 11 | foo 12 | bar 13 | - Hello World 14 | -------------------------------------------------------------------------------- /tests/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "html2md.h" 11 | #include "md4c-html.h" 12 | #include "table.h" 13 | 14 | using std::cerr; 15 | using std::cout; 16 | using std::ifstream; 17 | using std::string; 18 | using std::stringstream; 19 | using std::vector; 20 | using std::chrono::duration; 21 | using std::chrono::high_resolution_clock; 22 | using std::chrono::milliseconds; 23 | namespace fs = std::filesystem; 24 | 25 | namespace markdown { 26 | void captureHtmlFragment(const MD_CHAR *data, const MD_SIZE data_size, 27 | void *userData) { 28 | auto *str = static_cast(userData); 29 | 30 | str->write(data, data_size); 31 | } 32 | 33 | string toHTML(const string &md) { 34 | stringstream html; 35 | 36 | static MD_TOC_OPTIONS options; 37 | 38 | md_html(md.c_str(), md.size(), &captureHtmlFragment, &html, MD_DIALECT_GITHUB, 39 | MD_HTML_FLAG_SKIP_UTF8_BOM, &options); 40 | 41 | return html.str(); 42 | }; 43 | 44 | string fromHTML(string &html) { 45 | static html2md::Options options; 46 | options.splitLines = false; 47 | 48 | html2md::Converter c(html, &options); 49 | return c.convert(); 50 | } 51 | } // namespace markdown 52 | 53 | namespace file { 54 | string readAll(const string &name) { 55 | ifstream in(name); 56 | stringstream buffer; 57 | buffer << in.rdbuf(); 58 | return buffer.str(); 59 | }; 60 | } // namespace file 61 | 62 | // Log the error 63 | void log(const string &file, const string &origMd, const string &generatedMd) { 64 | cerr << "Task " << fs::path(file).filename() << " failed:\nOriginal Md:\n" 65 | << origMd << "\nGenerated Markdown:\n" 66 | << generatedMd << '\n'; 67 | } 68 | 69 | // Print 
"Running " + filename 70 | void running(const string &file) { 71 | cout << "Running test " << fs::path(file).filename() << "...\t"; 72 | } 73 | 74 | // Print "Passed!" in green 75 | void passed() { cout << "\x1B[32mPassed!\033[0m\n"; } 76 | 77 | // Print "Failed!" in red 78 | void error() { cout << "\x1B[31mFailed!\033[0m\n"; } 79 | 80 | void runTest(const string &file, short *errorCount) { 81 | // Read the markdown file 82 | const string md = file::readAll(file); 83 | 84 | running(file); 85 | 86 | // Convert the Md to HTML 87 | string html = markdown::toHTML(md); 88 | 89 | // Generate Md from the HTML 90 | string convertedMd = markdown::fromHTML(html); 91 | 92 | // Convert it back to HTML 93 | string testHTML = markdown::toHTML(convertedMd); 94 | 95 | // Compare original and result HTML 96 | if (html == testHTML) 97 | passed(); 98 | else { 99 | error(); 100 | log(file, md, convertedMd); 101 | ++*errorCount; 102 | } 103 | } 104 | 105 | void testOption(const char *name) { 106 | cout << "Test option \"" << name << "\"...\t"; 107 | } 108 | 109 | bool testUnorderedList() { 110 | testOption("unorderedList"); 111 | 112 | string html = "
    • List
    "; 113 | 114 | html2md::Options o; 115 | o.unorderedList = '*'; 116 | 117 | html2md::Converter c(html, &o); 118 | 119 | auto md = c.convert(); 120 | 121 | return md.find("* List\n") != string::npos; 122 | } 123 | 124 | bool testOrderedList() { 125 | testOption("orderedList"); 126 | 127 | string html = "
    1. List
    "; 128 | 129 | html2md::Options o; 130 | o.orderedList = ')'; 131 | 132 | html2md::Converter c(html, &o); 133 | 134 | auto md = c.convert(); 135 | 136 | return md.find("1) List\n") != string::npos; 137 | } 138 | 139 | bool testDisableTitle() { 140 | testOption("includeTitle"); 141 | 142 | string html = "HTML title"; 143 | 144 | html2md::Options o; 145 | o.includeTitle = false; 146 | 147 | html2md::Converter c(html, &o); 148 | 149 | auto md = c.convert(); 150 | 151 | return md.empty() && 152 | html2md::Convert(html).find("HTML title") != string::npos; 153 | } 154 | 155 | bool testFormatTable() { 156 | testOption("formatTable"); 157 | 158 | constexpr const char *inputTable = "| 1 | 2 | 3 |\n" 159 | "| :-- | :-: | --: |\n" 160 | "| Hello | World | ! |\n" 161 | "| foo | bar | buzz |\n"; 162 | 163 | constexpr const char *expectedOutput = "| 1 | 2 | 3 |\n" 164 | "|:------|:-----:|-----:|\n" 165 | "| Hello | World | ! |\n" 166 | "| foo | bar | buzz |\n"; 167 | 168 | string formattedTable = formatMarkdownTable(inputTable); 169 | 170 | return formattedTable == expectedOutput; 171 | } 172 | 173 | bool testAttributeWhitespace() { 174 | testOption("attributeWhitespace"); 175 | 176 | // Test different variations of whitespace around equals sign 177 | vector testCases = { 178 | "no space", 179 | "space before", 180 | "space after", 181 | "space both sides"}; 182 | 183 | for (const auto &html : testCases) { 184 | html2md::Converter c(html); 185 | auto md = c.convert(); 186 | 187 | // Basic check that the conversion worked 188 | if (md.empty()) { 189 | cerr << "Failed to convert: " << html << "\n"; 190 | return false; 191 | } 192 | 193 | // For anchor tags, check if URL was properly extracted 194 | if (html.find(" testCases = {"
    Uppercase div
    ", 209 | "

    Uppercase paragraph

    ", 210 | "Uppercase strong", 211 | "Uppercase em", 212 | "

    Uppercase h1

    ", 213 | "
    Uppercase blockquote
    "}; 214 | 215 | for (const auto &html : testCases) { 216 | html2md::Converter c(html); 217 | auto md = c.convert(); 218 | 219 | if (md.empty()) { 220 | cerr << "Failed to convert uppercase tag: " << html << "\n"; 221 | return false; 222 | } 223 | 224 | // Check that content was properly converted 225 | if (md.find("Uppercase") == string::npos) { 226 | cerr << "Content missing from uppercase tag conversion: " << html << "\n"; 227 | return false; 228 | } 229 | } 230 | 231 | return true; 232 | } 233 | 234 | bool testUppercaseAttributes() { 235 | testOption("uppercaseAttributes"); 236 | 237 | vector testCases = { 238 | "link", 239 | "\"Image\"", 240 | "
    content
    "}; 241 | 242 | for (const auto &html : testCases) { 243 | html2md::Converter c(html); 244 | auto md = c.convert(); 245 | 246 | if (md.empty()) { 247 | cerr << "Failed to convert uppercase attributes: " << html << "\n"; 248 | return false; 249 | } 250 | 251 | // For anchor tags, check if URL was properly extracted 252 | if (html.find(" testCases = {"
    Mixed case div
    ", 277 | "

    Mixed case paragraph

    ", 278 | "Mixed case strong", 279 | "Mixed case em", 280 | "

    Mixed case h1

    ", 281 | "
    Mixed case blockquote
    "}; 282 | 283 | for (const auto &html : testCases) { 284 | html2md::Converter c(html); 285 | auto md = c.convert(); 286 | 287 | if (md.empty()) { 288 | cerr << "Failed to convert mixed case tag: " << html << "\n"; 289 | return false; 290 | } 291 | 292 | // Check that content was properly converted 293 | if (md.find("Mixed case") == string::npos) { 294 | cerr << "Content missing from mixed case tag conversion: " << html 295 | << "\n"; 296 | return false; 297 | } 298 | } 299 | 300 | return true; 301 | } 302 | 303 | bool testSelfClosingUppercaseTags() { 304 | testOption("selfClosingUppercaseTags"); 305 | 306 | vector testCases = {"
    ", "
    ", "", 307 | ""}; 308 | 309 | for (const auto &html : testCases) { 310 | html2md::Converter c(html); 311 | auto md = c.convert(); 312 | 313 | if (html.find("> testCases = { 334 | // { HTML input, Expected Markdown output } 335 | {"< p >Hello", "Hello\n"}, 336 | {"< p>Text", "Text\n"}, 337 | {"

    Text

    ", "Text\n"} 338 | }; 339 | 340 | for (const auto &[html, expectedMd] : testCases) { 341 | html2md::Converter c(html); 342 | auto md = c.convert(); 343 | 344 | if (md != expectedMd) { 345 | cout << "Failed to convert whitespace tag: " << html << "\n" 346 | << "Expected Markdown: " << expectedMd << "\n" 347 | << "Generated Markdown: " << md << "\n"; 348 | return false; 349 | } 350 | } 351 | 352 | return true; 353 | } 354 | 355 | // Test self closing tags First
    then second 356 | bool testSelfClosingTags() { 357 | testOption("selfClosingTags"); 358 | 359 | string html = "First
    then second"; 360 | 361 | html2md::Converter c(html); 362 | auto md = c.convert(); 363 | 364 | return md.find("[First](http://example1.com/)") != string::npos && 365 | md.find("[second](http://example2.com)") != string::npos && 366 | md.find(" \n") != string::npos; 367 | } 368 | 369 | bool testZeroWidthSpaceWithBlockquote() { 370 | testOption("zeroWidthSpaceWithBlockquote"); 371 | 372 | std::vector> testCases = { 373 | // { HTML input, Expected Markdown output } 374 | {"Text\xe2\x80\x8b
    a
    ", 376 | "Text\u200b\n> a\n"}, 377 | {"Text
    a
    ", 378 | "Text\n> a\n"}, 379 | {"Text
    a\nb
    ", 380 | "Text\n> a\n> b\n"}}; 381 | 382 | for (const auto &[html, expectedMd] : testCases) { 383 | html2md::Converter c(html); 384 | auto md = c.convert(); 385 | 386 | if (md != expectedMd) { 387 | cout << "Failed to convert HTML with zero-width space and blockquote: " 388 | << html << "\n" 389 | << "Expected Markdown: " << expectedMd << "\n" 390 | << "Generated Markdown: " << md << "\n"; 391 | return false; 392 | } 393 | } 394 | 395 | return true; 396 | } 397 | 398 | bool testInvalidTags() { 399 | testOption("invalidTags"); 400 | 401 | // Test cases with various invalid tags 402 | vector 403 | testCases = 404 | { 405 | "

    Valid tag

    ", 406 | "

    Self-closing

    ", 407 | "

    Nested " 408 | "tags

    ", 409 | "

    Vi example

    ", // The specific 411 | // test case from 412 | // the issue 413 | "

    Text with <123invalid>tag

    ", 414 | "

    Text with content

    "}; 415 | 416 | vector expectedOutputs = { 417 | "Valid tag\n", 418 | "Self-closing\n", 419 | "Nested tags\n", 420 | "Vi [example](http://example.com/)\n", // Expected output for 421 | // the specific test 422 | // case 423 | "Text with tag\n", 424 | "Text with content\n"}; 425 | 426 | for (size_t i = 0; i < testCases.size(); i++) { 427 | html2md::Converter c(testCases[i]); 428 | auto md = c.convert(); 429 | 430 | if (md != expectedOutputs[i]) { 431 | cout << "Failed to handle invalid tags:\n" 432 | << "Input: " << testCases[i] << "\n" 433 | << "Expected: " << expectedOutputs[i] << "\n" 434 | << "Got: " << md << "\n"; 435 | return false; 436 | } 437 | } 438 | 439 | return true; 440 | } 441 | 442 | int main(int argc, const char **argv) { 443 | // List to store all markdown files in this dir 444 | vector files; 445 | 446 | static vector markdownExtensions = {".md", ".markdown", ".mkd"}; 447 | 448 | // Find the files 449 | for (const auto &p : fs::recursive_directory_iterator(DIR)) { 450 | if (std::find(markdownExtensions.begin(), markdownExtensions.end(), 451 | p.path().extension()) != markdownExtensions.end() && 452 | p.path().parent_path() == DIR) 453 | files.emplace_back(p.path().string()); 454 | } 455 | 456 | // Test files passed as argument 457 | for (int i = 1; i < argc; i++) { 458 | // Check if the argument is a valid file path and ends with ".md" 459 | string file = argv[i]; 460 | if (fs::is_regular_file(file) && file.find(".md") == file.size() - 3) { 461 | files.emplace_back(file); 462 | } 463 | } 464 | 465 | // Sort file names 466 | sort(files.begin(), files.end()); 467 | 468 | // File name 469 | const char *errorFileName = DIR "/error.log"; 470 | 471 | // Redirect errors to error.log 472 | FILE *errorFile = freopen(errorFileName, "w", stderr); 473 | if (!errorFile) 474 | cerr << "Failed to open " << errorFileName 475 | << " for whatever reason!\n" 476 | "Errors will be printed to the terminal instead of written to the " 477 | "mentioned file above."; 
478 | 479 | // For measuring time. 480 | auto t1 = high_resolution_clock::now(); 481 | 482 | // Count the errors 483 | short errorCount = 0; 484 | 485 | // Run the tests 486 | for (auto &file : files) 487 | runTest(file, &errorCount); 488 | 489 | // Test the options 490 | auto tests = {&testDisableTitle, 491 | &testUnorderedList, 492 | &testOrderedList, 493 | &testFormatTable, 494 | &testAttributeWhitespace, 495 | &testUppercaseTags, 496 | &testUppercaseAttributes, 497 | &testMixedCaseTags, 498 | &testSelfClosingUppercaseTags, 499 | &testWhitespaceTags, 500 | &testSelfClosingTags, 501 | &testZeroWidthSpaceWithBlockquote, 502 | &testInvalidTags, 503 | }; 504 | 505 | for (const auto &test : tests) 506 | if (!test()) { 507 | ++errorCount; 508 | error(); 509 | } else 510 | passed(); 511 | 512 | auto t2 = high_resolution_clock::now(); 513 | 514 | /* Getting number of milliseconds as a double. */ 515 | duration ms_double = t2 - t1; 516 | 517 | cout << files.size() + tests.size() << " tests executed in " 518 | << ms_double.count() << "ms. " << errorCount << " failed.\n"; 519 | 520 | return 0; 521 | } 522 | -------------------------------------------------------------------------------- /tests/python/test_advanced.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyhtml2md 3 | 4 | def test_basic_conversion(): 5 | # Test basic header conversion 6 | assert pyhtml2md.convert("

    Hello Python!

    ") == "# Hello Python!\n" 7 | 8 | # Test basic paragraph 9 | assert pyhtml2md.convert("

    Simple paragraph

    ") == "Simple paragraph\n" 10 | 11 | def test_converter_class(): 12 | # Test converter initialization and conversion 13 | converter = pyhtml2md.Converter("

    Hello Python!

    ") 14 | assert converter.convert() == "# Hello Python!\n" 15 | assert converter.ok() == True 16 | 17 | # Test boolean operator 18 | assert bool(converter) == True 19 | 20 | def test_options(): 21 | # Test options configuration 22 | options = pyhtml2md.Options() 23 | options.splitLines = False 24 | options.unorderedList = '*' 25 | options.orderedList = ')' 26 | options.includeTitle = False 27 | 28 | html = "
    • First
    • Second
    " 29 | converter = pyhtml2md.Converter(html, options) 30 | result = converter.convert() 31 | assert result.startswith('* First') 32 | assert converter.ok() 33 | 34 | def test_complex_formatting(): 35 | html = """ 36 |

    Main Title

    37 |

    Bold text and italic text

    38 |
      39 |
    • First item
    • 40 |
    • Second item
    • 41 |
    42 |
      43 |
    1. Numbered one
    2. 44 |
    3. Numbered two
    4. 45 |
    46 | """ 47 | options = pyhtml2md.Options() 48 | options.splitLines = False 49 | converter = pyhtml2md.Converter(html, options) 50 | result = converter.convert() 51 | 52 | assert "# Main Title" in result 53 | assert "**Bold text**" in result 54 | assert "*italic text*" in result 55 | assert "1. Numbered one" in result 56 | assert "2. Numbered two" in result 57 | 58 | def test_line_breaks(): 59 | # Test br outside paragraphs 60 | assert "Text \nText2" in pyhtml2md.convert("Text
    Text2") 61 | 62 | # Test br inside paragraphs 63 | assert "Line 1 \nLine 2" in pyhtml2md.convert("

    Line 1
    Line 2

    ") 64 | 65 | # Test br with bullet points in paragraph 66 | assert "Primary Colors: \n• Red \n• Blue \n• Yellow" in pyhtml2md.convert("

    Primary Colors:
    • Red
    • Blue
    • Yellow

    ") 67 | 68 | # Test soft line break settings 69 | html = "A very long line of text that should be wrapped according to the soft break and hard break settings" 70 | options = pyhtml2md.Options() 71 | options.splitLines = True 72 | options.softBreak = 20 73 | options.hardBreak = 30 74 | 75 | converter = pyhtml2md.Converter(html, options) 76 | result = converter.convert() 77 | lines = result.split('\n') 78 | assert any(len(line) <= 30 for line in lines) 79 | 80 | def test_table_formatting(): 81 | html = """ 82 | 83 | 84 | 85 |
    Header 1Header 2
    Data 1Data 2
    86 | """ 87 | options = pyhtml2md.Options() 88 | options.formatTable = True 89 | converter = pyhtml2md.Converter(html, options) 90 | result = converter.convert() 91 | 92 | assert "|" in result 93 | assert "Data 1" in result 94 | 95 | def test_error_handling(): 96 | # Test with malformed HTML 97 | html = "

    Unclosed paragraph" 98 | converter = pyhtml2md.Converter(html) 99 | converter.convert() 100 | assert not converter.ok() 101 | 102 | def test_options_equality(): 103 | options1 = pyhtml2md.Options() 104 | options2 = pyhtml2md.Options() 105 | 106 | assert options1 == options2 107 | 108 | options2.splitLines = False 109 | assert options1 != options2 110 | 111 | def test_special_characters(): 112 | html = "

    <special> & "characters"

    " 113 | result = pyhtml2md.convert(html) 114 | assert "" in result 115 | assert '"characters"' in result 116 | assert "&" in result 117 | 118 | def test_html_entities(): 119 | html = """ 120 |

    "Double quotes" <less than> >greater than< & ampersand   non-breaking space → right arrow

    121 | """ 122 | options = pyhtml2md.Options() 123 | options.splitLines = False 124 | converter = pyhtml2md.Converter(html, options) 125 | result = converter.convert() 126 | 127 | assert '"Double quotes"' in result 128 | assert "" in result 129 | assert ">greater than<" in result 130 | assert "& ampersand" in result 131 | assert " non-breaking space" in result 132 | assert "→ right arrow" in result 133 | 134 | def test_nested_structures(): 135 | html = """ 136 |
    137 |

    Quoted text with bold and italic

    138 |
      139 |
    • Nested list
    • 140 |
    141 |
    142 | """ 143 | result = pyhtml2md.convert(html) 144 | assert ">" in result # blockquote marker 145 | assert "**bold**" in result 146 | assert "*italic*" in result 147 | assert "**list**" in result 148 | 149 | if __name__ == "__main__": 150 | pytest.main([__file__]) -------------------------------------------------------------------------------- /tests/python/test_basic.py: -------------------------------------------------------------------------------- 1 | import pyhtml2md 2 | 3 | def test_main(): 4 | assert pyhtml2md.convert("

    Hello, world!

    ") == "# Hello, world!\n" 5 | 6 | -------------------------------------------------------------------------------- /tests/python/test_html_symbol_conversions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyhtml2md 3 | 4 | def test_default_html_symbol_conversions(): 5 | """Test that default HTML symbol conversions work correctly""" 6 | 7 | assert '"' in pyhtml2md.convert(""") 8 | assert "<" in pyhtml2md.convert("<") 9 | assert ">" in pyhtml2md.convert(">") 10 | assert "&" in pyhtml2md.convert("&") 11 | assert "→" in pyhtml2md.convert("→") 12 | assert "→" in pyhtml2md.convert("→") 13 | 14 | def test_add_html_symbol_conversion(): 15 | """Test adding new HTML symbol conversions""" 16 | converter = pyhtml2md.Converter("© ® &custom;") 17 | 18 | # Before adding conversions 19 | result = converter.convert() 20 | assert "©" in result 21 | assert "®" in result 22 | assert "&custom;" in result 23 | 24 | # Add new conversions 25 | converter = pyhtml2md.Converter("© ® &custom;") 26 | converter.add_html_symbol_conversion("©", "©") 27 | converter.add_html_symbol_conversion("®", "®") 28 | converter.add_html_symbol_conversion("&custom;", "CUSTOM") 29 | 30 | result = converter.convert() 31 | assert "©" in result 32 | assert "®" in result 33 | assert "CUSTOM" in result 34 | 35 | def test_modify_html_symbol_conversion(): 36 | """Test modifying existing HTML symbol conversions""" 37 | converter = pyhtml2md.Converter(" ") 38 | 39 | # Default conversion 40 | assert " " in converter.convert() 41 | 42 | # Modify the conversion 43 | converter = pyhtml2md.Converter(" ") 44 | converter.add_html_symbol_conversion(" ", "\t") 45 | assert "\t" in converter.convert() 46 | 47 | def test_remove_html_symbol_conversion(): 48 | """Test removing HTML symbol conversions""" 49 | converter = pyhtml2md.Converter("& <") 50 | 51 | # Remove & conversion 52 | converter.remove_html_symbol_conversion("&") 53 | result = converter.convert() 54 
| assert "&" in result # Should remain unconverted 55 | assert "<" in result # < should still be converted 56 | 57 | def test_clear_html_symbol_conversions(): 58 | """Test clearing all HTML symbol conversions""" 59 | converter = pyhtml2md.Converter("" < > &  ") 60 | 61 | # Clear all conversions 62 | converter.clear_html_symbol_conversions() 63 | result = converter.convert() 64 | 65 | # All symbols should remain unconverted 66 | assert """ in result 67 | assert "<" in result 68 | assert ">" in result 69 | assert "&" in result 70 | assert " " in result 71 | 72 | def test_multiple_conversions_in_text(): 73 | """Test that multiple conversions work in the same text""" 74 | html = """ 75 |

    76 | "Quoted text" with <tags> and & entities   separated by → arrows 77 |

    78 | """ 79 | options = pyhtml2md.Options() 80 | options.splitLines = False 81 | converter = pyhtml2md.Converter(html, options) 82 | 83 | # Add some custom conversions 84 | converter.add_html_symbol_conversion("→", "->") 85 | converter.add_html_symbol_conversion(" ", "\t") 86 | 87 | result = converter.convert() 88 | assert '"Quoted text"' in result 89 | assert "" in result 90 | assert "& entities" in result # & becomes & 91 | assert "\t separated by" in result 92 | assert "-> arrows" in result 93 | 94 | def test_html_symbols_in_attributes(): 95 | """Test that HTML symbols in attributes are converted""" 96 | html = 'Link' 97 | converter = pyhtml2md.Converter(html) 98 | 99 | result = converter.convert() 100 | assert '[Link](page.html?p1=1&p2=2 "' in result 101 | # assert '"Title")' in result # TODO: Fix this assertion 102 | 103 | def test_special_case_conversions(): 104 | """Test special cases like numeric and named entities""" 105 | converter = pyhtml2md.Converter("© ® 😀") 106 | 107 | # Add numeric entity conversions 108 | converter.add_html_symbol_conversion("©", "©") # © 109 | converter.add_html_symbol_conversion("®", "®") # ® 110 | converter.add_html_symbol_conversion("😀", "😀") # 😀 111 | 112 | result = converter.convert() 113 | assert "©" in result 114 | assert "®" in result 115 | assert "😀" in result 116 | 117 | if __name__ == "__main__": 118 | pytest.main([__file__]) -------------------------------------------------------------------------------- /tests/tables.md: -------------------------------------------------------------------------------- 1 | Simple table: 2 | 3 | | foo | 1 | 4 | |-----|---| 5 | | 1 | 3 | 6 | | bar | 5 | 7 | 8 | 9 | Table with alignment: 10 | 11 | | Syntax | Description | Test Text | 12 | | :-------- | :---------: | ----------: | 13 | | Header | Title | Here's this | 14 | | Paragraph | Text | And more | 15 | 16 | Table with line breaks: 17 | 18 | | From | To | 19 | |-------------- |----------------------------------------------- | 20 | 
| **Plain** | C-string
    Sorted
    MD5
    SHA256
    SHA512 | 21 | | **Markdown** | HTML
    Plain | 22 | | **HTML** | Markdown
    Plain | 23 | | **C-string** | Plain | 24 | 25 | Table with code: 26 | 27 | | table | 28 | |:-------:| 29 | | `code` | 30 | | no code | 31 | --------------------------------------------------------------------------------