├── .clang-format ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── dependabot.yml └── workflows │ ├── build.yml │ ├── release.yml │ ├── website.yml │ └── wheels.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CMakeLists.txt ├── CONTRIBUTING.md ├── COPYING ├── MANIFEST.in ├── Package.swift ├── README.md ├── SECURITY.md ├── cli └── main.cpp ├── cmake ├── Doc.cmake └── Packaging.cmake ├── docs ├── Doxyfile ├── doxygen-awesome-css │ ├── .gitignore │ ├── LICENSE │ ├── doxygen-awesome-darkmode-toggle.js │ ├── doxygen-awesome-fragment-copy-button.js │ ├── doxygen-awesome-interactive-toc.js │ ├── doxygen-awesome-paragraph-link.js │ ├── doxygen-awesome-sidebar-only-darkmode-toggle.css │ ├── doxygen-awesome-sidebar-only.css │ ├── doxygen-awesome.css │ └── doxygen-custom │ │ ├── custom.css │ │ └── header.html └── index.md ├── html2md.pc.in ├── html2mdConfig.cmake.in ├── include ├── html2md.h └── table.h ├── js └── bindings.cpp ├── objc ├── html2md_objc.mm └── include │ └── html2md_objc.h ├── pyproject.toml ├── python ├── README.md └── bindings.cpp ├── scripts └── clang-format.sh ├── src ├── html2md.cpp └── table.cpp └── tests ├── CMakeLists.txt ├── README.md ├── blockquote.md ├── breaks.md ├── code.md ├── comment.html ├── escaping.md ├── formating.md ├── links.md ├── lists.md ├── main.cpp ├── tables.md ├── test_advanced.py └── test_basic.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignArrayOfStructures: None 7 | AlignConsecutiveMacros: None 8 | AlignConsecutiveAssignments: None 9 | AlignConsecutiveBitFields: None 10 | AlignConsecutiveDeclarations: None 11 | AlignEscapedNewlines: Right 12 | AlignOperands: Align 13 | AlignTrailingComments: true 14 | AllowAllArgumentsOnNextLine: true 15 | AllowAllParametersOfDeclarationOnNextLine: true 
16 | AllowShortEnumsOnASingleLine: true 17 | AllowShortBlocksOnASingleLine: Never 18 | AllowShortCaseLabelsOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: All 20 | AllowShortLambdasOnASingleLine: All 21 | AllowShortIfStatementsOnASingleLine: Never 22 | AllowShortLoopsOnASingleLine: false 23 | AlwaysBreakAfterDefinitionReturnType: None 24 | AlwaysBreakAfterReturnType: None 25 | AlwaysBreakBeforeMultilineStrings: false 26 | AlwaysBreakTemplateDeclarations: MultiLine 27 | AttributeMacros: 28 | - __capability 29 | BinPackArguments: true 30 | BinPackParameters: true 31 | BraceWrapping: 32 | AfterCaseLabel: false 33 | AfterClass: false 34 | AfterControlStatement: Never 35 | AfterEnum: false 36 | AfterFunction: false 37 | AfterNamespace: false 38 | AfterObjCDeclaration: false 39 | AfterStruct: false 40 | AfterUnion: false 41 | AfterExternBlock: false 42 | BeforeCatch: false 43 | BeforeElse: false 44 | BeforeLambdaBody: false 45 | BeforeWhile: false 46 | IndentBraces: false 47 | SplitEmptyFunction: true 48 | SplitEmptyRecord: true 49 | SplitEmptyNamespace: true 50 | BreakBeforeBinaryOperators: None 51 | BreakBeforeConceptDeclarations: true 52 | BreakBeforeBraces: Attach 53 | BreakBeforeInheritanceComma: false 54 | BreakInheritanceList: BeforeColon 55 | BreakBeforeTernaryOperators: true 56 | BreakConstructorInitializersBeforeComma: false 57 | BreakConstructorInitializers: BeforeColon 58 | BreakAfterJavaFieldAnnotations: false 59 | BreakStringLiterals: true 60 | ColumnLimit: 80 61 | CommentPragmas: '^ IWYU pragma:' 62 | QualifierAlignment: Leave 63 | CompactNamespaces: false 64 | ConstructorInitializerIndentWidth: 4 65 | ContinuationIndentWidth: 4 66 | Cpp11BracedListStyle: true 67 | DeriveLineEnding: true 68 | DerivePointerAlignment: false 69 | DisableFormat: false 70 | EmptyLineAfterAccessModifier: Never 71 | EmptyLineBeforeAccessModifier: LogicalBlock 72 | ExperimentalAutoDetectBinPacking: false 73 | PackConstructorInitializers: BinPack 74 | BasedOnStyle: '' 
75 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 76 | AllowAllConstructorInitializersOnNextLine: true 77 | FixNamespaceComments: true 78 | ForEachMacros: 79 | - foreach 80 | - Q_FOREACH 81 | - BOOST_FOREACH 82 | IfMacros: 83 | - KJ_IF_MAYBE 84 | IncludeBlocks: Preserve 85 | IncludeCategories: 86 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 87 | Priority: 2 88 | SortPriority: 0 89 | CaseSensitive: false 90 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 91 | Priority: 3 92 | SortPriority: 0 93 | CaseSensitive: false 94 | - Regex: '.*' 95 | Priority: 1 96 | SortPriority: 0 97 | CaseSensitive: false 98 | IncludeIsMainRegex: '(Test)?$' 99 | IncludeIsMainSourceRegex: '' 100 | IndentAccessModifiers: false 101 | IndentCaseLabels: false 102 | IndentCaseBlocks: false 103 | IndentGotoLabels: true 104 | IndentPPDirectives: None 105 | IndentExternBlock: AfterExternBlock 106 | IndentRequires: false 107 | IndentWidth: 2 108 | IndentWrappedFunctionNames: false 109 | InsertTrailingCommas: None 110 | JavaScriptQuotes: Leave 111 | JavaScriptWrapImports: true 112 | KeepEmptyLinesAtTheStartOfBlocks: true 113 | LambdaBodyIndentation: Signature 114 | MacroBlockBegin: '' 115 | MacroBlockEnd: '' 116 | MaxEmptyLinesToKeep: 1 117 | NamespaceIndentation: None 118 | ObjCBinPackProtocolList: Auto 119 | ObjCBlockIndentWidth: 2 120 | ObjCBreakBeforeNestedBlockParam: true 121 | ObjCSpaceAfterProperty: false 122 | ObjCSpaceBeforeProtocolList: true 123 | PenaltyBreakAssignment: 2 124 | PenaltyBreakBeforeFirstCallParameter: 19 125 | PenaltyBreakComment: 300 126 | PenaltyBreakFirstLessLess: 120 127 | PenaltyBreakOpenParenthesis: 0 128 | PenaltyBreakString: 1000 129 | PenaltyBreakTemplateDeclaration: 10 130 | PenaltyExcessCharacter: 1000000 131 | PenaltyReturnTypeOnItsOwnLine: 60 132 | PenaltyIndentedWhitespace: 0 133 | PointerAlignment: Right 134 | PPIndentWidth: -1 135 | ReferenceAlignment: Pointer 136 | ReflowComments: true 137 | RemoveBracesLLVM: false 138 | SeparateDefinitionBlocks: Leave 139 
| ShortNamespaceLines: 1 140 | SortIncludes: CaseSensitive 141 | SortJavaStaticImport: Before 142 | SortUsingDeclarations: true 143 | SpaceAfterCStyleCast: false 144 | SpaceAfterLogicalNot: false 145 | SpaceAfterTemplateKeyword: true 146 | SpaceBeforeAssignmentOperators: true 147 | SpaceBeforeCaseColon: false 148 | SpaceBeforeCpp11BracedList: false 149 | SpaceBeforeCtorInitializerColon: true 150 | SpaceBeforeInheritanceColon: true 151 | SpaceBeforeParens: ControlStatements 152 | SpaceBeforeParensOptions: 153 | AfterControlStatements: true 154 | AfterForeachMacros: true 155 | AfterFunctionDefinitionName: false 156 | AfterFunctionDeclarationName: false 157 | AfterIfMacros: true 158 | AfterOverloadedOperator: false 159 | BeforeNonEmptyParentheses: false 160 | SpaceAroundPointerQualifiers: Default 161 | SpaceBeforeRangeBasedForLoopColon: true 162 | SpaceInEmptyBlock: false 163 | SpaceInEmptyParentheses: false 164 | SpacesBeforeTrailingComments: 1 165 | SpacesInAngles: Never 166 | SpacesInConditionalStatement: false 167 | SpacesInContainerLiterals: true 168 | SpacesInCStyleCastParentheses: false 169 | SpacesInLineCommentPrefix: 170 | Minimum: 1 171 | Maximum: -1 172 | SpacesInParentheses: false 173 | SpacesInSquareBrackets: false 174 | SpaceBeforeSquareBrackets: false 175 | BitFieldColonSpacing: Both 176 | Standard: Latest 177 | StatementAttributeLikeMacros: 178 | - Q_EMIT 179 | StatementMacros: 180 | - Q_UNUSED 181 | - QT_REQUIRE_VERSION 182 | TabWidth: 8 183 | UseCRLF: false 184 | UseTab: Never 185 | WhitespaceSensitiveMacros: 186 | - STRINGIZE 187 | - PP_STRINGIZE 188 | - BOOST_PP_STRINGIZE 189 | - NS_SWIFT_NAME 190 | - CF_SWIFT_NAME 191 | ... 
192 | 193 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Generated by CODEOWNERS.com 2 | 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Run '....' 16 | 2. See error 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Output of `dmesg | tail -n2`** 22 | (for Linux and maybe mac) 23 | 24 | **Desktop (please complete the following information):** 25 | - OS: [e.g. Ubuntu 22.04] 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | For example 'How customize lists' 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | # Maintain dependencies for GitHub Actions 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "weekly" 13 | 14 | # Keep submodules up to date 15 | - package-ecosystem: "gitsubmodule" 16 | directory: "/" 17 | schedule: 18 | interval: "weekly" 19 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: 'Build' 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: [ubuntu-latest, macos-latest, windows-latest] 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | 21 | # 22 | # Build using CMake 23 | # 24 | - name: Build using CMake 25 | run: | 26 | mkdir build && cd build 27 | cmake .. 
28 | cmake --build . -j8 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Create release and update assets 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | name: Build and upload release assetes 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | include: 17 | - os: ubuntu-20.04 18 | files: | 19 | build/packages/html2md*.deb 20 | build/packages/html2md*.tar.gz 21 | - os: ubuntu-22.04 22 | files: | 23 | build/packages/html2md*.deb 24 | - os: windows-latest 25 | files: | 26 | build/packages/html2md*.zip 27 | steps: 28 | - name: Checkout repo 29 | uses: actions/checkout@v4 30 | 31 | - name: Build and package 32 | run: | 33 | mkdir build && cd build 34 | cmake -DBUILD_TEST=OFF -DBUILD_DOC=OFF -DCMAKE_BUILD_TYPE=Release .. 35 | cmake --build . --config Release 36 | cmake --build . 
--config Release --target package 37 | shell: bash 38 | 39 | - name: Upload package 40 | uses: actions/upload-artifact@v4 41 | with: 42 | path: ${{ matrix.files }} 43 | name: ${{ matrix.os }} 44 | 45 | publish: 46 | name: Create release and upload files 47 | runs-on: ubuntu-22.04 48 | needs: build 49 | permissions: 50 | contents: write 51 | strategy: 52 | fail-fast: false 53 | steps: 54 | - uses: actions/checkout@v4 55 | with: 56 | fetch-depth: 0 57 | 58 | - name: Download release assets 59 | uses: actions/download-artifact@v4 60 | with: 61 | path: packages 62 | 63 | - name: Find changes and release assets 64 | id: files 65 | run: | 66 | PREVIOS="$(git tag --sort=creatordate | tail -n 2 | head -n1)" 67 | wget https://raw.githubusercontent.com/tim-gromeyer/html2md/$PREVIOS/CHANGELOG.md -O OLD.md 68 | echo "CHANGES<<EOF" >> $GITHUB_ENV 69 | echo "$(grep -Fvxf OLD.md CHANGELOG.md | tail -n +2)" >> $GITHUB_ENV 70 | echo "EOF" >> $GITHUB_ENV 71 | 72 | echo "FILES<<EOF" >> $GITHUB_ENV 73 | find packages/ -name "*" -type f >> $GITHUB_ENV 74 | echo "EOF" >> $GITHUB_ENV 75 | 76 | - name: Release 77 | uses: softprops/action-gh-release@v2 78 | if: startsWith(github.ref, 'refs/tags/') 79 | with: 80 | generate_release_notes: true 81 | body: ${{ env.CHANGES }} 82 | files: ${{ env.FILES }} 83 | -------------------------------------------------------------------------------- /.github/workflows/website.yml: -------------------------------------------------------------------------------- 1 | name: Update website 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | website: 11 | name: Build website and deploy to gh pages 12 | runs-on: ubuntu-22.04 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Add version to Doxyfile 18 | run: | 19 | echo "PROJECT_NUMBER = ${{ github.ref_name }}" >> docs/Doxyfile 20 | 21 | - name: Run Doxygen 22 | uses: mattnotmitt/doxygen-action@edge 23 | with: 24 | doxyfile-path: 'docs/Doxyfile' 25 | 26 | - name: Deploy 27 | 
uses: peaceiris/actions-gh-pages@v4 28 | with: 29 | github_token: ${{ secrets.GITHUB_TOKEN }} 30 | publish_dir: ./doc 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build Python wheels 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build_sdist: 11 | name: Build SDist 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: true 17 | 18 | - name: Build SDist 19 | run: pipx run build --sdist 20 | 21 | - name: Check metadata 22 | run: pipx run twine check dist/* 23 | 24 | - uses: actions/upload-artifact@v4 25 | with: 26 | path: dist/*.tar.gz 27 | 28 | 29 | build_wheels: 30 | name: Wheels on ${{ matrix.os }} 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | os: [ubuntu-latest, windows-latest, macos-latest] 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | with: 40 | submodules: true 41 | 42 | - uses: pypa/cibuildwheel@v2.19.1 43 | env: 44 | CIBW_ARCHS_MACOS: "auto universal2" 45 | CIBW_SKIP: "{cp36-*, cp37-*}" # Skip Python 3.6/3.7 46 | 47 | - name: Verify clean directory 48 | run: git diff --exit-code 49 | shell: bash 50 | 51 | - name: Upload wheels 52 | uses: actions/upload-artifact@v4 53 | with: 54 | path: wheelhouse/*.whl 55 | name: ${{ matrix.os }} 56 | 57 | 58 | upload_all: 59 | name: Upload wheels 60 | needs: [build_wheels, build_sdist] 61 | runs-on: ubuntu-latest 62 | 63 | steps: 64 | - uses: actions/download-artifact@v4 65 | with: 66 | name: artifact 67 | path: dist 68 | 69 | - uses: pypa/gh-action-pypi-publish@v1.9.0 70 | with: 71 | password: ${{ secrets.PYPI_API_TOKEN }} 72 | 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.user* 2 | 
build-linux/ 3 | build-windows/ 4 | build-wasm/ 5 | doc/ 6 | tests/error.log 7 | conan/ 8 | build/ 9 | dist/ 10 | pyhtml2md.egg-info/ 11 | *.whl 12 | wheelhouse/ 13 | tests/__pycache__/ 14 | .DS_Store 15 | /.build 16 | /Packages 17 | xcuserdata/ 18 | DerivedData/ 19 | .swiftpm/configuration/registries.json 20 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 21 | .netrc 22 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "python/pybind11"] 2 | path = python/pybind11 3 | url = https://github.com/pybind/pybind11 4 | [submodule "tests/md4c"] 5 | path = tests/md4c 6 | url = https://github.com/tim-gromeyer/MarkdownEdit_md4c 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change log 2 | 3 | [TOC] 4 | 5 | ## 1.6.4 6 | - Fix handling of `
<br>` tags outside of paragraphs (`<p>

`) 7 | 8 | ## 1.6.3 9 | - Update python dependencies, hopefully fixes (#133) 10 | 11 | ## 1.6.2 12 | - Fix HTML entities not converted (see #131) 13 | 14 | ## 1.6.0 15 | 16 | - Add option for soft line break 17 | - Add option for hard line break 18 | - Fix handling of self-closing tags 19 | - Updated python package building (see #100) 20 | 21 | ## 1.5.4 22 | 23 | - Fix crash (see #67) 24 | - Add support for newer Python versions 25 | 26 | ## 1.5.3 27 | 28 | - Make `blockquote` work correctly! 29 | - Additional note for 1.5.2: Add Python 3.12 packages 30 | 31 | ## 1.5.2 32 | 33 | - FIXED: Add `title` support for images 34 | - FIXED: Code got formatted (Spaces removed) 35 | - Fixed some formatting issues (like a space in front of `!`) 36 | - FIXED: Escaping of `*`, \`, and `\` 37 | - Reduced memory usage 38 | - Improved performance 39 | 40 | ## v1.5.1 41 | 42 | - **~40% Performance Improvement** 43 | 44 | ## v1.5.0 45 | 46 | - **Added an option to Format Markdown Tables** 47 | - More tests 48 | - Reworked cli program for better usability 49 | 50 | ## v1.4.4 51 | 52 | - New release with Python 3.11 support/packages 53 | - Updated internal dependencies 54 | 55 | ## v1.4.3 56 | 57 | - Improved performance 58 | - Updated 3rdparty tools (for creating python packages and creating releases) 59 | - Fix code example 60 | 61 | ## v1.4.2 62 | 63 | - Fixed Windows release builds being linked against debug libraries 64 | 65 | ## v1.4.1 66 | 67 | - **Fixed ALL memory leaks** 68 | - Fixed bugs(`html2md::Options::includeTitle` not working) 69 | - Added more tests 70 | - Documentation: Updated Doxygen to v1.9.6 71 | - Include Windows in releases 72 | 73 | ## v1.4.0 74 | 75 | - Improved CMake support massively! 
76 | - Fixed tests 77 | - Added support for CMake 3.8 78 | - Fix Python source package 79 | 80 | ## v1.3.0 81 | 82 | **BREAKING CHANGES!** 83 | 84 | - Renamed `Converter::Convert2Md` -> `Converter::convert()` 85 | - Renamed `options` -> `Options` 86 | 87 | ## v1.2.2 88 | 89 | - Fixed bug when calling `Convert2Md()` multiple times 90 | - Corrected several typos. Ignore the rest of the change log. 91 | 92 | ## v1.2.1 93 | 94 | - Added missing python dependency 95 | 96 | ## v1.2.0 97 | 98 | - **Added python bindings** 99 | - Added new option: `includeTable`. 100 | 101 | ## v1.1.5 102 | 103 | - Added more command line options to the executable 104 | 105 | ## v1.1.4 106 | 107 | - Releases now include deb files 108 | 109 | ## v1.1.3 110 | 111 | Users can now test their own Markdown files. Simply pass them to the test program as arguments. 112 | 113 | ## v1.1.2 114 | 115 | - Add changes for v1.1.1 116 | - Create releases when a new tag is added(automatically) 117 | 118 | ## v1.1.1 119 | 120 | - Fix windows build(by replacing get) 121 | 122 | ## v1.1.0 123 | 124 | - Reworked command line program 125 | - Renamed `AppendToMd` to `appendToMd` 126 | - Renamed `AppendBlank` to `appendBlank` 127 | - **Require *c++11* instead of *c++17*.** Only the tests require *c++17* now. 128 | - Added more tests 129 | - Fix typos in comments 130 | - Improved documentation 131 | 132 | ## v1.0.1 133 | 134 | - Fixed several bugs 135 | - Added more tests: make test 136 | - Updated documentation: make doc 137 | - Added packaging: make package 138 | 139 | ## v1.0.0 140 | 141 | Initial release. All basics work but `blockquote` needs a rework. 
142 | 143 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | project(html2md VERSION 1.6.4 LANGUAGES CXX) 3 | 4 | set(PROJECT_HOMEPAGE_URL "https://tim-gromeyer.github.io/html2md/") 5 | set(html2md_HOMEPAGE_URL "${PROJECT_HOMEPAGE_URL}") 6 | 7 | set(PROJECT_DESCRIPTION "Transform your HTML into clean, easy-to-read markdown with html2md") 8 | set(html2md_DESCRIPTION "${PROJECT_DESCRIPTION}") 9 | 10 | # If build type not specified we use release 11 | if (NOT CMAKE_BUILD_TYPE) 12 | message(STATUS "Build type not specified. Release is used.") 13 | set(CMAKE_BUILD_TYPE "Release") 14 | endif() 15 | 16 | # Improve performance 17 | if(CMAKE_BUILD_TYPE STREQUAL "Release") 18 | string(REPLACE "-O2" "-O3" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") 19 | string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 20 | endif() 21 | 22 | # Check if it was included via `add_subdirectory` 23 | get_directory_property(subproject PARENT_DIRECTORY) 24 | 25 | # Create HTML for webassembly 26 | if(EMSCRIPTEN) 27 | set(CMAKE_EXECUTABLE_SUFFIX ".html") 28 | endif() 29 | 30 | # Some options 31 | if (subproject) 32 | option(BUILD_EXE "Build a executable to convert html to markdown." OFF) 33 | else() 34 | option(BUILD_EXE "Build a executable to convert html to markdown." 
ON) 35 | endif() 36 | option(BUILD_DOC "Build documentation" OFF) 37 | option(BUILD_TEST "Build tests" OFF) 38 | option(PYTHON_BINDINGS "Build python bindings" OFF) 39 | 40 | set(SOURCES 41 | src/html2md.cpp 42 | src/table.cpp 43 | ) 44 | set(HEADERS 45 | include/html2md.h 46 | include/table.h 47 | ) 48 | 49 | if(PYTHON_BINDINGS) 50 | add_subdirectory(python/pybind11) 51 | pybind11_add_module(pyhtml2md python/bindings.cpp ${SOURCES} ${HEADER}) 52 | target_compile_features(pyhtml2md PUBLIC 53 | cxx_auto_type # auto keyword 54 | cxx_constexpr # constexpr support 55 | cxx_range_for # for (auto test : tests) 56 | cxx_std_11 # Require at least c++11 57 | ) 58 | target_compile_definitions(pyhtml2md PRIVATE PYTHON_BINDINGS) 59 | target_include_directories(pyhtml2md PRIVATE include) 60 | if (SKBUILD) 61 | install(TARGETS pyhtml2md DESTINATION "${SKBUILD_PLATLIB_DIR}") 62 | endif() 63 | return() 64 | endif() 65 | 66 | add_library(html2md ${SOURCES}) 67 | set_target_properties(html2md PROPERTIES 68 | VERSION ${PROJECT_VERSION} 69 | SOVERSION ${PROJECT_VERSION_MAJOR} 70 | PUBLIC_HEADER "${HEADERS}" 71 | ) 72 | target_include_directories(html2md PUBLIC $) 73 | target_compile_features(html2md PUBLIC cxx_std_11) # Require at least c++11 74 | 75 | if ((subproject AND BUILD_SHARED_LIBS) OR BUILD_EXE) 76 | add_library(html2md-static STATIC ${HEADERS} ${SOURCES}) 77 | target_include_directories(html2md-static PUBLIC include) 78 | target_compile_features(html2md-static PUBLIC cxx_std_11) # Require at least c++11 79 | endif() 80 | 81 | if(BUILD_EXE) 82 | add_executable(html2md-exe cli/main.cpp) 83 | target_link_libraries(html2md-exe html2md-static) 84 | set_target_properties(html2md-exe PROPERTIES OUTPUT_NAME "html2md") 85 | target_compile_definitions(html2md-exe PUBLIC VERSION="${PROJECT_VERSION}") 86 | target_compile_features(html2md-exe PUBLIC cxx_std_11) # Require at least c++11 87 | endif() 88 | 89 | if(BUILD_TEST) 90 | add_subdirectory(tests) 91 | endif() 92 | 93 | if(BUILD_DOC) 
94 | include(cmake/Doc.cmake) 95 | endif() 96 | 97 | # Don't install as a subproject 98 | if(subproject) 99 | return() 100 | endif() 101 | 102 | include(GNUInstallDirs) 103 | include(CMakePackageConfigHelpers) 104 | 105 | install(TARGETS html2md 106 | EXPORT html2mdTargets 107 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 108 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 109 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 110 | PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/html2md 111 | ) 112 | install(EXPORT html2mdTargets 113 | FILE html2mdTargets.cmake 114 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/html2md" 115 | ) 116 | 117 | configure_file(html2md.pc.in html2md.pc @ONLY) 118 | install(FILES ${CMAKE_BINARY_DIR}/html2md.pc DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig) 119 | 120 | write_basic_package_version_file( 121 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" 122 | VERSION ${PROJECT_VERSION} 123 | COMPATIBILITY SameMajorVersion 124 | ) 125 | 126 | configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}Config.cmake.in 127 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" 128 | INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/html2md 129 | NO_CHECK_REQUIRED_COMPONENTS_MACRO 130 | ) 131 | 132 | install(FILES 133 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" 134 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" 135 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/html2md 136 | ) 137 | 138 | if (BUILD_EXE) 139 | install(TARGETS html2md-exe DESTINATION bin) 140 | endif() 141 | 142 | include(cmake/Packaging.cmake) 143 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Just fork the repo, edit it and then create a pull request! 
4 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Tim Gromeyer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include python/README.md COPYING python/pybind11/LICENSE python/pybind11/CMakeLists.txt CMakeLists.txt python/bindings.cpp 2 | graft python/pybind11/include 3 | graft python/pybind11/tools 4 | graft src 5 | graft include 6 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version:5.5 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "html2md", 8 | products: [ 9 | .library(name: "html2md", targets: ["html2md"]), 10 | ], 11 | targets: [ 12 | .target( 13 | name: "html2md", 14 | dependencies: ["html2md_cpp"], 15 | path: ".", 16 | sources: [ 17 | "objc/html2md_objc.mm", 18 | ], 19 | publicHeadersPath: "objc/include", 20 | cxxSettings: [ 21 | // header is inherited from html2md_cpp 22 | // we should compile this objc file with c++11 23 | .unsafeFlags(["-std=c++11"]), 24 | ] 25 | ), 26 | .target( 27 | name: "html2md_cpp", 28 | path: ".", 29 | sources: [ 30 | "src/html2md.cpp", 31 | "src/table.cpp", 32 | ], 33 | publicHeadersPath: "include", 34 | cxxSettings: [ 35 | .unsafeFlags(["-std=c++11"]), 36 | .unsafeFlags(["-Wno-parentheses", "-Wno-conversion"]), 37 | ] 38 | ), 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # html2md 2 | 3 | Transform your HTML into clean, easy-to-read markdown with html2md 4 | 5 | ## Table of Contents 6 | 7 | - [What does it do](#what-does-it-do) 8 | - [How to use this library](#how-to-use-this-library) 9 | - [Supported Tags](#supported-tags) 10 | - 
[Bindings](#bindings) 11 | - [Requirements](#requirements) 12 | - [License](#license) 13 | 14 | 15 | ## What does it do 16 | 17 | html2md is a fast and reliable C++ library for converting HTML content into markdown. It offers support for a wide range of HTML tags, including those for formatting text, creating lists, and inserting images and links. In addition, html2md is the only HTML to markdown converter that offers support for **table formatting**, making it a valuable tool for users who need to convert HTML tables into markdown. 18 | 19 | 20 | ## How to use this library 21 | 22 | ### CMake 23 | 24 | Install html2md. Use either the prebuilt packages from [GitHub releases](https://github.com/tim-gromeyer/html2md/releases) or build and install it yourself. 25 | 26 | Afterwards: 27 | 28 | ```cmake 29 | find_package(html2md) 30 | target_link_libraries(your_target PRIVATE html2md) 31 | ``` 32 | 33 | ### Manually 34 | 35 | To use html2md, follow these steps: 36 | 37 | 1. Clone the library: `git clone https://github.com/tim-gromeyer/html2md` 38 | 2. Add the files `include/html2md.h` and `src/html2md.cpp` to your project 39 | 3. Include the `html2md.h` header in your code 40 | 4. Use the `html2md::Convert` function to convert your HTML content into markdown 41 | 42 | Here is an example of how to use the `html2md::Convert` function: 43 | 44 | ```cpp 45 | #include <html2md.h> 46 | 47 | //... 48 | 49 | std::cout << html2md::Convert("

<h1>foo</h1>

"); // # foo 50 | ``` 51 | 52 | ## Supported Tags 53 | 54 | html2md supports the following HTML tags: 55 | 56 | | Tag | Description | Comment | 57 | |--------------|--------------------|-----------------------------------------------------| 58 | | `a` | Anchor or link | Supports the `href`, `name` and `title` attributes. | 59 | | `b` | Bold | | 60 | | `blockquote` | Indented paragraph | | 61 | | `br` | Line break | | 62 | | `cite` | Inline citation | Same as `i`. | 63 | | `code` | Code | | 64 | | `dd` | Definition data | | 65 | | `del` | Strikethrough | | 66 | | `dfn` | Definition | Same as `i`. | 67 | | `div` | Document division | | 68 | | `em` | Emphasized | Same as `i`. | 69 | | `h1` | Level 1 heading | | 70 | | `h2` | Level 2 heading | | 71 | | `h3` | Level 3 heading | | 72 | | `h4` | Level 4 heading | | 73 | | `h5` | Level 5 heading | | 74 | | `h6` | Level 6 heading | | 75 | | `head` | Document header | Ignored. | 76 | | `hr` | Horizontal line | | 77 | | `i` | Italic | | 78 | | `img` | Image | Supports `src`, `alt`, `title` attributes. | 79 | | `li` | List item | | 80 | | `meta` | Meta-information | Ignored. | 81 | | `ol` | Ordered list | | 82 | | `p` | Paragraph | | 83 | | `pre` | Preformatted text | Works only with `code`. | 84 | | `s` | Strikethrough | Same as `del`. | 85 | | `span` | Grouped elements | Does nothing. | 86 | | `strong` | Strong | Same as `b`. | 87 | | `table` | Table | Tables are formatted! | 88 | | `tbody` | Table body | Does nothing. | 89 | | `td` | Table data cell | Uses `align` from `th`. | 90 | | `tfoot` | Table footer | Does nothing. | 91 | | `th` | Table header cell | Supports the `align` attribute. | 92 | | `thead` | Table header | Does nothing. | 93 | | `title` | Document title | Same as `h1`. | 94 | | `tr` | Table row | | 95 | | `u` | Underlined | Uses HTML. | 96 | | `ul` | Unordered list | | 97 | 98 | ## Bindings 99 | 100 | - [Python](python/README.md) 101 | 102 | ## Requirements 103 | 104 | 1. 
A compiler with **c++11** support like *g++>=9* 105 | 106 | That's all! 107 | 108 | ## License 109 | 110 | html2md is licensed under [The MIT License (MIT)](https://opensource.org/licenses/MIT) 111 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | | Version | Supported | 6 | | -------- | ------------------ | 7 | | Latest | :white_check_mark: | 8 | | Other | :x: | 9 | 10 | ## Reporting a Vulnerability 11 | 12 | Create a new [issue](https://github.com/tim-gromeyer/html2md/issues/new/choose), tell me where the bug is and I'll try to fix it. 13 | Pull requests are welcome! 14 | -------------------------------------------------------------------------------- /cli/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "html2md.h" 7 | 8 | using std::cerr; 9 | using std::cin; 10 | using std::cout; 11 | using std::endl; 12 | using std::fstream; 13 | using std::ifstream; 14 | using std::ios; 15 | using std::string; 16 | using std::stringstream; 17 | 18 | namespace FileUtils { 19 | bool exists(const std::string &name) { 20 | ifstream f(name.c_str()); 21 | return f.good(); 22 | } 23 | 24 | string readAll(const string &file) { 25 | ifstream in(file); 26 | stringstream buffer; 27 | buffer << in.rdbuf(); 28 | 29 | if (in.bad()) { 30 | throw std::runtime_error("Error reading file: " + file); 31 | } 32 | 33 | return buffer.str(); 34 | } 35 | 36 | void writeFile(const string &file, const string &content) { 37 | fstream out(file, ios::out); 38 | if (!out.is_open()) { 39 | throw std::runtime_error("Error writing file: " + file); 40 | } 41 | 42 | out << content; 43 | out.close(); 44 | 45 | if (out.bad()) { 46 | throw std::runtime_error("Error writing file: " + file); 47 | } 48 | } 49 | } // 
namespace FileUtils 50 | 51 | constexpr const char *const DESCRIPTION = 52 | " [Options] files...\n\n" 53 | "Simple and fast HTML to Markdown converter with table support.\n\n" 54 | "Options:\n" 55 | " -h, --help\tDisplays this help information.\n" 56 | " -v, --version\tDisplay version information and exit.\n" 57 | " -o, --output\tSets the output file.\n" 58 | " -i, --input\tSets the input text.\n" 59 | " -p, --print\tPrint the generated Markdown.\n" 60 | " -r, --replace\tOverwrite the output file (if it already exists) without " 61 | "asking.\n"; 62 | 63 | struct Options { 64 | bool print = false; 65 | bool replace = false; 66 | string inputFile; 67 | string outputFile; 68 | string inputText; 69 | }; 70 | 71 | void printHelp(const string &programName) { 72 | cout << programName << DESCRIPTION; 73 | } 74 | 75 | void printVersion() { cout << "Version " << VERSION << endl; } 76 | 77 | bool confirmOverride(const string &fileName) { 78 | while (true) { 79 | cout << fileName << " already exists, override? 
[y/n] "; 80 | string override; 81 | getline(cin, override); 82 | 83 | if (override.empty()) { 84 | continue; 85 | } 86 | 87 | if (override == "y" || override == "Y") { 88 | return true; 89 | } else if (override == "n" || override == "N") { 90 | return false; 91 | } else { 92 | cout << "Invalid input" << endl; 93 | } 94 | } 95 | } 96 | 97 | Options parseCommandLine(int argc, char **argv) { 98 | Options options; 99 | 100 | if (argc == 1) { 101 | printHelp(argv[0]); 102 | exit(EXIT_SUCCESS); 103 | } 104 | 105 | for (int i = 1; i < argc; i++) { 106 | string arg = argv[i]; 107 | 108 | if (arg == "-h" || arg == "--help") { 109 | printHelp(argv[0]); 110 | exit(EXIT_SUCCESS); 111 | } else if (arg == "-v" || arg == "--version") { 112 | printVersion(); 113 | exit(EXIT_SUCCESS); 114 | } else if (arg == "-p" || arg == "--print") { 115 | options.print = true; 116 | } else if (arg == "-r" || arg == "--replace") { 117 | options.replace = true; 118 | } else if (arg == "-o" || arg == "--output") { 119 | if (i + 1 < argc) { 120 | options.outputFile = argv[i + 1]; 121 | i++; 122 | } else { 123 | cerr << "The" << arg << "option requires a file name!\n" << endl; 124 | exit(EXIT_FAILURE); 125 | } 126 | } else if (arg == "-i" || arg == "--input") { 127 | if (i + 1 < argc) { 128 | options.inputText = argv[i + 1]; 129 | i++; 130 | } else { 131 | cerr << "The" << arg << "option requires HTML text!" << endl; 132 | exit(EXIT_FAILURE); 133 | } 134 | } else if (options.inputFile.empty()) { 135 | options.inputFile = arg; 136 | } 137 | } 138 | 139 | return options; 140 | } 141 | 142 | int main(int argc, char **argv) { 143 | Options options = parseCommandLine(argc, argv); 144 | 145 | string input; 146 | if (!options.inputText.empty()) { 147 | input = options.inputText; 148 | } else if (!options.inputFile.empty() && 149 | FileUtils::exists(options.inputFile)) { 150 | input = FileUtils::readAll(options.inputFile); 151 | } else { 152 | cerr << "No valid input provided!" 
<< endl; 153 | return EXIT_FAILURE; 154 | } 155 | 156 | html2md::Converter converter(input); 157 | string md = converter.convert(); 158 | 159 | if (options.print) { 160 | cout << md << endl; 161 | } 162 | 163 | if (!options.outputFile.empty()) { 164 | if (FileUtils::exists(options.outputFile) && !options.replace) { 165 | if (confirmOverride(options.outputFile)) { 166 | FileUtils::writeFile(options.outputFile, md); 167 | cout << "Markdown written to " << options.outputFile << endl; 168 | } else { 169 | cout << "Markdown not written." << endl; 170 | } 171 | } else { 172 | FileUtils::writeFile(options.outputFile, md); 173 | cout << "Markdown written to " << options.outputFile << endl; 174 | } 175 | } 176 | 177 | return EXIT_SUCCESS; 178 | } 179 | -------------------------------------------------------------------------------- /cmake/Doc.cmake: -------------------------------------------------------------------------------- 1 | find_package(Doxygen) 2 | 3 | if(DOXYGEN_FOUND) 4 | add_custom_target( 5 | doc 6 | COMMAND echo "PROJECT_NUMBER = ${PROJECT_VERSION}" >> docs/Doxyfile && ${DOXYGEN_EXECUTABLE} docs/Doxyfile 7 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 8 | COMMENT "Generating API documentation using Doxygen" 9 | VERBATIM 10 | ) 11 | else() 12 | message(WARNING "Doxygen not found. 
# CPack configuration: describes the package and picks the generator(s)
# matching the host platform / Linux distribution.
include(InstallRequiredSystemLibraries)

set(CPACK_STRIP_FILES ON)
set(CPACK_PACKAGE_NAME ${PROJECT_NAME} )
set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION})
set(CPACK_PACKAGE_CONTACT "Tim Gromeyer")
set(CPACK_PACKAGE_VENDOR ${CPACK_PACKAGE_CONTACT})
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY ${PROJECT_DESCRIPTION})
set(CPACK_PACKAGE_DESCRIPTION "Simple and fast HTML to Markdown conversion library with table support, written in c++.")

set(CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/COPYING)
set(CPACK_RESOURCE_FILE_README ${PROJECT_SOURCE_DIR}/README.md)

# Speed it up!
set(CPACK_THREADS 0) # all

# Variables specific to CPack RPM generator
set(CPACK_RPM_PACKAGE_DESCRIPTION ${CPACK_PACKAGE_DESCRIPTION})
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
set(CPACK_RPM_PACKAGE_GROUP "Development/Tools")
set(CPACK_RPM_PACKAGE_URL ${PROJECT_HOMEPAGE_URL})
# set(CPACK_RPM_PACKAGE_REQUIRES "/sbin/chkconfig, /bin/mktemp, /bin/rm, /bin/mv, libstdc++ >= 2.96") # TODO: Find correct packages

# Variables specific to CPack DEB generator
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION ${CPACK_PACKAGE_DESCRIPTION})
set(CPACK_DEBIAN_PACKAGE_SECTION "devel")
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE ${PROJECT_HOMEPAGE_URL})
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS YES)
set(CPACK_DEBIAN_PACKAGE_SUGGESTS "")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "")
# NOTE(review): the value ends in a blank - an e-mail address in angle
# brackets was presumably lost here; confirm against the repository.
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "${CPACK_PACKAGE_CONTACT} ")

if(WIN32)
    set(CPACK_GENERATOR "ZIP")

elseif(APPLE)
    set(CPACK_GENERATOR "ZIP")
    set(CPACK_SYSTEM_NAME "OSX")

# The variable set by the Emscripten toolchain is EMSCRIPTEN; the previous
# spelling "EXMSCRIPTEN" is never defined, so the exclusion had no effect.
elseif(UNIX AND NOT EMSCRIPTEN AND NOT ANDROID)
    # Determine distribution and release
    execute_process(COMMAND lsb_release -si OUTPUT_VARIABLE distribution OUTPUT_STRIP_TRAILING_WHITESPACE)
    execute_process(COMMAND lsb_release -sc OUTPUT_VARIABLE release OUTPUT_STRIP_TRAILING_WHITESPACE)
    execute_process(COMMAND uname -m OUTPUT_VARIABLE CPACK_RPM_PACKAGE_ARCHITECTURE OUTPUT_STRIP_TRAILING_WHITESPACE)

    # Fall back to the release number when there is no code name.
    if(release STREQUAL "n/a")
        execute_process(COMMAND lsb_release -sr OUTPUT_VARIABLE release OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()

    if(distribution STREQUAL "Debian" OR distribution STREQUAL "Ubuntu" OR distribution STREQUAL "Linuxmint")
        set(CPACK_GENERATOR "DEB")
        execute_process(COMMAND dpkg --print-architecture OUTPUT_VARIABLE CPACK_DEBIAN_PACKAGE_ARCHITECTURE OUTPUT_STRIP_TRAILING_WHITESPACE)
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION}_${CPACK_DEBIAN_PACKAGE_ARCHITECTURE}_${distribution}+${release})

    elseif(distribution MATCHES "RedHat.*")
        # extract the major version from RedHat full version (e.g. 6.7 --> 6)
        execute_process(COMMAND lsb_release -sr COMMAND sed s/[.].*// OUTPUT_VARIABLE redhat_version_major OUTPUT_STRIP_TRAILING_WHITESPACE)
        set(CPACK_GENERATOR "RPM")
        # NOTE(review): CPACK_RPM_PACKAGE_RELEASE is not set in this file -
        # confirm it is defined elsewhere, otherwise it expands to nothing.
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}.el${redhat_version_major}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    elseif(distribution MATCHES "openSUSE.*")
        set(CPACK_GENERATOR "RPM")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    elseif(distribution STREQUAL "Fedora")
        set(CPACK_GENERATOR "RPM")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}.fc${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    elseif(distribution STREQUAL "Scientific")
        set(CPACK_GENERATOR "RPM")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})

    else()
        set(CPACK_GENERATOR "STGZ")
        set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${release}.${CPACK_RPM_PACKAGE_ARCHITECTURE})
    endif()

    # Always build a plain tarball in addition to the native package.
    set(CPACK_GENERATOR "TGZ;${CPACK_GENERATOR}")
endif()

# Store the packages in a separate dir
set(CPACK_PACKAGE_DIRECTORY "${CMAKE_BINARY_DIR}/packages")
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${PROJECT_NAME})

include(CPack)
15 | DISABLE_INDEX = NO 16 | FULL_SIDEBAR = NO 17 | EXTRACT_ALL = YES 18 | TREEVIEW_WIDTH = 335 19 | 20 | HTML_HEADER = docs/doxygen-awesome-css/doxygen-custom/header.html 21 | 22 | HTML_EXTRA_STYLESHEET = docs/doxygen-awesome-css/doxygen-awesome.css \ 23 | docs/doxygen-awesome-css/doxygen-custom/custom.css \ 24 | docs/doxygen-awesome-css/doxygen-awesome-sidebar-only.css \ 25 | docs/doxygen-awesome-css/doxygen-awesome-sidebar-only-darkmode-toggle.css 26 | 27 | HTML_EXTRA_FILES = docs/doxygen-awesome-css/doxygen-awesome-darkmode-toggle.js \ 28 | docs/doxygen-awesome-css/doxygen-awesome-fragment-copy-button.js \ 29 | docs/doxygen-awesome-css/doxygen-awesome-paragraph-link.js \ 30 | docs/doxygen-awesome-css/doxygen-awesome-interactive-toc.js 31 | 32 | # Transparent background for graphs 33 | HAVE_DOT = YES 34 | DOT_IMAGE_FORMAT = svg 35 | DOT_TRANSPARENT = YES 36 | INTERACTIVE_SVG = YES 37 | 38 | # TOC 39 | TOC_EXPAND = YES 40 | TOC_INCLUDE_HEADINGS = 5 41 | 42 | # Fix dark mode not deactivatable(Doxygen v1.9.6) 43 | HTML_COLORSTYLE = TOGGLE 44 | 45 | # Tests 46 | SOURCE_BROWSER = YES 47 | SEARCHENGINE = YES 48 | 49 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/.gitignore: -------------------------------------------------------------------------------- 1 | docs/html 2 | .DS_Store 3 | .idea -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 jothepro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to 
whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-darkmode-toggle.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 | Copyright (c) 2021 - 2022 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/**
 * Custom element that switches the page between light and dark mode.
 *
 * The effective mode is derived from the system colour scheme combined with
 * an optional override stored in localStorage; it is applied by toggling the
 * "dark-mode"/"light-mode" classes on the document element.
 */
class DoxygenAwesomeDarkModeToggle extends HTMLElement {
    // SVG icons from https://fonts.google.com/icons
    // Licensed under the Apache 2.0 license:
    // https://www.apache.org/licenses/LICENSE-2.0.html
    // NOTE(review): both icon strings are empty here - the SVG markup looks
    // like it was lost in extraction; confirm against upstream.
    static lightModeIcon = ``
    static darkModeIcon = ``
    static title = "Toggle Light/Dark Mode"

    // localStorage keys that record an override of the system preference.
    static prefersLightModeInDarkModeKey = "prefers-light-mode-in-dark-mode"
    static prefersDarkModeInLightModeKey = "prefers-dark-mode-in-light-mode"

    // Runs once while the class is being defined: applies the stored
    // preference and keeps it in sync with the system / tab visibility.
    static _staticConstructor = (() => {
        const Toggle = DoxygenAwesomeDarkModeToggle
        Toggle.enableDarkMode(Toggle.userPreference)

        // The browser preference changed without user interaction here.
        const colorScheme = window.matchMedia('(prefers-color-scheme: dark)')
        colorScheme.addEventListener('change', () => {
            Toggle.onSystemPreferenceChanged()
        })

        // The appearance may have been changed in another tab while this
        // one was in the background.
        document.addEventListener("visibilitychange", () => {
            if (document.visibilityState !== 'visible') {
                return
            }
            Toggle.onSystemPreferenceChanged()
        })
    })()

    /**
     * Creates the toggle button once the document is ready and places it
     * next to the search box (again after every window resize).
     */
    static init() {
        $(function() {
            $(document).ready(function() {
                const button = document.createElement('doxygen-awesome-dark-mode-toggle')
                button.title = DoxygenAwesomeDarkModeToggle.title
                button.updateIcon()

                const refreshIcon = () => button.updateIcon()
                const placeButton = function() {
                    document.getElementById("MSearchBox").parentNode.appendChild(button)
                }

                window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', refreshIcon)
                document.addEventListener("visibilitychange", () => {
                    if (document.visibilityState === 'visible') {
                        refreshIcon()
                    }
                })

                $(document).ready(placeButton)
                $(window).resize(placeButton)
            })
        })
    }

    constructor() {
        super()
        this.onclick = this.toggleDarkMode
    }

    /**
     * @returns `true` for dark-mode, `false` for light-mode system preference
     */
    static get systemPreference() {
        const query = window.matchMedia('(prefers-color-scheme: dark)')
        return query.matches
    }

    /**
     * @returns `true` for dark-mode, `false` for light-mode user preference
     */
    static get userPreference() {
        const Toggle = DoxygenAwesomeDarkModeToggle
        return (!Toggle.systemPreference && localStorage.getItem(Toggle.prefersDarkModeInLightModeKey)) ||
            (Toggle.systemPreference && !localStorage.getItem(Toggle.prefersLightModeInDarkModeKey))
    }

    /**
     * Stores the user's choice. An override key is only written when the
     * choice differs from the system preference; otherwise it is removed.
     */
    static set userPreference(userPreference) {
        const Toggle = DoxygenAwesomeDarkModeToggle
        Toggle.darkModeEnabled = userPreference

        if (userPreference) {
            if (Toggle.systemPreference) {
                localStorage.removeItem(Toggle.prefersLightModeInDarkModeKey)
            } else {
                localStorage.setItem(Toggle.prefersDarkModeInLightModeKey, true)
            }
        } else if (Toggle.systemPreference) {
            localStorage.setItem(Toggle.prefersLightModeInDarkModeKey, true)
        } else {
            localStorage.removeItem(Toggle.prefersDarkModeInLightModeKey)
        }

        Toggle.onUserPreferenceChanged()
    }

    /** Applies the given mode to the document element's class list. */
    static enableDarkMode(enable) {
        const dark = Boolean(enable)
        const classes = document.documentElement.classList
        DoxygenAwesomeDarkModeToggle.darkModeEnabled = dark
        classes.toggle("dark-mode", dark)
        classes.toggle("light-mode", !dark)
    }

    static onSystemPreferenceChanged() {
        const Toggle = DoxygenAwesomeDarkModeToggle
        Toggle.darkModeEnabled = Toggle.userPreference
        Toggle.enableDarkMode(Toggle.darkModeEnabled)
    }

    static onUserPreferenceChanged() {
        const Toggle = DoxygenAwesomeDarkModeToggle
        Toggle.enableDarkMode(Toggle.darkModeEnabled)
    }

    /** Click handler: flips the user preference and refreshes the icon. */
    toggleDarkMode() {
        const Toggle = DoxygenAwesomeDarkModeToggle
        Toggle.userPreference = !Toggle.userPreference
        this.updateIcon()
    }

    /** Shows the icon matching the currently enabled mode. */
    updateIcon() {
        const Toggle = DoxygenAwesomeDarkModeToggle
        this.innerHTML = Toggle.darkModeEnabled
            ? Toggle.darkModeIcon
            : Toggle.lightModeIcon
    }
}

customElements.define("doxygen-awesome-dark-mode-toggle", DoxygenAwesomeDarkModeToggle);
-------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-fragment-copy-button.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 | Copyright (c) 2022 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 
/**
 * Custom element rendered next to every code fragment; a click copies the
 * fragment's text to the clipboard.
 */
class DoxygenAwesomeFragmentCopyButton extends HTMLElement {
    constructor() {
        super()
        this.onclick = this.copyContent
    }

    static title = "Copy to clipboard"
    // NOTE(review): both icon strings are empty here - the SVG markup looks
    // like it was lost in extraction; confirm against upstream.
    static copyIcon = ``
    static successIcon = ``
    // How long (ms) the success state stays visible after copying.
    static successDuration = 980

    /**
     * Wraps every ".fragment" element in a wrapper div and appends a copy
     * button to it. Does nothing when the Clipboard API is unavailable.
     */
    static init() {
        $(function() {
            $(document).ready(function() {
                if (!navigator.clipboard) {
                    return
                }

                const Button = DoxygenAwesomeFragmentCopyButton
                Array.from(document.getElementsByClassName("fragment")).forEach((fragment) => {
                    const wrapper = document.createElement("div")
                    wrapper.className = "doxygen-awesome-fragment-wrapper"

                    const copyButton = document.createElement("doxygen-awesome-fragment-copy-button")
                    copyButton.innerHTML = Button.copyIcon
                    copyButton.title = Button.title

                    // Put the wrapper where the fragment was, then move the
                    // fragment and the button into it.
                    fragment.parentNode.replaceChild(wrapper, fragment)
                    wrapper.appendChild(fragment)
                    wrapper.appendChild(copyButton)
                })
            })
        })
    }

    /**
     * Click handler: copies the text of the preceding sibling (the fragment)
     * and shows the success icon for `successDuration` milliseconds.
     */
    copyContent() {
        // Work on a clone so the displayed fragment is left untouched.
        const clone = this.previousSibling.cloneNode(true)

        // filter out line number from file listings
        for (const node of clone.querySelectorAll(".lineno, .ttc")) {
            node.remove()
        }

        // remove trailing newlines that appear in file listings
        const text = clone.textContent.replace(/\n+$/, "")
        navigator.clipboard.writeText(text)

        this.classList.add("success")
        this.innerHTML = DoxygenAwesomeFragmentCopyButton.successIcon

        const restore = () => {
            this.classList.remove("success")
            this.innerHTML = DoxygenAwesomeFragmentCopyButton.copyIcon
        }
        window.setTimeout(restore, DoxygenAwesomeFragmentCopyButton.successDuration)
    }
}

customElements.define("doxygen-awesome-fragment-copy-button", DoxygenAwesomeFragmentCopyButton)
/**
 * Makes the page's table of contents interactive: the title toggles the
 * mobile menu and the entry for the section currently scrolled to is
 * highlighted.
 */
class DoxygenAwesomeInteractiveToc {
    // A header counts as "reached" once its top edge is above this offset (px).
    static topOffset = 38
    // When false, the TOC starts opened on mobile.
    static hideMobileMenu = true
    // Pairs of { node: TOC link, headerNode: element the link points to }.
    static headers = []

    /**
     * Registers the setup to run on window "load". Does nothing when the page
     * has no ".contents > .toc" element.
     */
    static init() {
        window.addEventListener("load", () => {
            let toc = document.querySelector(".contents > .toc")
            if(toc) {
                toc.classList.add("interactive")
                if(!DoxygenAwesomeInteractiveToc.hideMobileMenu) {
                    toc.classList.add("open")
                }
                document.querySelector(".contents > .toc > h3")?.addEventListener("click", () => {
                    if(toc.classList.contains("open")) {
                        toc.classList.remove("open")
                    } else {
                        toc.classList.add("open")
                    }
                })

                document.querySelectorAll(".contents > .toc > ul a").forEach((node) => {
                    let id = node.getAttribute("href").substring(1)
                    let headerNode = document.getElementById(id)
                    // Skip links whose target does not exist; update() would
                    // otherwise throw when reading the header's position.
                    if(!headerNode) {
                        return
                    }
                    DoxygenAwesomeInteractiveToc.headers.push({
                        node: node,
                        headerNode: headerNode
                    })
                })

                // One scroll listener for the whole TOC. It used to be added
                // inside the loop above, i.e. once per link, which ran
                // update() that many times for every scroll event.
                if(DoxygenAwesomeInteractiveToc.headers.length > 0) {
                    document.getElementById("doc-content")?.addEventListener("scroll", () => {
                        DoxygenAwesomeInteractiveToc.update()
                    })
                }
                DoxygenAwesomeInteractiveToc.update()
            }
        })
    }

    /**
     * Marks the last TOC entry whose header is above `topOffset` as "active"
     * and the entries before it as "aboveActive". Falls back to the first
     * entry when no header has been reached yet.
     */
    static update() {
        let active = DoxygenAwesomeInteractiveToc.headers[0]?.node
        DoxygenAwesomeInteractiveToc.headers.forEach((header) => {
            let position = header.headerNode.getBoundingClientRect().top
            header.node.classList.remove("active")
            header.node.classList.remove("aboveActive")
            if(position < DoxygenAwesomeInteractiveToc.topOffset) {
                active = header.node
                active?.classList.add("aboveActive")
            }
        })
        active?.classList.add("active")
        active?.classList.remove("aboveActive")
    }
}
Copyright (c) 2022 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 
/**
 * Adds a "permanent link" anchor next to every heading / group header that
 * carries an id.
 */
class DoxygenAwesomeParagraphLink {
    // Icon from https://fonts.google.com/icons
    // Licensed under the Apache 2.0 license:
    // https://www.apache.org/licenses/LICENSE-2.0.html
    // NOTE(review): the icon string is empty here - the SVG markup looks
    // like it was lost in extraction; confirm against upstream.
    static icon = ``
    static title = "Permanent Link"

    /** Creates the links once the document is ready. */
    static init() {
        $(function() {
            $(document).ready(function() {
                const targets = document.querySelectorAll(".contents a.anchor[id], .contents .groupheader > a[id]")
                for (const target of targets) {
                    const link = document.createElement("a")
                    link.setAttribute("href", `#${target.getAttribute("id")}`)
                    link.setAttribute("title", DoxygenAwesomeParagraphLink.title)
                    link.classList.add("anchorlink")
                    target.classList.add("anchor")
                    link.innerHTML = DoxygenAwesomeParagraphLink.icon
                    // The link becomes a sibling of the anchor it points to.
                    target.parentElement.appendChild(link)
                }
            })
        })
    }
}
20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 28 | 29 | */ 30 | 31 | @media screen and (min-width: 768px) { 32 | 33 | #MSearchBox { 34 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium)) - var(--searchbar-height) - 1px); 35 | } 36 | 37 | #MSearchField { 38 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium)) - 66px - var(--searchbar-height)); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-awesome-sidebar-only.css: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Doxygen Awesome 4 | https://github.com/jothepro/doxygen-awesome-css 5 | 6 | MIT License 7 | 8 | Copyright (c) 2021 jothepro 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 
19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | 28 | */ 29 | 30 | html { 31 | /* side nav width. MUST be = `TREEVIEW_WIDTH`. 32 | * Make sure it is wide enough to contain the page title (logo + title + version) 33 | */ 34 | --side-nav-fixed-width: 335px; 35 | --menu-display: none; 36 | 37 | --top-height: 120px; 38 | --toc-sticky-top: -25px; 39 | --toc-max-height: calc(100vh - 2 * var(--spacing-medium) - 25px); 40 | } 41 | 42 | #projectname { 43 | white-space: nowrap; 44 | } 45 | 46 | 47 | @media screen and (min-width: 768px) { 48 | html { 49 | --searchbar-background: var(--page-background-color); 50 | } 51 | 52 | #side-nav { 53 | min-width: var(--side-nav-fixed-width); 54 | max-width: var(--side-nav-fixed-width); 55 | top: var(--top-height); 56 | overflow: visible; 57 | } 58 | 59 | #nav-tree, #side-nav { 60 | height: calc(100vh - var(--top-height)) !important; 61 | } 62 | 63 | #nav-tree { 64 | padding: 0; 65 | } 66 | 67 | #top { 68 | display: block; 69 | border-bottom: none; 70 | height: var(--top-height); 71 | margin-bottom: calc(0px - var(--top-height)); 72 | max-width: var(--side-nav-fixed-width); 73 | overflow: hidden; 74 | background: var(--side-nav-background); 75 | } 76 | #main-nav { 77 | float: left; 78 | padding-right: 0; 79 | } 80 | 81 | .ui-resizable-handle { 82 | cursor: default; 83 | width: 1px !important; 84 | box-shadow: 0 calc(-2 * var(--top-height)) 0 0 var(--separator-color); 85 | } 86 | 87 | #nav-path { 88 | position: fixed; 89 | right: 0; 90 | left: var(--side-nav-fixed-width); 91 | bottom: 0; 92 
| width: auto; 93 | } 94 | 95 | #doc-content { 96 | height: calc(100vh - 31px) !important; 97 | padding-bottom: calc(3 * var(--spacing-large)); 98 | padding-top: calc(var(--top-height) - 80px); 99 | box-sizing: border-box; 100 | margin-left: var(--side-nav-fixed-width) !important; 101 | } 102 | 103 | #MSearchBox { 104 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium))); 105 | } 106 | 107 | #MSearchField { 108 | width: calc(var(--side-nav-fixed-width) - calc(2 * var(--spacing-medium)) - 65px); 109 | } 110 | 111 | #MSearchResultsWindow { 112 | left: var(--spacing-medium) !important; 113 | right: auto; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-custom/custom.css: -------------------------------------------------------------------------------- 1 | .github-corner svg { 2 | fill: var(--primary-light-color); 3 | color: var(--page-background-color); 4 | width: 72px; 5 | height: 72px; 6 | } 7 | 8 | @media screen and (max-width: 767px) { 9 | .github-corner svg { 10 | width: 50px; 11 | height: 50px; 12 | } 13 | #projectnumber { 14 | margin-right: 22px; 15 | } 16 | } 17 | 18 | .alter-theme-button { 19 | display: inline-block; 20 | cursor: pointer; 21 | background: var(--primary-color); 22 | color: var(--page-background-color) !important; 23 | border-radius: var(--border-radius-medium); 24 | padding: var(--spacing-small) var(--spacing-medium); 25 | text-decoration: none; 26 | } 27 | 28 | .next_section_button { 29 | display: block; 30 | padding: var(--spacing-large) 0 var(--spacing-small) 0; 31 | color: var(--page-background-color); 32 | user-select: none; 33 | } 34 | 35 | .next_section_button::after { 36 | /* clearfix */ 37 | content: ""; 38 | clear: both; 39 | display: table; 40 | } 41 | 42 | .next_section_button a { 43 | overflow: hidden; 44 | float: right; 45 | border: 1px solid var(--separator-color); 46 | padding: var(--spacing-medium) 
calc(var(--spacing-large) / 2) var(--spacing-medium) var(--spacing-large); 47 | border-radius: var(--border-radius-medium); 48 | color: var(--page-secondary-foreground-color) !important; 49 | text-decoration: none; 50 | background-color: var(--page-background-color); 51 | transition: color .08s ease-in-out, background-color .1s ease-in-out; 52 | } 53 | 54 | .next_section_button a:hover { 55 | color: var(--page-foreground-color) !important; 56 | background-color: var(--odd-color); 57 | } 58 | 59 | .next_section_button a::after { 60 | content: '〉'; 61 | color: var(--page-secondary-foreground-color) !important; 62 | padding-left: var(--spacing-large); 63 | display: inline-block; 64 | transition: color .08s ease-in-out, transform .09s ease-in-out; 65 | } 66 | 67 | .next_section_button a:hover::after { 68 | color: var(--page-foreground-color) !important; 69 | transform: translateX(3px); 70 | } 71 | 72 | .alter-theme-button:hover { 73 | background: var(--primary-dark-color); 74 | } 75 | 76 | html.dark-mode .darkmode_inverted_image img, /* < doxygen 1.9.3 */ 77 | html.dark-mode .darkmode_inverted_image object[type="image/svg+xml"] /* doxygen 1.9.3 */ { 78 | filter: brightness(87%) hue-rotate(180deg) invert(); 79 | } 80 | 81 | .bordered_image { 82 | border-radius: var(--border-radius-small); 83 | border: 1px solid var(--separator-color); 84 | display: inline-block; 85 | overflow: hidden; 86 | } 87 | 88 | html.dark-mode .bordered_image img, /* < doxygen 1.9.3 */ 89 | html.dark-mode .bordered_image object[type="image/svg+xml"] /* doxygen 1.9.3 */ { 90 | border-radius: var(--border-radius-small); 91 | } 92 | 93 | .title_screenshot { 94 | filter: drop-shadow(0px 3px 10px rgba(0,0,0,0.22)); 95 | max-width: 500px; 96 | margin: var(--spacing-large) 0; 97 | } 98 | 99 | .title_screenshot .caption { 100 | display: none; 101 | } -------------------------------------------------------------------------------- /docs/doxygen-awesome-css/doxygen-custom/header.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | $title 17 | $title 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 32 | 51 | $treeview 52 | $search 53 | $mathjax 54 | 55 | $extrastylesheet 56 | 57 | 58 | 59 | 60 | 61 | 63 | 64 | 65 |
66 | 67 | 68 |
69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 82 | 83 | 84 | 85 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 |
77 |
$projectname 78 |  $projectnumber 79 |
80 |
$projectbrief
81 |
86 |
$projectbrief
87 |
$searchbox
98 |
99 | 100 | 101 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # html2md 2 | 3 | [TOC] 4 | 5 | ## What does it do 6 | 7 | html2md is a fast and reliable C++ library for converting HTML content into markdown. It offers support for a wide range of HTML tags, including those for formatting text, creating lists, and inserting images and links. In addition, html2md is the only HTML to markdown converter that offers support for table formatting, making it a valuable tool for users who need to convert HTML tables into markdown. 8 | 9 | ## How to use this library 10 | 11 | ### CMake 12 | 13 | Install html2md. Either use the pre-built packages found on [GitHub releases](https://github.com/tim-gromeyer/html2md/releases) or build and install it yourself. 14 | 15 | 16 | Afterwards: 17 | 18 | ```cmake 19 | find_package(html2md) 20 | target_link_libraries(your_target PRIVATE html2md) 21 | ``` 22 | 23 | ### Manually 24 | 25 | To use html2md, follow these steps: 26 | 27 | 1. Clone the library: `git clone https://github.com/tim-gromeyer/html2md` 28 | 2. Add the files `include/html2md.h` and `src/html2md.cpp` to your project 29 | 3. Include the `html2md.h` header in your code 30 | 4. Use the `html2md::Convert` function to convert your HTML content into markdown 31 | 32 | Here is an example of how to use the `html2md::Convert` function: 33 | 34 | ```cpp 35 | #include <html2md.h> 36 | 37 | //... 38 | 39 | std::cout << html2md::Convert("

<h1>foo</h1>

"); // # foo 40 | ``` 41 | 42 | ## Supported Tags 43 | 44 | html2md supports the following HTML tags: 45 | 46 | 47 | | Tag | Description | Comment | 48 | | ------------ | ------------------ | ------------------------------------------ | 49 | | `a` | Anchor or link | Supports the `href` and `name` attributes. | 50 | | `b` | Bold | | 51 | | `blockquote` | Indented paragraph | | 52 | | `br` | Line break | | 53 | | `cite` | Inline citation | Same as `i`. | 54 | | `code` | Code | | 55 | | `dd` | Definition data | | 56 | | `del` | Strikethrough | | 57 | | `dfn` | Definition | Same as `i`. | 58 | | `div` | Document division | | 59 | | `em` | Emphasized | Same as `i`. | 60 | | `h1` | Level 1 heading | | 61 | | `h2` | Level 2 heading | | 62 | | `h3` | Level 3 heading | | 63 | | `h4` | Level 4 heading | | 64 | | `h5` | Level 5 heading | | 65 | | `h6` | Level 6 heading | | 66 | | `head` | Document header | Ignored. | 67 | | `hr` | Horizontal line | | 68 | | `i` | Italic | | 69 | | `img` | Image | Supports the `src` and `alt` attributes. | 70 | | `li` | List item | | 71 | | `meta` | Meta-information | Ignored. | 72 | | `ol` | Ordered list | Don't use other lists in this list. | 73 | | `p` | Paragraph | | 74 | | `pre` | Preformatted text | Works only with `code`. | 75 | | `s` | Strikethrough | Same as `del`. | 76 | | `span` | Grouped elements | | 77 | | `strong` | Strong | Same as `b`. | 78 | | `table` | Table | | 79 | | `td` | Table data cell | Uses `align` from `th`. | 80 | | `th` | Table header cell | Supports the `align` attribute. | 81 | | `title` | Document title | Same as `h1`. | 82 | | `tr` | Table row | | 83 | | `u` | Underlined | Uses HTML. | 84 | | `ul` | Unordered list | | 85 | 86 | ## Bindings 87 | 88 | - [Python](../python/README.md) 89 | 90 | ## Requirements 91 | 92 | 1. A compiler with **c++11** support like *g++>=9* 93 | 94 | That's all! 
95 | 96 | ## License 97 | 98 | html2md is licensed under [The MIT License (MIT)](https://opensource.org/licenses/MIT) 99 | -------------------------------------------------------------------------------- /html2md.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=@CMAKE_INSTALL_PREFIX@ 3 | libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ 4 | includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@/html2md/ 5 | 6 | Name: @PROJECT_NAME@ 7 | Description: @PROJECT_DESCRIPTION@ 8 | Version: @PROJECT_VERSION@ 9 | 10 | Requires: 11 | Libs: -L${libdir} -lhtml2md 12 | Cflags: -I${includedir} 13 | -------------------------------------------------------------------------------- /html2mdConfig.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") 4 | 5 | set(html2md_FOUND TRUE) 6 | set(HTML2MD_FOUND TRUE) 7 | 8 | -------------------------------------------------------------------------------- /include/html2md.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tim Gromeyer 2 | // Licensed under the MIT License - https://opensource.org/licenses/MIT 3 | 4 | #ifndef HTML2MD_H 5 | #define HTML2MD_H 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | /*! 12 | * \brief html2md namespace 13 | * 14 | * The html2md namespace provides: 15 | * 1. The Converter class 16 | * 2. Static wrapper around Converter class 17 | * 18 | * \note Do NOT try to convert HTML that contains a list in an ordered list or a 19 | * `blockquote` in a list!\n This will be a **total** mess! 20 | */ 21 | namespace html2md { 22 | 23 | /*! 24 | * \brief Options for the conversion from HTML to Markdown 25 | * \warning Make sure to pass valid options; otherwise, the output will be 26 | * invalid! 
27 | * 28 | * Example from `tests/main.cpp`: 29 | * 30 | * ```cpp 31 | * auto *options = new html2md::Options(); 32 | * options->splitLines = false; 33 | * 34 | * html2md::Converter c(html, options); 35 | * auto md = c.convert(); 36 | * ``` 37 | */ 38 | struct Options { 39 | /*! 40 | * \brief Add new line when a certain number of characters is reached 41 | * 42 | * \see softBreak 43 | * \see hardBreak 44 | */ 45 | bool splitLines = true; 46 | 47 | /*! 48 | * \brief softBreak Wrap after ... characters when the next space is reached 49 | * and as long as it's not in a list, table, image or anchor (link). 50 | */ 51 | int softBreak = 80; 52 | 53 | /*! 54 | * \brief hardBreak Force a break after ... characters in a line 55 | */ 56 | int hardBreak = 100; 57 | 58 | /*! 59 | * \brief The char used for unordered lists 60 | * 61 | * Valid: 62 | * - `-` 63 | * - `+` 64 | * - `*` 65 | * 66 | * Example: 67 | * 68 | * ```markdown 69 | * - List 70 | * + Also a list 71 | * * And this too 72 | * ``` 73 | */ 74 | char unorderedList = '-'; 75 | 76 | /*! 77 | * \brief The char used after the number of the item 78 | * 79 | * Valid: 80 | * - `.` 81 | * - `)` 82 | * 83 | * Example: 84 | * 85 | * ```markdown 86 | * 1. Hello 87 | * 2) World! 88 | * ``` 89 | */ 90 | char orderedList = '.'; 91 | 92 | /*! 93 | * \brief Whether title is added as h1 heading at the very beginning of the 94 | * markdown 95 | * 96 | * Whether title is added as h1 heading at the very beginning of the markdown. 97 | * Default is true. 98 | */ 99 | bool includeTitle = true; 100 | 101 | /*! 102 | * \brief Whether to format Markdown Tables 103 | * 104 | * Whether to format Markdown Tables. 105 | * Default is true.
106 | */ 107 | bool formatTable = true; 108 | /*! \brief Field-wise equality over every option, including formatTable. */ 109 | inline bool operator==(html2md::Options o) const { 110 | return splitLines == o.splitLines && unorderedList == o.unorderedList && 111 | orderedList == o.orderedList && includeTitle == o.includeTitle && 112 | softBreak == o.softBreak && hardBreak == o.hardBreak && formatTable == o.formatTable; 113 | }; 114 | }; 115 | 116 | /*! 117 | * \brief Class for converting HTML to Markdown 118 | * 119 | * This class converts HTML to Markdown. 120 | * There is also a static wrapper for this class (see html2md::Convert). 121 | * 122 | * ## Usage example 123 | * 124 | * Option 1: Use the class: 125 | * 126 | * ```cpp 127 | * std::string html = "

<h1>example</h1>

"; 128 | * html2md::Converter c(html); 129 | * auto md = c.convert(); 130 | * 131 | * if (!c.ok()) std::cout << "There was something wrong in the HTML\n"; 132 | * std::cout << md; // # example 133 | * ``` 134 | * 135 | * Option 2: Use the static wrapper: 136 | * 137 | * ```cpp 138 | * std::string html = "

<h1>example</h1>

"; 139 | * 140 | * auto md = html2md::Convert(html); 141 | * std::cout << md; 142 | * ``` 143 | * 144 | * Advanced: use Options: 145 | * 146 | * ```cpp 147 | * std::string html = "

<h1>example</h1>

"; 148 | * 149 | * auto *options = new html2md::Options(); 150 | * options->splitLines = false; 151 | * options->unorderedList = '*'; 152 | * 153 | * html2md::Converter c(html, options); 154 | * auto md = c.convert(); 155 | * if (!c.ok()) std::cout << "There was something wrong in the HTML\n"; 156 | * std::cout << md; // # example 157 | * ``` 158 | */ 159 | class Converter { 160 | public: 161 | /*! 162 | * \brief Standard initializer, takes HTML as parameter. Also prepares 163 | * everything. \param html The HTML as std::string. \param options Options for 164 | * the conversion. See html2md::Options() for more. 165 | * 166 | * \note Don't pass anything else than HTML, otherwise the output will be a 167 | * **mess**! 168 | * 169 | * This is the default initializer.
170 | * You can use appendToMd() to append something to the beginning of the 171 | * generated output. 172 | */ 173 | explicit inline Converter(std::string &html, 174 | struct Options *options = nullptr) { 175 | *this = Converter(&html, options); 176 | } 177 | 178 | /*! 179 | * \brief Convert HTML into Markdown. 180 | * \return Returns the converted Markdown. 181 | * 182 | * This function actually converts the HTML into Markdown. 183 | * It also cleans up the Markdown so you don't have to do anything. 184 | */ 185 | [[nodiscard]] std::string convert(); 186 | 187 | /*! 188 | * \brief Append a char to the Markdown. 189 | * \param ch The char to append. 190 | * \return Returns a copy of the instance with the char appended. 191 | */ 192 | Converter *appendToMd(char ch); 193 | 194 | /*! 195 | * \brief Append a char* to the Markdown. 196 | * \param str The char* to append. 197 | * \return Returns a copy of the instance with the char* appended. 198 | */ 199 | Converter *appendToMd(const char *str); 200 | 201 | /*! 202 | * \brief Append a string to the Markdown. 203 | * \param s The string to append. 204 | * \return Returns a copy of the instance with the string appended. 205 | */ 206 | inline Converter *appendToMd(const std::string &s) { 207 | return appendToMd(s.c_str()); 208 | } 209 | 210 | /*! 211 | * \brief Appends a ' ' in certain cases. 212 | * \return Copy of the instance with(maybe) the appended space. 213 | * 214 | * This function appends ' ' if: 215 | * - md does not end with `*` 216 | * - md does not end with `\n` aka newline 217 | */ 218 | Converter *appendBlank(); 219 | 220 | /*! 221 | * \brief Checks if everything was closed properly(in the HTML). 222 | * \return Returns false if there is a unclosed tag. 223 | * \note As long as you have not called convert(), it always returns true. 224 | */ 225 | [[nodiscard]] bool ok() const; 226 | 227 | /*! 228 | * \brief Reset the generated Markdown 229 | */ 230 | void reset(); 231 | 232 | /*! 
233 | * \brief Checks if the HTML matches and the options are the same. 234 | * \param The Converter object to compare with 235 | * \return true if the HTML and options matches otherwise false 236 | */ 237 | inline bool operator==(const Converter *c) const { return *this == *c; } 238 | 239 | inline bool operator==(const Converter &c) const { 240 | return html_ == c.html_ && option == c.option; 241 | } 242 | 243 | /*! 244 | * \brief Returns ok(). 245 | */ 246 | inline explicit operator bool() const { return ok(); }; 247 | 248 | private: 249 | // Attributes 250 | static constexpr const char *kAttributeHref = "href"; 251 | static constexpr const char *kAttributeAlt = "alt"; 252 | static constexpr const char *kAttributeTitle = "title"; 253 | static constexpr const char *kAttributeClass = "class"; 254 | static constexpr const char *kAttributeSrc = "src"; 255 | static constexpr const char *kAttrinuteAlign = "align"; 256 | 257 | static constexpr const char *kTagAnchor = "a"; 258 | static constexpr const char *kTagBreak = "br"; 259 | static constexpr const char *kTagCode = "code"; 260 | static constexpr const char *kTagDiv = "div"; 261 | static constexpr const char *kTagHead = "head"; 262 | static constexpr const char *kTagLink = "link"; 263 | static constexpr const char *kTagListItem = "li"; 264 | static constexpr const char *kTagMeta = "meta"; 265 | static constexpr const char *kTagNav = "nav"; 266 | static constexpr const char *kTagNoScript = "noscript"; 267 | static constexpr const char *kTagOption = "option"; 268 | static constexpr const char *kTagOrderedList = "ol"; 269 | static constexpr const char *kTagParagraph = "p"; 270 | static constexpr const char *kTagPre = "pre"; 271 | static constexpr const char *kTagScript = "script"; 272 | static constexpr const char *kTagSpan = "span"; 273 | static constexpr const char *kTagStyle = "style"; 274 | static constexpr const char *kTagTemplate = "template"; 275 | static constexpr const char *kTagTitle = "title"; 276 | static 
constexpr const char *kTagUnorderedList = "ul"; 277 | static constexpr const char *kTagImg = "img"; 278 | static constexpr const char *kTagSeperator = "hr"; 279 | 280 | // Text format 281 | static constexpr const char *kTagBold = "b"; 282 | static constexpr const char *kTagStrong = "strong"; 283 | static constexpr const char *kTagItalic = "em"; 284 | static constexpr const char *kTagItalic2 = "i"; 285 | static constexpr const char *kTagCitation = "cite"; 286 | static constexpr const char *kTagDefinition = "dfn"; 287 | static constexpr const char *kTagUnderline = "u"; 288 | static constexpr const char *kTagStrighthrought = "del"; 289 | static constexpr const char *kTagStrighthrought2 = "s"; 290 | 291 | static constexpr const char *kTagBlockquote = "blockquote"; 292 | 293 | // Header 294 | static constexpr const char *kTagHeader1 = "h1"; 295 | static constexpr const char *kTagHeader2 = "h2"; 296 | static constexpr const char *kTagHeader3 = "h3"; 297 | static constexpr const char *kTagHeader4 = "h4"; 298 | static constexpr const char *kTagHeader5 = "h5"; 299 | static constexpr const char *kTagHeader6 = "h6"; 300 | 301 | // Table 302 | static constexpr const char *kTagTable = "table"; 303 | static constexpr const char *kTagTableRow = "tr"; 304 | static constexpr const char *kTagTableHeader = "th"; 305 | static constexpr const char *kTagTableData = "td"; 306 | 307 | size_t index_ch_in_html_ = 0; 308 | 309 | bool is_closing_tag_ = false; 310 | bool is_in_attribute_value_ = false; 311 | bool is_in_code_ = false; 312 | bool is_in_list_ = false; 313 | bool is_in_p_ = false; 314 | bool is_in_pre_ = false; 315 | bool is_in_table_ = false; 316 | bool is_in_table_row_ = false; 317 | bool is_in_tag_ = false; 318 | bool is_self_closing_tag_ = false; 319 | 320 | // relevant for
  • only, false = is in unordered list 321 | bool is_in_ordered_list_ = false; 322 | uint8_t index_ol = 0; 323 | 324 | // store the table start 325 | size_t table_start = 0; 326 | 327 | // number of lists 328 | uint8_t index_li = 0; 329 | 330 | uint8_t index_blockquote = 0; 331 | 332 | char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0; 333 | char prev_ch_in_html_ = 'x'; 334 | 335 | std::string html_; 336 | 337 | uint16_t offset_lt_ = 0; 338 | std::string current_tag_; 339 | std::string prev_tag_; 340 | 341 | // Line which separates header from data 342 | std::string tableLine; 343 | 344 | size_t chars_in_curr_line_ = 0; 345 | 346 | std::string md_; 347 | 348 | Options option; 349 | 350 | // Tag: base class for tag types 351 | struct Tag { 352 | virtual void OnHasLeftOpeningTag(Converter *c) = 0; 353 | virtual void OnHasLeftClosingTag(Converter *c) = 0; 354 | }; 355 | 356 | // Tag types 357 | 358 | // tags that are not printed (nav, script, noscript, ...) 359 | struct TagIgnored : Tag { 360 | void OnHasLeftOpeningTag(Converter *c) override{}; 361 | void OnHasLeftClosingTag(Converter *c) override{}; 362 | }; 363 | 364 | struct TagAnchor : Tag { 365 | void OnHasLeftOpeningTag(Converter *c) override; 366 | void OnHasLeftClosingTag(Converter *c) override; 367 | 368 | std::string current_href_; 369 | std::string current_title_; 370 | }; 371 | 372 | struct TagBold : Tag { 373 | void OnHasLeftOpeningTag(Converter *c) override; 374 | void OnHasLeftClosingTag(Converter *c) override; 375 | }; 376 | 377 | struct TagItalic : Tag { 378 | void OnHasLeftOpeningTag(Converter *c) override; 379 | void OnHasLeftClosingTag(Converter *c) override; 380 | }; 381 | 382 | struct TagUnderline : Tag { 383 | void OnHasLeftOpeningTag(Converter *c) override; 384 | void OnHasLeftClosingTag(Converter *c) override; 385 | }; 386 | 387 | struct TagStrikethrought : Tag { 388 | void OnHasLeftOpeningTag(Converter *c) override; 389 | void OnHasLeftClosingTag(Converter *c) override; 390 | }; 391 | 392 | 
struct TagBreak : Tag { 393 | void OnHasLeftOpeningTag(Converter *c) override; 394 | void OnHasLeftClosingTag(Converter *c) override; 395 | }; 396 | 397 | struct TagDiv : Tag { 398 | void OnHasLeftOpeningTag(Converter *c) override; 399 | void OnHasLeftClosingTag(Converter *c) override; 400 | }; 401 | 402 | struct TagHeader1 : Tag { 403 | void OnHasLeftOpeningTag(Converter *c) override; 404 | void OnHasLeftClosingTag(Converter *c) override; 405 | }; 406 | 407 | struct TagHeader2 : Tag { 408 | void OnHasLeftOpeningTag(Converter *c) override; 409 | void OnHasLeftClosingTag(Converter *c) override; 410 | }; 411 | 412 | struct TagHeader3 : Tag { 413 | void OnHasLeftOpeningTag(Converter *c) override; 414 | void OnHasLeftClosingTag(Converter *c) override; 415 | }; 416 | 417 | struct TagHeader4 : Tag { 418 | void OnHasLeftOpeningTag(Converter *c) override; 419 | void OnHasLeftClosingTag(Converter *c) override; 420 | }; 421 | 422 | struct TagHeader5 : Tag { 423 | void OnHasLeftOpeningTag(Converter *c) override; 424 | void OnHasLeftClosingTag(Converter *c) override; 425 | }; 426 | 427 | struct TagHeader6 : Tag { 428 | void OnHasLeftOpeningTag(Converter *c) override; 429 | void OnHasLeftClosingTag(Converter *c) override; 430 | }; 431 | 432 | struct TagListItem : Tag { 433 | void OnHasLeftOpeningTag(Converter *c) override; 434 | void OnHasLeftClosingTag(Converter *c) override; 435 | }; 436 | 437 | struct TagOption : Tag { 438 | void OnHasLeftOpeningTag(Converter *c) override; 439 | void OnHasLeftClosingTag(Converter *c) override; 440 | }; 441 | 442 | struct TagOrderedList : Tag { 443 | void OnHasLeftOpeningTag(Converter *c) override; 444 | void OnHasLeftClosingTag(Converter *c) override; 445 | }; 446 | 447 | struct TagParagraph : Tag { 448 | void OnHasLeftOpeningTag(Converter *c) override; 449 | void OnHasLeftClosingTag(Converter *c) override; 450 | }; 451 | 452 | struct TagPre : Tag { 453 | void OnHasLeftOpeningTag(Converter *c) override; 454 | void 
OnHasLeftClosingTag(Converter *c) override; 455 | }; 456 | 457 | struct TagCode : Tag { 458 | void OnHasLeftOpeningTag(Converter *c) override; 459 | void OnHasLeftClosingTag(Converter *c) override; 460 | }; 461 | 462 | struct TagSpan : Tag { 463 | void OnHasLeftOpeningTag(Converter *c) override; 464 | void OnHasLeftClosingTag(Converter *c) override; 465 | }; 466 | 467 | struct TagTitle : Tag { 468 | void OnHasLeftOpeningTag(Converter *c) override; 469 | void OnHasLeftClosingTag(Converter *c) override; 470 | }; 471 | 472 | struct TagUnorderedList : Tag { 473 | void OnHasLeftOpeningTag(Converter *c) override; 474 | void OnHasLeftClosingTag(Converter *c) override; 475 | }; 476 | 477 | struct TagImage : Tag { 478 | void OnHasLeftOpeningTag(Converter *c) override; 479 | void OnHasLeftClosingTag(Converter *c) override; 480 | }; 481 | 482 | struct TagSeperator : Tag { 483 | void OnHasLeftOpeningTag(Converter *c) override; 484 | void OnHasLeftClosingTag(Converter *c) override; 485 | }; 486 | 487 | struct TagTable : Tag { 488 | void OnHasLeftOpeningTag(Converter *c) override; 489 | void OnHasLeftClosingTag(Converter *c) override; 490 | }; 491 | 492 | struct TagTableRow : Tag { 493 | void OnHasLeftOpeningTag(Converter *c) override; 494 | void OnHasLeftClosingTag(Converter *c) override; 495 | }; 496 | 497 | struct TagTableHeader : Tag { 498 | void OnHasLeftOpeningTag(Converter *c) override; 499 | void OnHasLeftClosingTag(Converter *c) override; 500 | }; 501 | 502 | struct TagTableData : Tag { 503 | void OnHasLeftOpeningTag(Converter *c) override; 504 | void OnHasLeftClosingTag(Converter *c) override; 505 | }; 506 | 507 | struct TagBlockquote : Tag { 508 | void OnHasLeftOpeningTag(Converter *c) override; 509 | void OnHasLeftClosingTag(Converter *c) override; 510 | }; 511 | 512 | std::unordered_map> tags_; 513 | 514 | explicit Converter(std::string *html, struct Options *options); 515 | 516 | void CleanUpMarkdown(); 517 | 518 | // Trim from start (in place) 519 | static void 
LTrim(std::string *s); 520 | 521 | // Trim from end (in place) 522 | Converter *RTrim(std::string *s, bool trim_only_blank = false); 523 | 524 | // Trim from both ends (in place) 525 | Converter *Trim(std::string *s); 526 | 527 | // 1. trim all lines 528 | // 2. reduce consecutive newlines to maximum 3 529 | void TidyAllLines(std::string *str); 530 | 531 | std::string ExtractAttributeFromTagLeftOf(const std::string &attr); 532 | 533 | void TurnLineIntoHeader1(); 534 | 535 | void TurnLineIntoHeader2(); 536 | 537 | // Current char: '<' 538 | void OnHasEnteredTag(); 539 | 540 | Converter *UpdatePrevChFromMd(); 541 | 542 | /** 543 | * Handle next char within <...> tag 544 | * 545 | * @param ch current character 546 | * @return continue surrounding iteration? 547 | */ 548 | bool ParseCharInTag(char ch); 549 | 550 | // Current char: '>' 551 | bool OnHasLeftTag(); 552 | 553 | inline static bool TagContainsAttributesToHide(std::string *tag) { 554 | using std::string; 555 | 556 | return (*tag).find(" aria=\"hidden\"") != string::npos || 557 | (*tag).find("display:none") != string::npos || 558 | (*tag).find("visibility:hidden") != string::npos || 559 | (*tag).find("opacity:0") != string::npos || 560 | (*tag).find("Details-content--hidden-not-important") != string::npos; 561 | } 562 | 563 | Converter *ShortenMarkdown(size_t chars = 1); 564 | inline bool shortIfPrevCh(char prev) { 565 | if (prev_ch_in_md_ == prev) { 566 | ShortenMarkdown(); 567 | return true; 568 | } 569 | return false; 570 | }; 571 | 572 | /** 573 | * @param ch 574 | * @return continue iteration surrounding this method's invocation? 
575 | */ 576 | bool ParseCharInTagContent(char ch); 577 | 578 | // Replace previous space (if any) in current markdown line by newline 579 | bool ReplacePreviousSpaceInLineByNewline(); 580 | 581 | static inline bool IsIgnoredTag(const std::string &tag) { 582 | return (tag.empty() || 583 | tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag || 584 | kTagScript == tag || kTagNoScript == tag || kTagNav == tag); 585 | 586 | // meta: not ignored to tolerate if closing is omitted 587 | } 588 | 589 | [[nodiscard]] bool IsInIgnoredTag() const; 590 | }; // Converter 591 | 592 | /*! 593 | * \brief Static wrapper around the Converter class 594 | * \param html The HTML passed to Converter 595 | * \param ok Optional: Pass a reference to a local bool to store the output of 596 | * Converter::ok() \return Returns the by Converter generated Markdown 597 | */ 598 | inline std::string Convert(std::string &html, bool *ok = nullptr) { 599 | Converter c(html); 600 | auto md = c.convert(); 601 | if (ok != nullptr) 602 | *ok = c.ok(); 603 | return md; 604 | } 605 | 606 | #ifndef PYTHON_BINDINGS 607 | inline std::string Convert(std::string &&html, bool *ok = nullptr) { 608 | return Convert(html, ok); 609 | } 610 | #endif 611 | 612 | } // namespace html2md 613 | 614 | #endif // HTML2MD_H 615 | -------------------------------------------------------------------------------- /include/table.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tim Gromeyer 2 | // Licensed under the MIT License - https://opensource.org/licenses/MIT 3 | 4 | #ifndef TABLE_H 5 | #define TABLE_H 6 | 7 | #include 8 | 9 | [[nodiscard]] std::string formatMarkdownTable(const std::string &inputTable); 10 | 11 | #endif // TABLE_H 12 | -------------------------------------------------------------------------------- /js/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "html2md.h" 3 | 4 | using 
namespace emscripten; 5 | 6 | EMSCRIPTEN_BINDINGS(html2md) { 7 | class_("Options") 8 | .constructor<>() 9 | .property("splitLines", &html2md::options::splitLines, &html2md::options::splitLines) 10 | .property("unorderedList", &html2md::options::unorderedList, &html2md::options::unorderedList) 11 | .property("orderedList", &html2md::options::orderedList, &html2md::options::orderedList) 12 | .property("includeTitle", &html2md::options::includeTitle, &html2md::options::includeTitle); 13 | 14 | class_("Converter") 15 | .constructor() 16 | .function("convert2Md", &html2md::Converter::Convert2Md) 17 | .function("ok", &html2md::Converter::ok); 18 | 19 | function("convert", &html2md::Convert); 20 | } 21 | 22 | -------------------------------------------------------------------------------- /objc/html2md_objc.mm: -------------------------------------------------------------------------------- 1 | // 2 | // html2md.m 3 | // html2md 4 | // 5 | // Created by 秋星桥 on 2/17/25. 6 | // 7 | 8 | #import 9 | 10 | #include "html2md.h" 11 | #include "include/html2md_objc.h" 12 | 13 | #include 14 | 15 | @implementation HTML2MD 16 | 17 | + (NSString *)convertHTMLToMarkdown:(NSString *)html { 18 | const char *htmlStr = [html UTF8String]; 19 | std::string outputMarkdown = html2md::Convert(htmlStr); 20 | NSString *markdownStr = [NSString stringWithUTF8String:outputMarkdown.c_str()]; 21 | return markdownStr; 22 | } 23 | 24 | @end 25 | -------------------------------------------------------------------------------- /objc/include/html2md_objc.h: -------------------------------------------------------------------------------- 1 | // 2 | // Header.h 3 | // html2md 4 | // 5 | // Created by 秋星桥 on 2/17/25. 
6 | // 7 | 8 | #ifndef html2md_objc_h 9 | #define html2md_objc_h 10 | 11 | #include 12 | 13 | @interface HTML2MD : NSObject 14 | 15 | + (NSString *)convertHTMLToMarkdown:(NSString *)html; 16 | 17 | @end 18 | 19 | #endif /* html2md_objc_h */ 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core", "pybind11>=2.12,<2.14"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "pyhtml2md" 7 | authors = [ 8 | { name = "Tim Gromeyer", email = "sakul8826@gmail.com" } 9 | ] 10 | description = "Transform your HTML into clean, easy-to-read markdown with pyhtml2md." 11 | readme = "python/README.md" 12 | requires-python = ">=3.7" 13 | license = { text = "MIT" } 14 | version = "1.6.4" 15 | classifiers = [ 16 | "Intended Audience :: Developers", 17 | "License :: OSI Approved :: MIT License", 18 | "Programming Language :: C++", 19 | "Programming Language :: Python", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3 :: Only", 22 | "Programming Language :: Python :: 3.7", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Topic :: File Formats", 26 | "Topic :: Text Processing :: Markup :: Markdown", 27 | "Topic :: Text Processing :: Markup :: HTML", 28 | ] 29 | keywords = [ 30 | "html", "markdown", "html-to-markdown", 31 | "python3", "cpp17", "cpp-library", 32 | "html2markdown", "html2md" 33 | ] 34 | 35 | [project.urls] 36 | Repository = "https://github.com/tim-gromeyer/html2md" 37 | 38 | [project.optional-dependencies] 39 | test = ["pytest>=6.0"] 40 | 41 | [tool.scikit-build] 42 | cmake.verbose = true 43 | logging.level = "INFO" 44 | minimum-version = "0.8" 45 | # TODO: Figure out when CMake added FindPython 46 | cmake.version = ">=3.12" 47 | 48 | [tool.scikit-build.cmake.define] 49 | 
PYTHON_BINDINGS = "ON" 50 | PYBIND11_FINDPYTHON = "ON" 51 | 52 | [tool.isort] 53 | profile = "black" 54 | 55 | [tool.mypy] 56 | files = "setup.py" 57 | python_version = "3.7" 58 | strict = true 59 | show_error_codes = true 60 | enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] 61 | warn_unreachable = true 62 | 63 | [[tool.mypy.overrides]] 64 | module = ["ninja"] 65 | ignore_missing_imports = true 66 | 67 | 68 | [tool.pytest.ini_options] 69 | minversion = "6.0" 70 | addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] 71 | xfail_strict = true 72 | filterwarnings = ["error"] 73 | testpaths = ["tests"] 74 | 75 | [tool.cibuildwheel] 76 | test-command = "pytest {project}/tests" 77 | test-extras = ["test"] 78 | test-skip = ["*universal2:arm64"] 79 | # Setuptools bug causes collision between pypy and cpython artifacts 80 | before-build = "rm -rf {project}/build" 81 | 82 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # pyhtml2md 2 | 3 | pyhtml2md provides a way to use the html2md C++ library in Python. html2md is a fast and reliable library for converting HTML content into markdown. 4 | 5 |
    6 | 7 | - [Installation](#installation) 8 | - [Basic usage](#basic-usage) 9 | - [Advanced usage](#advanced-usage) 10 | - [Supported Tags](#supported-tags) 11 | - [License](#license) 12 | 13 |
    14 | 15 | 20 | 21 | 22 | ## Installation 23 | 24 | You can install it using pip: 25 | 26 | ```bash 27 | pip3 install pyhtml2md 28 | ``` 29 | 30 | ## Basic usage 31 | 32 | Here is an example of how to use pyhtml2md to convert HTML to markdown: 33 | 34 | ```python 35 | import pyhtml2md 36 | 37 | markdown = pyhtml2md.convert("

    Hello, world!

    ") 38 | print(markdown) 39 | ``` 40 | 41 | The `convert` function takes an HTML string as input and returns a markdown string. 42 | 43 | ## Advanced usage 44 | 45 | pyhtml2md provides an `Options` class to customize the generation process. 46 | You can find all the details in the C++ [documentation](https://tim-gromeyer.github.io/html2md/index.html). 47 | 48 | Here is an example: 49 | 50 | ```python 51 | import pyhtml2md 52 | 53 | options = pyhtml2md.Options() 54 | options.splitLines = False 55 | 56 | converter = pyhtml2md.Converter("

    Hello Python!

    ", options) 57 | markdown = converter.convert() 58 | print(markdown) 59 | print(converter.ok()) 60 | ``` 61 | 62 | ## Supported Tags 63 | 64 | pyhtml2md supports the following HTML tags: 65 | 66 | | Tag | Description | Comment | 67 | |--------------|--------------------|-----------------------------------------------------| 68 | | `a` | Anchor or link | Supports the `href`, `name` and `title` attributes. | 69 | | `b` | Bold | | 70 | | `blockquote` | Indented paragraph | | 71 | | `br` | Line break | | 72 | | `cite` | Inline citation | Same as `i`. | 73 | | `code` | Code | | 74 | | `dd` | Definition data | | 75 | | `del` | Strikethrough | | 76 | | `dfn` | Definition | Same as `i`. | 77 | | `div` | Document division | | 78 | | `em` | Emphasized | Same as `i`. | 79 | | `h1` | Level 1 heading | | 80 | | `h2` | Level 2 heading | | 81 | | `h3` | Level 3 heading | | 82 | | `h4` | Level 4 heading | | 83 | | `h5` | Level 5 heading | | 84 | | `h6` | Level 6 heading | | 85 | | `head` | Document header | Ignored. | 86 | | `hr` | Horizontal line | | 87 | | `i` | Italic | | 88 | | `img` | Image | Supports `src`, `alt`, `title` attributes. | 89 | | `li` | List item | | 90 | | `meta` | Meta-information | Ignored. | 91 | | `ol` | Ordered list | | 92 | | `p` | Paragraph | | 93 | | `pre` | Preformatted text | Works only with `code`. | 94 | | `s` | Strikethrough | Same as `del`. | 95 | | `span` | Grouped elements | Does nothing. | 96 | | `strong` | Strong | Same as `b`. | 97 | | `table` | Table | Tables are formatted! | 98 | | `tbody` | Table body | Does nothing. | 99 | | `td` | Table data cell | Uses `align` from `th`. | 100 | | `tfoot` | Table footer | Does nothing. | 101 | | `th` | Table header cell | Supports the `align` attribute. | 102 | | `thead` | Table header | Does nothing. | 103 | | `title` | Document title | Same as `h1`. | 104 | | `tr` | Table row | | 105 | | `u` | Underlined | Uses HTML. 
| 106 | | `ul` | Unordered list | | 107 | 108 | ## License 109 | 110 | pyhtml2md is licensed under [The MIT License (MIT)](https://opensource.org/licenses/MIT) 111 | -------------------------------------------------------------------------------- /python/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | namespace py = pybind11; 4 | 5 | PYBIND11_MODULE(pyhtml2md, m) { 6 | m.doc() = "Python bindings for html2md"; // optional module docstring 7 | 8 | // Options class bindings 9 | py::class_(m, "Options") 10 | .def(py::init<>()) 11 | .def_readwrite( 12 | "splitLines", &html2md::Options::splitLines, 13 | "Add new line when a certain number of characters is reached") 14 | .def_readwrite("softBreak", &html2md::Options::softBreak, 15 | "Wrap after ... characters when the next space is reached") 16 | .def_readwrite("hardBreak", &html2md::Options::hardBreak, 17 | "Force a break after ... characters in a line") 18 | .def_readwrite("unorderedList", &html2md::Options::unorderedList, 19 | "The char used for unordered lists") 20 | .def_readwrite("orderedList", &html2md::Options::orderedList, 21 | "The char used after the number of the item") 22 | .def_readwrite("includeTitle", &html2md::Options::includeTitle, 23 | "Whether title is added as h1 heading at the very " 24 | "beginning of the markdown") 25 | .def_readwrite("formatTable", &html2md::Options::formatTable, 26 | "Whether to format Markdown Tables") 27 | .def("__eq__", &html2md::Options::operator==); 28 | 29 | py::class_(m, "Converter") 30 | .def(py::init(), 31 | "Class for converting HTML to Markdown", py::arg("html"), 32 | py::arg("options") = py::none()) 33 | .def("convert", &html2md::Converter::convert, 34 | "This function actually converts the HTML into Markdown.") 35 | .def("ok", &html2md::Converter::ok, 36 | "Checks if everything was closed properly(in the HTML).") 37 | .def("__call__", &html2md::Converter::operator bool); 38 | 39 | 
m.def("convert", &html2md::Convert, 40 | "Static wrapper around the Converter class", py::arg("html"), 41 | py::arg("ok") = py::none()); 42 | } 43 | -------------------------------------------------------------------------------- /scripts/clang-format.sh: -------------------------------------------------------------------------------- 1 | cd $(dirname "$0") 2 | cd .. 3 | clang-format -style=llvm -i cli/*.cpp include/*.h python/*.cpp src/*.cpp tests/*.cpp js/*.cpp 4 | -------------------------------------------------------------------------------- /src/html2md.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tim Gromeyer 2 | // Licensed under the MIT License - https://opensource.org/licenses/MIT 3 | 4 | #include "html2md.h" 5 | #include "table.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using std::make_shared; 14 | using std::string; 15 | using std::vector; 16 | 17 | namespace { 18 | bool startsWith(const string &str, const string &prefix) { 19 | return str.size() >= prefix.size() && 20 | 0 == str.compare(0, prefix.size(), prefix); 21 | } 22 | 23 | bool endsWith(const string &str, const string &suffix) { 24 | return str.size() >= suffix.size() && 25 | 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); 26 | } 27 | 28 | size_t ReplaceAll(string *haystack, const string &needle, 29 | const string &replacement) { 30 | // Get first occurrence 31 | size_t pos = (*haystack).find(needle); 32 | 33 | size_t amount_replaced = 0; 34 | 35 | // Repeat until end is reached 36 | while (pos != string::npos) { 37 | // Replace this occurrence of sub string 38 | (*haystack).replace(pos, needle.size(), replacement); 39 | 40 | // Get the next occurrence from the current position 41 | pos = (*haystack).find(needle, pos + replacement.size()); 42 | 43 | ++amount_replaced; 44 | } 45 | 46 | return amount_replaced; 47 | } 48 | 49 | size_t ReplaceAll(string *haystack, const string 
&needle, const char c) { 50 | return ReplaceAll(haystack, needle, string({c})); 51 | } 52 | 53 | // Split given string by given character delimiter into vector of strings 54 | vector Split(string const &str, char delimiter) { 55 | vector result; 56 | std::stringstream iss(str); 57 | 58 | for (string token; getline(iss, token, delimiter);) 59 | result.push_back(token); 60 | 61 | return result; 62 | } 63 | 64 | string Repeat(const string &str, size_t amount) { 65 | if (amount == 0) 66 | return ""; 67 | else if (amount == 1) 68 | return str; 69 | 70 | string out; 71 | 72 | for (size_t i = 0; i < amount; ++i) 73 | out.append(str); 74 | 75 | return out; 76 | } 77 | } // namespace 78 | 79 | namespace html2md { 80 | 81 | Converter::Converter(string *html, Options *options) : html_(*html) { 82 | if (options) 83 | option = *options; 84 | 85 | tags_.reserve(41); 86 | 87 | // non-printing tags 88 | auto tagIgnored = make_shared(); 89 | tags_[kTagHead] = tagIgnored; 90 | tags_[kTagMeta] = tagIgnored; 91 | tags_[kTagNav] = tagIgnored; 92 | tags_[kTagNoScript] = tagIgnored; 93 | tags_[kTagScript] = tagIgnored; 94 | tags_[kTagStyle] = tagIgnored; 95 | tags_[kTagTemplate] = tagIgnored; 96 | 97 | // printing tags 98 | tags_[kTagAnchor] = make_shared(); 99 | tags_[kTagBreak] = make_shared(); 100 | tags_[kTagDiv] = make_shared(); 101 | tags_[kTagHeader1] = make_shared(); 102 | tags_[kTagHeader2] = make_shared(); 103 | tags_[kTagHeader3] = make_shared(); 104 | tags_[kTagHeader4] = make_shared(); 105 | tags_[kTagHeader5] = make_shared(); 106 | tags_[kTagHeader6] = make_shared(); 107 | tags_[kTagListItem] = make_shared(); 108 | tags_[kTagOption] = make_shared(); 109 | tags_[kTagOrderedList] = make_shared(); 110 | tags_[kTagPre] = make_shared(); 111 | tags_[kTagCode] = make_shared(); 112 | tags_[kTagParagraph] = make_shared(); 113 | tags_[kTagSpan] = make_shared(); 114 | tags_[kTagUnorderedList] = make_shared(); 115 | tags_[kTagTitle] = make_shared(); 116 | tags_[kTagImg] = 
make_shared(); 117 | tags_[kTagSeperator] = make_shared(); 118 | 119 | // Text formatting 120 | auto tagBold = make_shared(); 121 | tags_[kTagBold] = tagBold; 122 | tags_[kTagStrong] = tagBold; 123 | 124 | auto tagItalic = make_shared(); 125 | tags_[kTagItalic] = tagItalic; 126 | tags_[kTagItalic2] = tagItalic; 127 | tags_[kTagDefinition] = tagItalic; 128 | tags_[kTagCitation] = tagItalic; 129 | 130 | tags_[kTagUnderline] = make_shared(); 131 | 132 | auto tagStrighthrought = make_shared(); 133 | tags_[kTagStrighthrought] = tagStrighthrought; 134 | tags_[kTagStrighthrought2] = tagStrighthrought; 135 | 136 | tags_[kTagBlockquote] = make_shared(); 137 | 138 | // Tables 139 | tags_[kTagTable] = make_shared(); 140 | tags_[kTagTableRow] = make_shared(); 141 | tags_[kTagTableHeader] = make_shared(); 142 | tags_[kTagTableData] = make_shared(); 143 | } 144 | 145 | void Converter::CleanUpMarkdown() { 146 | TidyAllLines(&md_); 147 | 148 | ReplaceAll(&md_, " , ", ", "); 149 | 150 | ReplaceAll(&md_, "\n.\n", ".\n"); 151 | ReplaceAll(&md_, "\n↵\n", " ↵\n"); 152 | ReplaceAll(&md_, "\n*\n", "\n"); 153 | ReplaceAll(&md_, "\n. 
", ".\n"); 154 | 155 | ReplaceAll(&md_, """, '"'); 156 | ReplaceAll(&md_, "<", "<"); 157 | ReplaceAll(&md_, ">", ">"); 158 | ReplaceAll(&md_, "&", '&'); 159 | ReplaceAll(&md_, " ", ' '); 160 | ReplaceAll(&md_, "→", "→"); 161 | 162 | ReplaceAll(&md_, "\t\t ", "\t\t"); 163 | } 164 | 165 | Converter *Converter::appendToMd(char ch) { 166 | if (IsInIgnoredTag()) 167 | return this; 168 | 169 | if (index_blockquote != 0 && ch == '\n') { 170 | if (is_in_pre_) { 171 | md_ += ch; 172 | chars_in_curr_line_ = 0; 173 | appendToMd(Repeat("> ", index_blockquote)); 174 | } 175 | 176 | return this; 177 | } 178 | 179 | md_ += ch; 180 | 181 | if (ch == '\n') 182 | chars_in_curr_line_ = 0; 183 | else 184 | ++chars_in_curr_line_; 185 | 186 | return this; 187 | } 188 | 189 | Converter *Converter::appendToMd(const char *str) 190 | { 191 | if (IsInIgnoredTag()) 192 | return this; 193 | 194 | md_ += str; 195 | 196 | auto str_len = strlen(str); 197 | 198 | for (auto i = 0; i < str_len; ++i) { 199 | if (str[i] == '\n') 200 | chars_in_curr_line_ = 0; 201 | else 202 | ++chars_in_curr_line_; 203 | } 204 | 205 | return this; 206 | } 207 | 208 | Converter *Converter::appendBlank() { 209 | UpdatePrevChFromMd(); 210 | 211 | if (prev_ch_in_md_ == '\n' || 212 | (prev_ch_in_md_ == '*' && prev_prev_ch_in_md_ == '*')) 213 | return this; 214 | 215 | return appendToMd(' '); 216 | } 217 | 218 | bool Converter::ok() const { 219 | return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ && 220 | !is_in_tag_ && index_blockquote == 0 && index_li == 0; 221 | } 222 | 223 | void Converter::LTrim(string *s) { 224 | (*s).erase((*s).begin(), 225 | find_if((*s).begin(), (*s).end(), 226 | [](unsigned char ch) { return !std::isspace(ch); })); 227 | } 228 | 229 | Converter *Converter::RTrim(string *s, bool trim_only_blank) { 230 | (*s).erase(find_if((*s).rbegin(), (*s).rend(), 231 | [trim_only_blank](unsigned char ch) { 232 | if (trim_only_blank) 233 | return !isblank(ch); 234 | 235 | return !isspace(ch); 236 | 
}) 237 | .base(), 238 | (*s).end()); 239 | 240 | return this; 241 | } 242 | 243 | // NOTE: Pay attention when changing one of the trim functions. It can break the 244 | // output! 245 | Converter *Converter::Trim(string *s) { 246 | if (!startsWith(*s, "\t")) 247 | LTrim(s); 248 | 249 | if (!(startsWith(*s, " "), endsWith(*s, " "))) 250 | RTrim(s); 251 | 252 | return this; 253 | } 254 | 255 | void Converter::TidyAllLines(string *str) { 256 | auto lines = Split(*str, '\n'); 257 | string res; 258 | 259 | uint8_t amount_newlines = 0; 260 | bool in_code_block = false; 261 | 262 | for (auto line : lines) { 263 | if (startsWith(line, "```") || startsWith(line, "~~~")) 264 | in_code_block = !in_code_block; 265 | if (in_code_block) { 266 | res += line + '\n'; 267 | continue; 268 | } 269 | 270 | Trim(&line); 271 | 272 | if (line.empty()) { 273 | if (amount_newlines < 2 && !res.empty()) { 274 | res += '\n'; 275 | amount_newlines++; 276 | } 277 | } else { 278 | amount_newlines = 0; 279 | 280 | res += line + '\n'; 281 | } 282 | } 283 | 284 | *str = res; 285 | } 286 | 287 | string Converter::ExtractAttributeFromTagLeftOf(const string &attr) { 288 | // Extract the whole tag from current offset, e.g. 
from '>', backwards 289 | auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_); 290 | 291 | // locate given attribute 292 | auto offset_attr = tag.find(attr); 293 | 294 | if (offset_attr == string::npos) 295 | return ""; 296 | 297 | // locate attribute-value pair's '=' 298 | auto offset_equals = tag.find('=', offset_attr); 299 | 300 | if (offset_equals == string::npos) 301 | return ""; 302 | 303 | // locate value's surrounding quotes 304 | auto offset_double_quote = tag.find('"', offset_equals); 305 | auto offset_single_quote = tag.find('\'', offset_equals); 306 | 307 | bool has_double_quote = offset_double_quote != string::npos; 308 | bool has_single_quote = offset_single_quote != string::npos; 309 | 310 | if (!has_double_quote && !has_single_quote) 311 | return ""; 312 | 313 | char wrapping_quote = 0; 314 | 315 | size_t offset_opening_quote = 0; 316 | size_t offset_closing_quote = 0; 317 | 318 | if (has_double_quote) { 319 | if (!has_single_quote) { 320 | wrapping_quote = '"'; 321 | offset_opening_quote = offset_double_quote; 322 | } else { 323 | if (offset_double_quote < offset_single_quote) { 324 | wrapping_quote = '"'; 325 | offset_opening_quote = offset_double_quote; 326 | } else { 327 | wrapping_quote = '\''; 328 | offset_opening_quote = offset_single_quote; 329 | } 330 | } 331 | } else { 332 | // has only single quote 333 | wrapping_quote = '\''; 334 | offset_opening_quote = offset_single_quote; 335 | } 336 | 337 | if (offset_opening_quote == string::npos) 338 | return ""; 339 | 340 | offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1); 341 | 342 | if (offset_closing_quote == string::npos) 343 | return ""; 344 | 345 | return tag.substr(offset_opening_quote + 1, 346 | offset_closing_quote - 1 - offset_opening_quote); 347 | } 348 | 349 | void Converter::TurnLineIntoHeader1() { 350 | appendToMd('\n' + Repeat("=", chars_in_curr_line_) + "\n\n"); 351 | 352 | chars_in_curr_line_ = 0; 353 | } 354 | 355 | void 
Converter::TurnLineIntoHeader2() { 356 | appendToMd('\n' + Repeat("-", chars_in_curr_line_) + "\n\n"); 357 | 358 | chars_in_curr_line_ = 0; 359 | } 360 | 361 | string Converter::convert() { 362 | // We already converted 363 | if (index_ch_in_html_ == html_.size()) 364 | return md_; 365 | 366 | reset(); 367 | 368 | for (char ch : html_) { 369 | ++index_ch_in_html_; 370 | 371 | if (!is_in_tag_ && ch == '<') { 372 | OnHasEnteredTag(); 373 | 374 | continue; 375 | } 376 | 377 | if (is_in_tag_) 378 | ParseCharInTag(ch); 379 | else 380 | ParseCharInTagContent(ch); 381 | } 382 | 383 | CleanUpMarkdown(); 384 | 385 | return md_; 386 | } 387 | 388 | void Converter::OnHasEnteredTag() { 389 | offset_lt_ = index_ch_in_html_; 390 | is_in_tag_ = true; 391 | prev_tag_ = current_tag_; 392 | current_tag_ = ""; 393 | 394 | if (!md_.empty()) { 395 | UpdatePrevChFromMd(); 396 | } 397 | } 398 | 399 | Converter *Converter::UpdatePrevChFromMd() { 400 | if (!md_.empty()) { 401 | prev_ch_in_md_ = md_[md_.length() - 1]; 402 | 403 | if (md_.length() > 1) 404 | prev_prev_ch_in_md_ = md_[md_.length() - 2]; 405 | } 406 | 407 | return this; 408 | } 409 | 410 | bool Converter::ParseCharInTag(char ch) { 411 | if (ch == '/' && !is_in_attribute_value_) { 412 | is_closing_tag_ = current_tag_.empty(); 413 | is_self_closing_tag_ = !is_closing_tag_; 414 | 415 | return true; 416 | } 417 | 418 | if (ch == '>') 419 | return OnHasLeftTag(); 420 | 421 | if (ch == '"') { 422 | if (is_in_attribute_value_) { 423 | is_in_attribute_value_ = false; 424 | } else if (current_tag_[current_tag_.length() - 1] == '=') { 425 | is_in_attribute_value_ = true; 426 | } 427 | 428 | return true; 429 | } 430 | 431 | current_tag_ += ch; 432 | 433 | return false; 434 | } 435 | 436 | bool Converter::OnHasLeftTag() { 437 | is_in_tag_ = false; 438 | 439 | UpdatePrevChFromMd(); 440 | 441 | if (!is_closing_tag_) 442 | if (TagContainsAttributesToHide(¤t_tag_)) 443 | return true; 444 | 445 | auto cut_tags = Split(current_tag_, ' '); 446 | 
if (cut_tags.empty()) 447 | return true; 448 | 449 | current_tag_ = cut_tags[0]; 450 | 451 | auto tag = tags_[current_tag_]; 452 | 453 | if (!tag) 454 | return true; 455 | 456 | if (!is_closing_tag_) { 457 | tag->OnHasLeftOpeningTag(this); 458 | } 459 | if (is_closing_tag_ || is_self_closing_tag_) { 460 | is_closing_tag_ = false; 461 | 462 | tag->OnHasLeftClosingTag(this); 463 | } 464 | 465 | return true; 466 | } 467 | 468 | Converter *Converter::ShortenMarkdown(size_t chars) { 469 | md_ = md_.substr(0, md_.length() - chars); 470 | 471 | if (chars > chars_in_curr_line_) 472 | chars_in_curr_line_ = 0; 473 | else 474 | chars_in_curr_line_ = chars_in_curr_line_ - chars; 475 | 476 | return this->UpdatePrevChFromMd(); 477 | } 478 | 479 | bool Converter::ParseCharInTagContent(char ch) { 480 | if (is_in_code_) { 481 | md_ += ch; 482 | 483 | if (index_blockquote != 0 && ch == '\n') 484 | appendToMd(Repeat("> ", index_blockquote)); 485 | 486 | return true; 487 | } 488 | 489 | if (IsInIgnoredTag() || current_tag_ == kTagLink) { 490 | prev_ch_in_html_ = ch; 491 | 492 | return true; 493 | } 494 | 495 | if (ch == '\n') { 496 | if (index_blockquote != 0) { 497 | md_ += '\n'; 498 | chars_in_curr_line_ = 0; 499 | appendToMd(Repeat("> ", index_blockquote)); 500 | } 501 | 502 | return true; 503 | } 504 | 505 | switch (ch) { 506 | case '*': 507 | appendToMd("\\*"); 508 | break; 509 | case '`': 510 | appendToMd("\\`"); 511 | break; 512 | case '\\': 513 | appendToMd("\\\\"); 514 | break; 515 | default: 516 | md_ += ch; 517 | ++chars_in_curr_line_; 518 | break; 519 | } 520 | 521 | if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ && 522 | current_tag_ != kTagImg && current_tag_ != kTagAnchor && 523 | option.splitLines) { 524 | if (ch == ' ') { // If the next char is - it will become a list 525 | md_ += '\n'; 526 | chars_in_curr_line_ = 0; 527 | } else if (chars_in_curr_line_ > option.hardBreak) { 528 | ReplacePreviousSpaceInLineByNewline(); 529 | } 530 | } 531 
| 532 | return false; 533 | } 534 | 535 | bool Converter::ReplacePreviousSpaceInLineByNewline() { 536 | if (current_tag_ == kTagParagraph || 537 | is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre)) 538 | return false; 539 | 540 | auto offset = md_.length() - 1; 541 | 542 | if (md_.length() == 0) 543 | return true; 544 | 545 | do { 546 | if (md_[offset] == '\n') 547 | return false; 548 | 549 | if (md_[offset] == ' ') { 550 | md_[offset] = '\n'; 551 | chars_in_curr_line_ = md_.length() - offset; 552 | 553 | return true; 554 | } 555 | 556 | --offset; 557 | } while (offset > 0); 558 | 559 | return false; 560 | } 561 | 562 | void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) { 563 | if (c->prev_tag_ == kTagImg) 564 | c->appendToMd('\n'); 565 | 566 | current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle); 567 | 568 | c->appendToMd('['); 569 | current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref); 570 | } 571 | 572 | void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) { 573 | if (!c->shortIfPrevCh('[')) { 574 | c->appendToMd("](")->appendToMd(current_href_); 575 | 576 | // If title is set append it 577 | if (!current_title_.empty()) { 578 | c->appendToMd(" \"")->appendToMd(current_title_)->appendToMd('"'); 579 | current_title_.clear(); 580 | } 581 | 582 | c->appendToMd(')'); 583 | 584 | if (c->prev_tag_ == kTagImg) 585 | c->appendToMd('\n'); 586 | } 587 | } 588 | 589 | void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) { 590 | c->appendToMd("**"); 591 | } 592 | 593 | void Converter::TagBold::OnHasLeftClosingTag(Converter *c) { 594 | c->appendToMd("**"); 595 | } 596 | 597 | void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) { 598 | c->appendToMd('*'); 599 | } 600 | 601 | void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) { 602 | c->appendToMd('*'); 603 | } 604 | 605 | void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) { 606 | c->appendToMd(""); 607 | } 608 | 609 | void 
Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) { 610 | c->appendToMd(""); 611 | } 612 | 613 | void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) { 614 | c->appendToMd('~'); 615 | } 616 | 617 | void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) { 618 | c->appendToMd('~'); 619 | } 620 | 621 | void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) { 622 | if (c->is_in_list_) { // When it's in a list, it's not in a paragraph 623 | c->appendToMd(" \n"); 624 | c->appendToMd(Repeat(" ", c->index_li)); 625 | } else if (c->is_in_table_) { 626 | c->appendToMd("
    "); 627 | } else if (!c->is_in_p_ && !c->prev_tag_.empty()) { 628 | c->appendToMd("\n
    \n\n"); 629 | } else if (c->md_.length() > 0) 630 | c->appendToMd(" \n"); 631 | } 632 | 633 | void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {} 634 | 635 | void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) { 636 | if (c->prev_ch_in_md_ != '\n') 637 | c->appendToMd('\n'); 638 | 639 | if (c->prev_prev_ch_in_md_ != '\n') 640 | c->appendToMd('\n'); 641 | } 642 | 643 | void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {} 644 | 645 | void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) { 646 | c->appendToMd("\n# "); 647 | } 648 | 649 | void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) { 650 | if (c->prev_prev_ch_in_md_ != ' ') 651 | c->appendToMd('\n'); 652 | } 653 | 654 | void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) { 655 | c->appendToMd("\n## "); 656 | } 657 | 658 | void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) { 659 | if (c->prev_prev_ch_in_md_ != ' ') 660 | c->appendToMd('\n'); 661 | } 662 | 663 | void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) { 664 | c->appendToMd("\n### "); 665 | } 666 | 667 | void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) { 668 | if (c->prev_prev_ch_in_md_ != ' ') 669 | c->appendToMd('\n'); 670 | } 671 | 672 | void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) { 673 | c->appendToMd("\n#### "); 674 | } 675 | 676 | void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) { 677 | if (c->prev_prev_ch_in_md_ != ' ') 678 | c->appendToMd('\n'); 679 | } 680 | 681 | void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) { 682 | c->appendToMd("\n##### "); 683 | } 684 | 685 | void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) { 686 | if (c->prev_prev_ch_in_md_ != ' ') 687 | c->appendToMd('\n'); 688 | } 689 | 690 | void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) { 691 | c->appendToMd("\n###### "); 692 | } 693 | 694 | void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) { 695 | if 
(c->prev_prev_ch_in_md_ != ' ') 696 | c->appendToMd('\n'); 697 | } 698 | 699 | void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) { 700 | if (c->is_in_table_) 701 | return; 702 | 703 | if (!c->is_in_ordered_list_) { 704 | c->appendToMd(string({c->option.unorderedList, ' '})); 705 | return; 706 | } 707 | 708 | ++c->index_ol; 709 | 710 | string num = std::to_string(c->index_ol); 711 | num.append({c->option.orderedList, ' '}); 712 | c->appendToMd(num); 713 | } 714 | 715 | void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) { 716 | if (c->is_in_table_) 717 | return; 718 | 719 | if (c->prev_ch_in_md_ != '\n') 720 | c->appendToMd('\n'); 721 | } 722 | 723 | void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {} 724 | 725 | void Converter::TagOption::OnHasLeftClosingTag(Converter *c) { 726 | if (c->md_.length() > 0) 727 | c->appendToMd(" \n"); 728 | } 729 | 730 | void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) { 731 | if (c->is_in_table_) 732 | return; 733 | 734 | c->is_in_list_ = true; 735 | c->is_in_ordered_list_ = true; 736 | c->index_ol = 0; 737 | 738 | ++c->index_li; 739 | 740 | c->ReplacePreviousSpaceInLineByNewline(); 741 | 742 | c->appendToMd('\n'); 743 | } 744 | 745 | void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) { 746 | if (c->is_in_table_) 747 | return; 748 | 749 | c->is_in_ordered_list_ = false; 750 | 751 | if (c->index_li != 0) 752 | --c->index_li; 753 | 754 | c->is_in_list_ = c->index_li != 0; 755 | 756 | c->appendToMd('\n'); 757 | } 758 | 759 | void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) { 760 | c->is_in_p_ = true; 761 | 762 | if (c->is_in_list_ && c->prev_tag_ == kTagParagraph) 763 | c->appendToMd("\n\t"); 764 | else if (!c->is_in_list_) 765 | c->appendToMd('\n'); 766 | } 767 | 768 | void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) { 769 | c->is_in_p_ = false; 770 | 771 | if (!c->md_.empty()) 772 | c->appendToMd("\n"); // Workaround \n restriction for 
blockquotes 773 | 774 | if (c->index_blockquote != 0) 775 | c->appendToMd(Repeat("> ", c->index_blockquote)); 776 | } 777 | 778 | void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) { 779 | c->is_in_pre_ = true; 780 | 781 | if (c->prev_ch_in_md_ != '\n') 782 | c->appendToMd('\n'); 783 | 784 | if (c->prev_prev_ch_in_md_ != '\n') 785 | c->appendToMd('\n'); 786 | 787 | if (c->is_in_list_ && c->prev_tag_ != kTagParagraph) 788 | c->ShortenMarkdown(2); 789 | 790 | if (c->is_in_list_) 791 | c->appendToMd("\t\t"); 792 | else 793 | c->appendToMd("```"); 794 | } 795 | 796 | void Converter::TagPre::OnHasLeftClosingTag(Converter *c) { 797 | c->is_in_pre_ = false; 798 | 799 | if (c->is_in_list_) 800 | return; 801 | 802 | c->appendToMd("```"); 803 | c->appendToMd('\n'); // Don't combine because of blockquote 804 | } 805 | 806 | void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) { 807 | c->is_in_code_ = true; 808 | 809 | if (c->is_in_pre_) { 810 | if (c->is_in_list_) 811 | return; 812 | 813 | auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass); 814 | if (!code.empty()) { 815 | if (startsWith(code, "language-")) 816 | code.erase(0, 9); // remove language- 817 | c->appendToMd(code); 818 | } 819 | c->appendToMd('\n'); 820 | } else 821 | c->appendToMd('`'); 822 | } 823 | 824 | void Converter::TagCode::OnHasLeftClosingTag(Converter *c) { 825 | c->is_in_code_ = false; 826 | 827 | if (c->is_in_pre_) 828 | return; 829 | 830 | c->appendToMd('`'); 831 | } 832 | 833 | void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {} 834 | 835 | void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {} 836 | 837 | void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {} 838 | 839 | void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) { 840 | c->TurnLineIntoHeader1(); 841 | } 842 | 843 | void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) { 844 | if (c->is_in_list_ || c->is_in_table_) 845 | return; 846 | 847 | c->is_in_list_ = true; 848 | 
849 | ++c->index_li; 850 | 851 | c->appendToMd('\n'); 852 | } 853 | 854 | void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) { 855 | if (c->is_in_table_) 856 | return; 857 | 858 | if (c->index_li != 0) 859 | --c->index_li; 860 | 861 | c->is_in_list_ = c->index_li != 0; 862 | 863 | if (c->prev_prev_ch_in_md_ == '\n' && c->prev_ch_in_md_ == '\n') 864 | c->ShortenMarkdown(); 865 | else if (c->prev_ch_in_md_ != '\n') 866 | c->appendToMd('\n'); 867 | } 868 | 869 | void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) { 870 | if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ != '\n') 871 | c->appendToMd('\n'); 872 | 873 | c->appendToMd("![") 874 | ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt)) 875 | ->appendToMd("](") 876 | ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc)); 877 | 878 | auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle); 879 | if (!title.empty()) { 880 | c->appendToMd(" \"")->appendToMd(title)->appendToMd('"'); 881 | } 882 | 883 | c->appendToMd(")"); 884 | } 885 | 886 | void Converter::TagImage::OnHasLeftClosingTag(Converter *c) { 887 | if (c->prev_tag_ == kTagAnchor) 888 | c->appendToMd('\n'); 889 | } 890 | 891 | void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) { 892 | c->appendToMd("\n---\n"); // NOTE: We can make this an option 893 | } 894 | 895 | void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {} 896 | 897 | void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) { 898 | c->is_in_table_ = true; 899 | c->appendToMd('\n'); 900 | c->table_start = c->md_.length(); 901 | } 902 | 903 | void Converter::TagTable::OnHasLeftClosingTag(Converter *c) { 904 | c->is_in_table_ = false; 905 | c->appendToMd('\n'); 906 | 907 | if (!c->option.formatTable) 908 | return; 909 | 910 | string table = c->md_.substr(c->table_start); 911 | table = formatMarkdownTable(table); 912 | c->ShortenMarkdown(c->md_.size() - c->table_start); 913 | c->appendToMd(table); 914 | } 915 | 916 | 
void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) { 917 | c->appendToMd('\n'); 918 | } 919 | 920 | void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) { 921 | c->UpdatePrevChFromMd(); 922 | if (c->prev_ch_in_md_ == '|') 923 | c->appendToMd('\n'); // There's a bug 924 | else 925 | c->appendToMd('|'); 926 | 927 | if (!c->tableLine.empty()) { 928 | if (c->prev_ch_in_md_ != '\n') 929 | c->appendToMd('\n'); 930 | 931 | c->tableLine.append("|\n"); 932 | c->appendToMd(c->tableLine); 933 | c->tableLine.clear(); 934 | } 935 | } 936 | 937 | void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) { 938 | auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign); 939 | 940 | string line = "| "; 941 | 942 | if (align == "left" || align == "center") 943 | line += ':'; 944 | 945 | line += '-'; 946 | 947 | if (align == "right" || align == "center") 948 | line += ": "; 949 | else 950 | line += ' '; 951 | 952 | c->tableLine.append(line); 953 | 954 | c->appendToMd("| "); 955 | } 956 | 957 | void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {} 958 | 959 | void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) { 960 | if (c->prev_prev_ch_in_md_ != '|') 961 | c->appendToMd("| "); 962 | } 963 | 964 | void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {} 965 | 966 | void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) { 967 | ++c->index_blockquote; 968 | } 969 | 970 | void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) { 971 | --c->index_blockquote; 972 | c->ShortenMarkdown(2); // Remove the '> ' 973 | } 974 | 975 | void Converter::reset() { 976 | md_.clear(); 977 | prev_ch_in_md_ = 0; 978 | prev_prev_ch_in_md_ = 0; 979 | index_ch_in_html_ = 0; 980 | } 981 | 982 | bool Converter::IsInIgnoredTag() const { 983 | if (current_tag_ == kTagTitle && !option.includeTitle) 984 | return true; 985 | 986 | return IsIgnoredTag(current_tag_); 987 | } 988 | } // namespace html2md 989 | 
-------------------------------------------------------------------------------- /src/table.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tim Gromeyer 2 | // Licensed under the MIT License - https://opensource.org/licenses/MIT 3 | 4 | #include "table.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using std::string; 12 | using std::vector; 13 | 14 | const size_t MIN_LINE_LENGTH = 3; // Minimum length of line 15 | 16 | void removeLeadingTrailingSpaces(string &str) { 17 | size_t firstNonSpace = str.find_first_not_of(' '); 18 | if (firstNonSpace == string::npos) { 19 | str.clear(); // Entire string is spaces 20 | return; 21 | } 22 | 23 | size_t lastNonSpace = str.find_last_not_of(' '); 24 | str = str.substr(firstNonSpace, lastNonSpace - firstNonSpace + 1); 25 | } 26 | 27 | string enlargeTableHeaderLine(const string &str, size_t length) { 28 | if (str.empty() || length < MIN_LINE_LENGTH) 29 | return ""; 30 | 31 | size_t first = str.find_first_of(':'); 32 | size_t last = str.find_last_of(':'); 33 | 34 | if (first == 0 && first == last) 35 | last = string::npos; 36 | 37 | string line = string(length, '-'); 38 | 39 | if (first == 0) 40 | line[0] = ':'; 41 | if (last == str.length() - 1) 42 | line[length - 1] = ':'; 43 | 44 | return line; 45 | } 46 | 47 | string formatMarkdownTable(const string &inputTable) { 48 | std::istringstream iss(inputTable); 49 | string line; 50 | vector> tableData; 51 | 52 | // Parse the input table into a 2D vector 53 | while (std::getline(iss, line)) { 54 | std::istringstream lineStream(line); 55 | string cell; 56 | vector rowData; 57 | 58 | while (std::getline(lineStream, cell, '|')) { 59 | if (!cell.empty()) { 60 | removeLeadingTrailingSpaces(cell); // Use the trim function 61 | rowData.push_back(cell); 62 | } 63 | } 64 | 65 | if (!rowData.empty()) { 66 | tableData.push_back(std::move(rowData)); // Move rowData to avoid copying 67 | } 68 | } 69 | 70 | if 
(tableData.empty()) { 71 | return ""; 72 | } 73 | 74 | // Determine maximum width of each column 75 | vector columnWidths(tableData[0].size(), 0); 76 | for (const auto &row : tableData) { 77 | if (columnWidths.size() < row.size()) { 78 | columnWidths.resize(row.size(), 0); 79 | } 80 | 81 | for (size_t i = 0; i < row.size(); ++i) { 82 | columnWidths[i] = std::max(columnWidths[i], row[i].size()); 83 | } 84 | } 85 | 86 | // Build the formatted table 87 | std::ostringstream formattedTable; 88 | for (size_t rowNumber = 0; rowNumber < tableData.size(); ++rowNumber) { 89 | const auto &row = tableData[rowNumber]; 90 | 91 | formattedTable << "|"; 92 | 93 | for (size_t i = 0; i < row.size(); ++i) { 94 | if (rowNumber == 1) { 95 | formattedTable << enlargeTableHeaderLine(row[i], columnWidths[i] + 2) 96 | << "|"; 97 | continue; 98 | } 99 | formattedTable << " " << std::setw(columnWidths[i]) << std::left << row[i] 100 | << " |"; 101 | } 102 | formattedTable << "\n"; 103 | } 104 | 105 | return formattedTable.str(); 106 | } 107 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(tests LANGUAGES C CXX) 2 | 3 | if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/md4c/src/) 4 | include(FindGit) 5 | 6 | if(NOT GIT_FOUND) 7 | message(WARNING "git not found. 
Please download md4c manually or disable tests.") 8 | return() 9 | endif() 10 | 11 | get_directory_property(dir PARENT_DIRECTORY) 12 | 13 | execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --depth=1 14 | WORKING_DIRECTORY ${dir}) 15 | endif() 16 | 17 | set(MD4C_FILES 18 | md4c/src/entity.c 19 | md4c/src/entity.h 20 | md4c/src/md4c-html.c 21 | md4c/src/md4c-html.h 22 | md4c/src/md4c.c 23 | md4c/src/md4c.h 24 | ) 25 | 26 | add_library(md4c-html STATIC ${MD4C_FILES}) 27 | target_include_directories(md4c-html PUBLIC md4c/src) 28 | 29 | add_executable(test-exe main.cpp) 30 | target_link_libraries(test-exe md4c-html html2md-static) 31 | target_compile_definitions(test-exe PUBLIC DIR="${CMAKE_CURRENT_LIST_DIR}") 32 | set_target_properties(test-exe PROPERTIES OUTPUT_NAME "tests") 33 | target_compile_features(test-exe PUBLIC cxx_std_17) # Require at least c++17 34 | 35 | 36 | if (CMAKE_VERSION VERSION_LESS 3.11.0) 37 | return() 38 | endif() 39 | 40 | add_custom_target(test 41 | COMMAND $ 42 | COMMENT Runing tests.. 43 | DEPENDS test-exe 44 | ) 45 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## How does the test work? 2 | 3 | The test program searches this directory for files ending with `.md`. 4 | 5 | 1. It then converts the Markdown to HTML using [md4c](https://github.com/tim-gromeyer/MarkdownEdit_md4c). 6 | 2. Afterwards it converts the HTML back to Markdown. 7 | 3. The generated Markdown gets converted back to HTML. 8 | 4. It compares the HTML generated from the original Markdown 9 | and the HTML generated from the converted Markdown. 
10 | -------------------------------------------------------------------------------- /tests/blockquote.md: -------------------------------------------------------------------------------- 1 | # Blockquote Demonstration 2 | 3 | Blockquotes can contain various Markdown elements, including code blocks and other formatting. 4 | 5 | ## Syntax 6 | 7 | You can create a blockquote by adding a `>` character before the quoted text. You can also nest blockquotes by using multiple `>` characters. 8 | 9 | ### Example 10 | 11 | > This is a simple blockquote. 12 | > It can span multiple lines. 13 | 14 | You can also include other Markdown elements within blockquotes: 15 | 16 | > Here's a list: 17 | > - Item 1 18 | > - Item 2 19 | > - Item 3 20 | 21 | And you can nest blockquotes as well: 22 | 23 | > This is a level 1 blockquote. 24 | > 25 | > > This is a nested level 2 blockquote. 26 | > > 27 | > > > This is a nested level 3 blockquote. 28 | 29 | ## Code Blocks 30 | 31 | You can include code blocks within blockquotes: 32 | 33 | > Here's an example of a code block: 34 | > 35 | > ``` 36 | > def greet(name): 37 | > print(f"Hello, {name}!") 38 | > ``` 39 | > 40 | > And here's inline code: `print("Markdown is great!")` 41 | 42 | ## Links and Images 43 | 44 | Links and images can also be included in blockquotes: 45 | 46 | > Check out the [Markdown Guide](https://www.markdownguide.org/) for more information. 47 | > 48 | > ![Markdown Logo](https://markdown-here.com/img/icon256.png) 49 | 50 | ## Conclusion 51 | 52 | Blockquotes are a versatile tool in Markdown that allow you to emphasize and format various types of content within a quoted context. 53 | -------------------------------------------------------------------------------- /tests/breaks.md: -------------------------------------------------------------------------------- 1 | # Line Breaks Demo 2 | 3 | ## Double Space Method 4 | 5 | This is the first line. 6 | This line has a line break after it. 7 | 8 | This is another paragraph. 
9 | And this line has a line break too. 10 | 11 | ## `
    ` Tag Method 12 | 13 | This line will be followed by a line break.
    14 | And this line will be on the next line. 15 | 16 | You can also use the `
    ` tag without closing it:
    17 | This will continue on the same line, but with a space after. 18 | 19 | -------------------------------------------------------------------------------- /tests/code.md: -------------------------------------------------------------------------------- 1 | # Code Example Markdown 2 | 3 | ## Python Code 4 | 5 | You can include Python code blocks like this: 6 | 7 | ```python 8 | def factorial(n): 9 | if n == 0: 10 | return 1 11 | else: 12 | return n * factorial(n - 1) 13 | 14 | result = factorial(5) 15 | print("Factorial of 5:", result) 16 | ``` 17 | 18 | ## JavaScript Code 19 | 20 | JavaScript code can be included like this: 21 | 22 | ```javascript 23 | function fibonacci(n) { 24 | if (n <= 1) { 25 | return n; 26 | } else { 27 | return fibonacci(n - 1) + fibonacci(n - 2); 28 | } 29 | } 30 | 31 | const fibResult = fibonacci(6); 32 | console.log(`Fibonacci of 6: ${fibResult}`); 33 | ``` 34 | 35 | ## Inline Code 36 | 37 | You can also include inline code using backticks. For example, `print("Hello, World!")` is a simple Python print statement. 38 | 39 | ## Syntax Highlighting 40 | 41 | Markdown supports syntax highlighting for various programming languages, making your code more readable. For instance, you can specify the language after the triple backticks: 42 | 43 | ```java 44 | public class HelloWorld { 45 | public static void main(String[] args) { 46 | System.out.println("Hello, World!"); 47 | } 48 | } 49 | ``` 50 | 51 | Enjoy using code snippets in your Markdown files! 52 | -------------------------------------------------------------------------------- /tests/comment.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | HTML Comment Example 5 | 6 | 7 |

    Welcome to My Website

    8 | 9 |

    This is some content on my page.

    10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /tests/escaping.md: -------------------------------------------------------------------------------- 1 | # Escaping Special Symbols Demo 2 | 3 | This is a demonstration of how special symbols like `*` are escaped in Markdown. 4 | 5 | ## Asterisks 6 | 7 | To display an asterisk (\*) without triggering Markdown formatting, you can use a backslash: `\*`. 8 | 9 | ## Code Blocks 10 | 11 | You can also display code blocks inline using backticks (\`). For example, `var x = 5;`. 12 | 13 | ## Backslashes 14 | 15 | To display a backslash (\\) itself, you need to escape it with another backslash: \\\\. 16 | -------------------------------------------------------------------------------- /tests/formating.md: -------------------------------------------------------------------------------- 1 | # Formatting Demo 2 | 3 | This is a demonstration of various formatting options available in Markdown. 4 | 5 | ## Bold 6 | 7 | **This text is bold.** 8 | 9 | ## Italic 10 | 11 | *This text is italic.* 12 | 13 | ## Strikethrough 14 | 15 | ~~This text is strikethrough.~~ 16 | 17 | ## Underline 18 | 19 | This text is underlined using HTML inline styling. 20 | 21 | ## Combination 22 | 23 | You can also combine formatting options. For example, ***this text is bold and italic.*** 24 | 25 | ## Nested Formatting 26 | 27 | You can also nest formatting, such as combining **bold and *italic*** or ***bold and italic together.*** 28 | 29 | -------------------------------------------------------------------------------- /tests/links.md: -------------------------------------------------------------------------------- 1 | # Markdown Links and Images 2 | 3 | ## Regular Links 4 | 5 | - [Google](https://www.google.com) 6 | - [OpenAI](https://www.openai.com) 7 | - [GitHub](https://www.github.com) 8 | 9 | ## Inline Links 10 | 11 | Check out this [awesome website](https://www.example.com)! 
12 | 13 | ## Link with 14 | 15 | [gaminginlinux](gamingonlinux.com "Gaming rocks!") 16 | 17 | ## Reference Links 18 | 19 | - [Markdown Syntax][markdown] 20 | - [Markdown Cheatsheet][cheatsheet] 21 | 22 | [markdown]: https://www.markdownguide.org/basic-syntax/ 23 | [cheatsheet]: https://www.markdownguide.org/cheat-sheet/ 24 | 25 | ## Images 26 | 27 | ![Nature](https://www.example.com/images/nature.jpg) 28 | ![Space](https://www.example.com/images/space.jpg) 29 | 30 | ## Images with Alt Text 31 | 32 | ![Mountains](https://www.example.com/images/mountains.jpg "Beautiful Mountains") 33 | ![Beach](https://www.example.com/images/beach.jpg "Sunny Beach") 34 | 35 | ## Images with Links 36 | 37 | [![Sunset](https://www.example.com/images/sunset.jpg)](https://www.example.com) 38 | 39 | ## Images with References 40 | 41 | [![Forest][forest-image]][forest-link] 42 | 43 | [forest-image]: https://www.example.com/images/forest.jpg 44 | [forest-link]: https://www.example.com/nature/forest 45 | 46 | -------------------------------------------------------------------------------- /tests/lists.md: -------------------------------------------------------------------------------- 1 | - foo 2 | - - bar 3 | 4 | 1. foo 5 | 2. 
bar 6 | 7 | - list entry with 8 | break 9 | - - Another 10 | break 11 | foo 12 | bar 13 | - Hello World 14 | -------------------------------------------------------------------------------- /tests/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "html2md.h" 11 | #include "md4c-html.h" 12 | #include "table.h" 13 | 14 | using std::cerr; 15 | using std::cout; 16 | using std::ifstream; 17 | using std::string; 18 | using std::stringstream; 19 | using std::vector; 20 | using std::chrono::duration; 21 | using std::chrono::high_resolution_clock; 22 | using std::chrono::milliseconds; 23 | namespace fs = std::filesystem; 24 | 25 | namespace markdown { 26 | void captureHtmlFragment(const MD_CHAR *data, const MD_SIZE data_size, 27 | void *userData) { 28 | auto *str = static_cast(userData); 29 | 30 | str->write(data, data_size); 31 | } 32 | 33 | string toHTML(const string &md) { 34 | stringstream html; 35 | 36 | static MD_TOC_OPTIONS options; 37 | 38 | md_html(md.c_str(), md.size(), &captureHtmlFragment, &html, MD_DIALECT_GITHUB, 39 | MD_HTML_FLAG_SKIP_UTF8_BOM, &options); 40 | 41 | return html.str(); 42 | }; 43 | 44 | string fromHTML(string &html) { 45 | static html2md::Options options; 46 | options.splitLines = false; 47 | 48 | html2md::Converter c(html, &options); 49 | return c.convert(); 50 | } 51 | } // namespace markdown 52 | 53 | namespace file { 54 | string readAll(const string &name) { 55 | ifstream in(name); 56 | stringstream buffer; 57 | buffer << in.rdbuf(); 58 | return buffer.str(); 59 | }; 60 | } // namespace file 61 | 62 | // Log the error 63 | void log(const string &file, const string &origMd, const string &generatedMd) { 64 | cerr << "Task " << fs::path(file).filename() << " failed:\nOriginal Md:\n" 65 | << origMd << "\nGenerated Markdown:\n" 66 | << generatedMd << '\n'; 67 | } 68 | 69 | // Print 
"Running " + filename 70 | void running(const string &file) { 71 | cout << "Running test " << fs::path(file).filename() << "...\t"; 72 | } 73 | 74 | // Print "Passed!" in green 75 | void passed() { cout << "\x1B[32mPassed!\033[0m\n"; } 76 | 77 | // Print "Failed!" in red 78 | void error() { cout << "\x1B[31mFailed!\033[0m\n"; } 79 | 80 | void runTest(const string &file, short *errorCount) { 81 | // Read the markdown file 82 | const string md = file::readAll(file); 83 | 84 | running(file); 85 | 86 | // Convert the Md to HTML 87 | string html = markdown::toHTML(md); 88 | 89 | // Generate Md from the HTML 90 | string convertedMd = markdown::fromHTML(html); 91 | 92 | // Convert it back to HTML 93 | string testHTML = markdown::toHTML(convertedMd); 94 | 95 | // Compare original and result HTML 96 | if (html == testHTML) 97 | passed(); 98 | else { 99 | error(); 100 | log(file, md, convertedMd); 101 | ++*errorCount; 102 | } 103 | } 104 | 105 | void testOption(const char *name) { 106 | cout << "Test option \"" << name << "\"...\t"; 107 | } 108 | 109 | bool testUnorderedList() { 110 | testOption("unorderedList"); 111 | 112 | string html = "
    • List
    "; 113 | 114 | html2md::Options o; 115 | o.unorderedList = '*'; 116 | 117 | html2md::Converter c(html, &o); 118 | 119 | auto md = c.convert(); 120 | 121 | return md.find("* List\n") != string::npos; 122 | } 123 | 124 | bool testOrderedList() { 125 | testOption("orderedList"); 126 | 127 | string html = "
    1. List
    "; 128 | 129 | html2md::Options o; 130 | o.orderedList = ')'; 131 | 132 | html2md::Converter c(html, &o); 133 | 134 | auto md = c.convert(); 135 | 136 | return md.find("1) List\n") != string::npos; 137 | } 138 | 139 | bool testDisableTitle() { 140 | testOption("includeTitle"); 141 | 142 | string html = "HTML title"; 143 | 144 | html2md::Options o; 145 | o.includeTitle = false; 146 | 147 | html2md::Converter c(html, &o); 148 | 149 | auto md = c.convert(); 150 | 151 | return md.empty() && 152 | html2md::Convert(html).find("HTML title") != string::npos; 153 | } 154 | 155 | bool testFormatTable() { 156 | testOption("formatTable"); 157 | 158 | constexpr const char *inputTable = "| 1 | 2 | 3 |\n" 159 | "| :-- | :-: | --: |\n" 160 | "| Hello | World | ! |\n" 161 | "| foo | bar | buzz |\n"; 162 | 163 | constexpr const char *expectedOutput = "| 1 | 2 | 3 |\n" 164 | "|:------|:-----:|-----:|\n" 165 | "| Hello | World | ! |\n" 166 | "| foo | bar | buzz |\n"; 167 | 168 | string formattedTable = formatMarkdownTable(inputTable); 169 | 170 | return formattedTable == expectedOutput; 171 | } 172 | 173 | int main(int argc, const char **argv) { 174 | // List to store all markdown files in this dir 175 | vector files; 176 | 177 | static vector markdownExtensions = {".md", ".markdown", ".mkd"}; 178 | 179 | // Find the files 180 | for (const auto &p : fs::recursive_directory_iterator(DIR)) { 181 | if (std::find(markdownExtensions.begin(), markdownExtensions.end(), 182 | p.path().extension()) != markdownExtensions.end() && 183 | p.path().parent_path() == DIR) 184 | files.emplace_back(p.path().string()); 185 | } 186 | 187 | // Test files passed as argument 188 | for (int i = 1; i < argc; i++) { 189 | // Check if the argument is a valid file path and ends with ".md" 190 | string file = argv[i]; 191 | if (fs::is_regular_file(file) && file.find(".md") == file.size() - 3) { 192 | files.emplace_back(file); 193 | } 194 | } 195 | 196 | // Sort file names 197 | sort(files.begin(), 
files.end()); 198 | 199 | // File name 200 | const char *errorFileName = DIR "/error.log"; 201 | 202 | // Redirect errors to error.log 203 | FILE *errorFile = freopen(errorFileName, "w", stderr); 204 | if (!errorFile) 205 | cerr << "Failed to open " << errorFileName 206 | << " for whatever reason!\n" 207 | "Errors will be printed to the terminal instead of written to the " 208 | "mentioned file above."; 209 | 210 | // For measuring time. 211 | auto t1 = high_resolution_clock::now(); 212 | 213 | // Count the errors 214 | short errorCount = 0; 215 | 216 | // Run the tests 217 | for (auto &file : files) 218 | runTest(file, &errorCount); 219 | 220 | // Test the options 221 | auto tests = {&testDisableTitle, &testUnorderedList, &testOrderedList, 222 | &testFormatTable}; 223 | 224 | for (const auto &test : tests) 225 | if (!test()) { 226 | ++errorCount; 227 | error(); 228 | } else 229 | passed(); 230 | 231 | auto t2 = high_resolution_clock::now(); 232 | 233 | /* Getting number of milliseconds as a double. */ 234 | duration ms_double = t2 - t1; 235 | 236 | cout << files.size() + tests.size() << " tests executed in " 237 | << ms_double.count() << "ms. " << errorCount << " failed.\n"; 238 | 239 | return 0; 240 | } 241 | -------------------------------------------------------------------------------- /tests/tables.md: -------------------------------------------------------------------------------- 1 | Simple table: 2 | 3 | | foo | 1 | 4 | |-----|---| 5 | | 1 | 3 | 6 | | bar | 5 | 7 | 8 | 9 | Table with alignment: 10 | 11 | | Syntax | Description | Test Text | 12 | | :-------- | :---------: | ----------: | 13 | | Header | Title | Here's this | 14 | | Paragraph | Text | And more | 15 | 16 | Table with line breaks: 17 | 18 | | From | To | 19 | |-------------- |----------------------------------------------- | 20 | | **Plain** | C-string
    Sorted
    MD5
    SHA256
    SHA512 | 21 | | **Markdown** | HTML
    Plain | 22 | | **HTML** | Markdown
    Plain | 23 | | **C-string** | Plain | 24 | 25 | Table with code: 26 | 27 | | table | 28 | |:-------:| 29 | | `code` | 30 | | no code | 31 | -------------------------------------------------------------------------------- /tests/test_advanced.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyhtml2md 3 | 4 | def test_basic_conversion(): 5 | # Test basic header conversion 6 | assert pyhtml2md.convert("

    Hello Python!

    ") == "# Hello Python!\n" 7 | 8 | # Test basic paragraph 9 | assert pyhtml2md.convert("

    Simple paragraph

    ") == "Simple paragraph\n" 10 | 11 | def test_converter_class(): 12 | # Test converter initialization and conversion 13 | converter = pyhtml2md.Converter("

    Hello Python!

    ") 14 | assert converter.convert() == "# Hello Python!\n" 15 | assert converter.ok() == True 16 | 17 | # Test boolean operator 18 | assert bool(converter) == True 19 | 20 | def test_options(): 21 | # Test options configuration 22 | options = pyhtml2md.Options() 23 | options.splitLines = False 24 | options.unorderedList = '*' 25 | options.orderedList = ')' 26 | options.includeTitle = False 27 | 28 | html = "
    • First
    • Second
    " 29 | converter = pyhtml2md.Converter(html, options) 30 | result = converter.convert() 31 | assert result.startswith('* First') 32 | assert converter.ok() 33 | 34 | def test_complex_formatting(): 35 | html = """ 36 |

    Main Title

    37 |

    Bold text and italic text

    38 |
      39 |
    • First item
    • 40 |
    • Second item
    • 41 |
    42 |
      43 |
    1. Numbered one
    2. 44 |
    3. Numbered two
    4. 45 |
    46 | """ 47 | options = pyhtml2md.Options() 48 | options.splitLines = False 49 | converter = pyhtml2md.Converter(html, options) 50 | result = converter.convert() 51 | 52 | assert "# Main Title" in result 53 | assert "**Bold text**" in result 54 | assert "*italic text*" in result 55 | assert "1. Numbered one" in result 56 | assert "2. Numbered two" in result 57 | 58 | def test_line_breaks(): 59 | # Test br outside paragraphs 60 | assert "Text \nText2" in pyhtml2md.convert("Text
    Text2") 61 | 62 | # Test br inside paragraphs 63 | assert "Line 1 \nLine 2" in pyhtml2md.convert("

    Line 1
    Line 2

    ") 64 | 65 | # Test br with bullet points in paragraph 66 | assert "Primary Colors: \n• Red \n• Blue \n• Yellow" in pyhtml2md.convert("

    Primary Colors:
    • Red
    • Blue
    • Yellow

    ") 67 | 68 | # Test soft line break settings 69 | html = "A very long line of text that should be wrapped according to the soft break and hard break settings" 70 | options = pyhtml2md.Options() 71 | options.splitLines = True 72 | options.softBreak = 20 73 | options.hardBreak = 30 74 | 75 | converter = pyhtml2md.Converter(html, options) 76 | result = converter.convert() 77 | lines = result.split('\n') 78 | assert any(len(line) <= 30 for line in lines) 79 | 80 | def test_table_formatting(): 81 | html = """ 82 | 83 | 84 | 85 |
    Header 1Header 2
    Data 1Data 2
    86 | """ 87 | options = pyhtml2md.Options() 88 | options.formatTable = True 89 | converter = pyhtml2md.Converter(html, options) 90 | result = converter.convert() 91 | 92 | assert "|" in result 93 | assert "Data 1" in result 94 | 95 | def test_error_handling(): 96 | # Test with malformed HTML 97 | html = "

    Unclosed paragraph" 98 | converter = pyhtml2md.Converter(html) 99 | converter.convert() 100 | assert not converter.ok() 101 | 102 | def test_options_equality(): 103 | options1 = pyhtml2md.Options() 104 | options2 = pyhtml2md.Options() 105 | 106 | assert options1 == options2 107 | 108 | options2.splitLines = False 109 | assert options1 != options2 110 | 111 | def test_special_characters(): 112 | html = "

    <special> & "characters"

    " 113 | result = pyhtml2md.convert(html) 114 | assert "" in result 115 | assert '"characters"' in result 116 | assert "&" in result 117 | 118 | def test_html_entities(): 119 | html = """ 120 |

    "Double quotes" <less than> >greater than< & ampersand   non-breaking space → right arrow

    121 | """ 122 | result = pyhtml2md.convert(html) 123 | assert '"Double quotes"' in result 124 | assert "" in result 125 | assert ">greater than<" in result 126 | assert "& ampersand" in result 127 | assert " non-breaking space" in result 128 | assert "→ right arrow" in result 129 | 130 | def test_nested_structures(): 131 | html = """ 132 |
    133 |

    Quoted text with bold and italic

    134 |
      135 |
    • Nested list
    • 136 |
    137 |
    138 | """ 139 | result = pyhtml2md.convert(html) 140 | assert ">" in result # blockquote marker 141 | assert "**bold**" in result 142 | assert "*italic*" in result 143 | assert "**list**" in result 144 | 145 | if __name__ == "__main__": 146 | pytest.main([__file__]) -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | import pyhtml2md 2 | 3 | def test_main(): 4 | assert pyhtml2md.convert("

    Hello, world!

    ") == "# Hello, world!\n" 5 | 6 | --------------------------------------------------------------------------------