├── MarkItDown.Tests ├── TestData │ ├── Sample.txt │ ├── Sample.pdf │ └── Sample.html ├── Converters │ ├── PlainTextConverterTests.cs │ ├── PDFConverterTests.cs │ ├── URLConverterTests.cs │ └── HtmlConverterTests.cs ├── BaseConverterTests.cs └── MarkItDownSharp.Tests.csproj ├── MarkItDown ├── FodyWeavers.xml ├── Exceptions │ ├── ConversionException.cs │ └── UnsupportedFormatException.cs ├── Models │ ├── DocumentConverterResult.cs │ └── ConversionOptions.cs ├── MarkItDownSharp.nuspec ├── Helpers │ ├── UrlHelper.cs │ ├── PathHelper.cs │ └── CustomMarkdownConverter.cs ├── README.md ├── .gitignore ├── Properties │ └── AssemblyInfo.cs ├── Converters │ ├── PlainTextConverter.cs │ ├── DocumentConverter.cs │ ├── HtmlConverter.cs │ ├── WikipediaConverter.cs │ ├── XlsxConverter.cs │ ├── PDFConverter.cs │ ├── MediaConverters.cs │ ├── ZipConverter.cs │ ├── URLConverter.cs │ ├── DocxConverter.cs │ ├── YouTubeConverter.cs │ ├── BingSerpConverter.cs │ ├── PptxConverter.cs │ └── ConfluenceConverter.cs ├── MarkItDownSharp.cs └── MarkItDownSharp.csproj ├── MarkItDownDemo ├── ILLink │ └── ILLink.Descriptors.LibraryBuild.xml ├── Properties │ └── AssemblyInfo.cs ├── Program.cs ├── App.config ├── packages.config └── MarkItDownSharpDemo.csproj ├── README.md ├── MarkItDownSharp.sln ├── .gitattributes └── .gitignore /MarkItDown.Tests/TestData/Sample.txt: -------------------------------------------------------------------------------- 1 | This is a plain text sample file. 
-------------------------------------------------------------------------------- /MarkItDown.Tests/TestData/Sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kelter-antunes/MarkItDownSharp/HEAD/MarkItDown.Tests/TestData/Sample.pdf -------------------------------------------------------------------------------- /MarkItDown/FodyWeavers.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /MarkItDown.Tests/TestData/Sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Sample HTML Document 5 | 6 | 7 |

Welcome to MarkItDown

8 |

This is a sample paragraph to be converted to Markdown.

9 | 13 | 14 | -------------------------------------------------------------------------------- /MarkItDownDemo/ILLink/ILLink.Descriptors.LibraryBuild.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /MarkItDown/Exceptions/ConversionException.cs: -------------------------------------------------------------------------------- 1 | // Exceptions/ConversionException.cs 2 | 3 | using System; 4 | 5 | namespace MarkItDownSharp.Exceptions 6 | { 7 | public class ConversionException : Exception 8 | { 9 | public ConversionException() 10 | { 11 | } 12 | 13 | public ConversionException(string message) : base(message) 14 | { 15 | } 16 | 17 | public ConversionException(string message, Exception inner) : base(message, inner) 18 | { 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /MarkItDown/Exceptions/UnsupportedFormatException.cs: -------------------------------------------------------------------------------- 1 | // Exceptions/UnsupportedFormatException.cs 2 | 3 | using System; 4 | 5 | namespace MarkItDownSharp.Exceptions 6 | { 7 | public class UnsupportedFormatException : Exception 8 | { 9 | public UnsupportedFormatException() 10 | { 11 | } 12 | 13 | public UnsupportedFormatException(string message) : base(message) 14 | { 15 | } 16 | 17 | public UnsupportedFormatException(string message, Exception inner) : base(message, inner) 18 | { 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /MarkItDown/Models/DocumentConverterResult.cs: -------------------------------------------------------------------------------- 1 | // Models/DocumentConverterResult.cs 2 | namespace MarkItDownSharp.Models 3 | { 4 | public class DocumentConverterResult 5 | { 6 | public DocumentConverterResult(string title = null, string textContent = "") 7 
| { 8 | Title = title; 9 | TextContent = textContent; 10 | } 11 | 12 | public string Title { get; set; } 13 | public string TextContent { get; set; } 14 | public System.Collections.Generic.Dictionary MetaData { get; set; } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /MarkItDown/MarkItDownSharp.nuspec: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | $id$ 5 | $version$ 6 | $title$ 7 | Miguel 'Kelter' Antunes 8 | false 9 | MIT 10 | 11 | https://github.com/kelter-antunes/MarkItDownSharp 12 | $description$ 13 | Initial release of the package. 14 | $copyright$ 15 | markdown llm 16 | 17 | -------------------------------------------------------------------------------- /MarkItDown/Helpers/UrlHelper.cs: -------------------------------------------------------------------------------- 1 | // Helpers/UrlHelper.cs 2 | 3 | using System; 4 | 5 | namespace MarkItDownSharp.Helpers 6 | { 7 | public static class UrlHelper 8 | { 9 | /// 10 | /// Determines if the given input string is a valid HTTP or HTTPS URL. 11 | /// 12 | /// The input string to validate. 13 | /// True if the input is a URL; otherwise, false. 
14 | public static bool IsValidUrl(string input) 15 | { 16 | return Uri.TryCreate(input, UriKind.Absolute, out var uriResult) && 17 | (uriResult.Scheme == Uri.UriSchemeHttp || uriResult.Scheme == Uri.UriSchemeHttps); 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /MarkItDown.Tests/Converters/PlainTextConverterTests.cs: -------------------------------------------------------------------------------- 1 | // MarkItDown.Tests/Converters/PlainTextConverterTests.cs 2 | 3 | namespace MarkItDownSharp.Tests.Converters; 4 | 5 | public class PlainTextConverterTests : BaseConverterTests 6 | { 7 | [Fact] 8 | public async Task Convert_PlainTextFile_ShouldReturnCorrectContent() 9 | { 10 | // Arrange 11 | var fileName = "Sample.txt"; 12 | 13 | // Act 14 | var result = await ConvertAsync(fileName); 15 | 16 | // Assert 17 | Assert.NotNull(result); 18 | Assert.Null(result.Title); // As per PlainTextConverter implementation 19 | var expectedContent = File.ReadAllText(Path.Combine(TestDataPath, fileName)); 20 | Assert.Equal(expectedContent, result.TextContent); 21 | } 22 | } -------------------------------------------------------------------------------- /MarkItDown.Tests/Converters/PDFConverterTests.cs: -------------------------------------------------------------------------------- 1 | // MarkItDown.Tests/Converters/PDFConverterTests.cs 2 | 3 | namespace MarkItDownSharp.Tests.Converters; 4 | 5 | public class PdfConverterTests : BaseConverterTests 6 | { 7 | [Fact] 8 | public async Task Convert_PDFFile_ShouldReturnExpectedMarkdown() 9 | { 10 | // Arrange 11 | var fileName = "Sample.pdf"; 12 | var expectedTitle = "Lorem ipsum"; 13 | var expectedContent = "Lorem ipsum\r\nLorem ipsum dolor sit amet, consectetur adipiscing\r\nelit."; 14 | 15 | // Act 16 | var result = await ConvertAsync(fileName); 17 | 18 | // Assert 19 | Assert.NotNull(result); 20 | Assert.Equal(expectedTitle, result.Title); 21 | Assert.Contains(expectedContent, 
result.TextContent); 22 | } 23 | } -------------------------------------------------------------------------------- /MarkItDown.Tests/Converters/URLConverterTests.cs: -------------------------------------------------------------------------------- 1 | // MarkItDown.Tests/Converters/URLConverterTests.cs 2 | 3 | namespace MarkItDownSharp.Tests.Converters; 4 | 5 | public class URLConverterTests : BaseConverterTests 6 | { 7 | [Fact] 8 | public async Task Convert_URLFile_ShouldReturnExpectedMarkdown() 9 | { 10 | // Arrange 11 | var url = "https://example.com"; 12 | var expectedTitle = "Example Domain"; 13 | var expectedContent = 14 | "This domain is for use in illustrative examples in documents."; 15 | 16 | // Act 17 | var result = await ConvertAsync(url); 18 | 19 | // Assert 20 | Assert.NotNull(result); 21 | Assert.Equal(expectedTitle, result.Title); 22 | Assert.Contains(expectedContent, result.TextContent); 23 | } 24 | } -------------------------------------------------------------------------------- /MarkItDown.Tests/Converters/HtmlConverterTests.cs: -------------------------------------------------------------------------------- 1 | // MarkItDown.Tests/Converters/HtmlConverterTests.cs 2 | 3 | namespace MarkItDownSharp.Tests.Converters; 4 | 5 | public class HtmlConverterTests : BaseConverterTests 6 | { 7 | [Fact] 8 | public async Task Convert_HtmlFile_ShouldReturnExpectedMarkdown() 9 | { 10 | // Arrange 11 | var fileName = "Sample.html"; 12 | var expectedTitle = "Sample HTML Document"; 13 | var expectedContent = 14 | "# Welcome to MarkItDown\r\n\r\nThis is a sample paragraph to be converted to Markdown.\r\n\r\n- First item\r\n- Second item"; 15 | 16 | // Act 17 | var result = await ConvertAsync(fileName); 18 | 19 | // Assert 20 | Assert.NotNull(result); 21 | Assert.Equal(expectedTitle, result.Title); 22 | Assert.Equal(expectedContent, result.TextContent); 23 | } 24 | } -------------------------------------------------------------------------------- 
/MarkItDown.Tests/BaseConverterTests.cs: -------------------------------------------------------------------------------- 1 | // MarkItDown.Tests/BaseConverterTests.cs 2 | 3 | using MarkItDownSharp.Models; 4 | 5 | namespace MarkItDownSharp.Tests; 6 | 7 | public abstract class BaseConverterTests 8 | { 9 | protected readonly MarkItDownConverter Converter; 10 | protected readonly string TestDataPath; 11 | 12 | public BaseConverterTests() 13 | { 14 | Converter = new MarkItDownConverter(); 15 | TestDataPath = Path.Combine(Directory.GetCurrentDirectory(), "TestData"); 16 | } 17 | 18 | protected async Task ConvertAsync(string relativePathOrUrl) 19 | { 20 | var fullPathOrUrl = Path.Combine(TestDataPath, relativePathOrUrl); 21 | if (File.Exists(fullPathOrUrl)) return await Converter.ConvertLocalAsync(fullPathOrUrl); 22 | 23 | // Assume it's a URL 24 | return await Converter.ConvertLocalAsync(relativePathOrUrl); 25 | } 26 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MarkItDownSharp 2 | 3 | 4 | `MarkItDownSharp` is a C# .Net Framework library designed for converting various file formats to Markdown. This makes it suitable for indexing, text analysis, and other applications that benefit from structured text. 
It is a C# .NET Framework implementation (more or less) of the original `markitdown` [Python library](https://github.com/microsoft/markitdown). 5 | 6 | It supports: 7 | 8 | - [x] PDF 9 | - [x] Word (.docx) 10 | - [x] Excel (.xlsx) 11 | - [ ] Images (EXIF metadata extraction and optional LLM-based description) 12 | - [x] Audio (EXIF metadata extraction only) 13 | - [x] HTML 14 | - [x] Text-based formats (plain text, .csv, .xml, .rss, .atom) 15 | - [x] Jupyter Notebooks (.ipynb) 16 | - [x] Bing Search Result Pages (SERP) 17 | - [x] ZIP files (recursively iterates over contents) 18 | - [x] PowerPoint (.pptx) 19 | - [x] Confluence (spaces and single pages) 20 | 21 | 22 | > [!NOTE] 23 | > 24 | > Speech recognition for the audio converter has not been implemented yet. I'm happy to accept contributions for this feature. 25 | -------------------------------------------------------------------------------- /MarkItDown/README.md: -------------------------------------------------------------------------------- 1 | # MarkItDownSharp 2 | 3 | 4 | `MarkItDownSharp` is a C# .NET Framework library designed for converting various file formats to Markdown. This makes it suitable for indexing, text analysis, and other applications that benefit from structured text. 
It is a C# .NET Framework implementation (more or less) of the original `markitdown` [Python library](https://github.com/microsoft/markitdown). 5 | 6 | It supports: 7 | 8 | - [x] PDF 9 | - [x] Word (.docx) 10 | - [x] Excel (.xlsx) 11 | - [ ] Images (EXIF metadata extraction and optional LLM-based description) 12 | - [x] Audio (EXIF metadata extraction only) 13 | - [x] HTML 14 | - [x] Text-based formats (plain text, .csv, .xml, .rss, .atom) 15 | - [x] Jupyter Notebooks (.ipynb) 16 | - [x] Bing Search Result Pages (SERP) 17 | - [x] ZIP files (recursively iterates over contents) 18 | - [x] PowerPoint (.pptx) 19 | - [x] Confluence (spaces and single pages) 20 | 21 | 22 | > [!NOTE] 23 | > 24 | > Speech recognition for the audio converter has not been implemented yet. I'm happy to accept contributions for this feature. -------------------------------------------------------------------------------- /MarkItDown/.gitignore: -------------------------------------------------------------------------------- 1 | # Build Results 2 | bin/ 3 | obj/ 4 | *.exe 5 | *.dll 6 | *.pdb 7 | 8 | # User-specific files 9 | *.user 10 | *.vspscc 11 | *.vssscc 12 | 13 | # Mono Auto Generated Files 14 | mono_crash.* 15 | 16 | # Windows-specific files 17 | Thumbs.db 18 | ehthumbs.db 19 | Desktop.ini 20 | $RECYCLE.BIN/ 21 | 22 | # Logs and database files 23 | *.log 24 | *.ldf 25 | *.mdf 26 | 27 | # Visual Studio files 28 | .vs/ 29 | *.suo 30 | *.userosscache 31 | *.sln.docstates 32 | *.userprefs 33 | 34 | # Rider files 35 | .idea/ 36 | *.sln.iml 37 | 38 | # NuGet Packages 39 | *.nupkg 40 | packages/ 41 | 42 | # Resharper files 43 | _ReSharper*/ 44 | *.DotSettings.user 45 | 46 | # Temporary files 47 | *.tmp 48 | *.temp 49 | ~$* 50 | 51 | # Test results 52 | TestResults/ 53 | 54 | # Web files 55 | *.publishsettings 56 | *.pubxml 57 | *.publishproj 58 | 59 | # JetBrains Rider 60 | .idea/ 61 | 62 | # Compiled Python caches 63 | *.pyc 64 | 65 | # Exclude node_modules if your library uses npm for front-end 
dependencies 66 | node_modules/ 67 | 68 | # Deployment files 69 | *.deploy 70 | *.publish 71 | 72 | # Backup files 73 | *.bak 74 | *.orig 75 | *.old 76 | 77 | # Config files containing sensitive data 78 | *.config 79 | appsettings.*.json 80 | 81 | # Exclude autogenerated files 82 | *.g.* 83 | *.designer.cs 84 | -------------------------------------------------------------------------------- /MarkItDownDemo/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.InteropServices; 3 | 4 | // General Information about an assembly is controlled through the following 5 | // set of attributes. Change these attribute values to modify the information 6 | // associated with an assembly. 7 | [assembly: AssemblyTitle("MarkItDownDemo")] 8 | [assembly: AssemblyDescription("")] 9 | [assembly: AssemblyConfiguration("")] 10 | [assembly: AssemblyCompany("")] 11 | [assembly: AssemblyProduct("MarkItDownDemo")] 12 | [assembly: AssemblyCopyright("Copyright © 2025")] 13 | [assembly: AssemblyTrademark("")] 14 | [assembly: AssemblyCulture("")] 15 | 16 | // Setting ComVisible to false makes the types in this assembly not visible 17 | // to COM components. If you need to access a type in this assembly from 18 | // COM, set the ComVisible attribute to true on that type. 
19 | [assembly: ComVisible(false)] 20 | 21 | // The following GUID is for the ID of the typelib if this project is exposed to COM 22 | [assembly: Guid("161d5d72-7aad-4aa4-a83c-8afc444aa9e5")] 23 | 24 | // Version information for an assembly consists of the following four values: 25 | // 26 | // Major Version 27 | // Minor Version 28 | // Build Number 29 | // Revision 30 | // 31 | [assembly: AssemblyVersion("1.0.0.0")] 32 | [assembly: AssemblyFileVersion("1.0.0.0")] -------------------------------------------------------------------------------- /MarkItDown/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.InteropServices; 3 | 4 | // General Information about an assembly is controlled through the following 5 | // set of attributes. Change these attribute values to modify the information 6 | // associated with an assembly. 7 | [assembly: AssemblyTitle("MarkItDownSharp")] 8 | [assembly: AssemblyDescription("It is a WIP C# .Net Framework implementation of the original markitdown Python library.")] 9 | [assembly: AssemblyConfiguration("")] 10 | [assembly: AssemblyCompany("")] 11 | [assembly: AssemblyProduct("MarkItDownSharp")] 12 | [assembly: AssemblyCopyright("Copyright © 2025")] 13 | [assembly: AssemblyTrademark("")] 14 | [assembly: AssemblyCulture("")] 15 | 16 | // Setting ComVisible to false makes the types in this assembly not visible 17 | // to COM components. If you need to access a type in this assembly from 18 | // COM, set the ComVisible attribute to true on that type. 
19 | [assembly: ComVisible(false)] 20 | 21 | // The following GUID is for the ID of the typelib if this project is exposed to COM 22 | [assembly: Guid("beb01153-40fa-446d-8859-da5b89772c30")] 23 | 24 | // Version information for an assembly consists of the following four values: 25 | // 26 | // Major Version 27 | // Minor Version 28 | // Build Number 29 | // Revision 30 | // 31 | [assembly: AssemblyVersion("1.0.4.0")] 32 | [assembly: AssemblyFileVersion("1.0.4.0")] -------------------------------------------------------------------------------- /MarkItDown/Converters/PlainTextConverter.cs: -------------------------------------------------------------------------------- 1 | // Converters/PlainTextConverter.cs 2 | 3 | using System; 4 | using System.Collections.Generic; 5 | using System.IO; 6 | using System.Threading.Tasks; 7 | using MarkItDownSharp.Models; 8 | 9 | namespace MarkItDownSharp.Converters 10 | { 11 | public class PlainTextConverter : DocumentConverter 12 | { 13 | public override bool CanConvertUrl(string url) 14 | { 15 | return false; 16 | } 17 | 18 | public override bool CanConvertFile(string extension) 19 | { 20 | return extension.Equals(".txt", StringComparison.OrdinalIgnoreCase); 21 | } 22 | 23 | public override async Task ConvertAsync(string localPath, ConversionOptions options) 24 | { 25 | if (!CanConvertFile(options.FileExtension)) return null; 26 | 27 | var textContent = await Task.Run(() => File.ReadAllText(localPath)); 28 | return new DocumentConverterResult 29 | { 30 | Title = null, 31 | TextContent = textContent 32 | }; 33 | } 34 | 35 | public override async Task> ConvertToListAsync(string pathOrUrl, ConversionOptions options) 36 | { 37 | var result = await ConvertAsync(pathOrUrl, options); 38 | return result != null ? 
new List { result } : new List(); 39 | } 40 | } 41 | } -------------------------------------------------------------------------------- /MarkItDown/Models/ConversionOptions.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Net.Http; 3 | using MarkItDownSharp.Converters; 4 | 5 | namespace MarkItDownSharp.Models 6 | { 7 | public class ConversionOptions 8 | { 9 | public string FileExtension { get; set; } 10 | public string Url { get; set; } 11 | public HttpClient HttpClient { get; set; } 12 | public string LlmClient { get; set; } 13 | public string LlmModel { get; set; } 14 | public string StyleMap { get; set; } 15 | 16 | public List ParentConverters { get; set; } = new List(); 17 | public bool CleanupExtracted { get; set; } = true; 18 | 19 | // Existing properties to support Confluence conversion. 20 | public string ConfluenceBaseUrl { get; set; } 21 | public string ConfluenceUsername { get; set; } 22 | public string ConfluenceApiToken { get; set; } 23 | 24 | // New properties for Confluence conversion customization. 25 | // ConfluencePageLimit specifies the number of pages retrieved per API call for space conversions. 26 | public int ConfluencePageLimit { get; set; } = 50; 27 | // ConfluenceMaxPages limits the total number of pages to process (set to 0 for unlimited). 28 | public int ConfluenceMaxPages { get; set; } = 0; 29 | // ConfluenceExpand specifies the body expansion parameter to be used (for example "body.view" or "body.storage"). 
30 | public string ConfluenceExpand { get; set; } = "body.view"; 31 | } 32 | } -------------------------------------------------------------------------------- /MarkItDown.Tests/MarkItDownSharp.Tests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | 8 | false 9 | true 10 | 11 | 12 | 13 | 14 | all 15 | runtime; build; native; contentfiles; analyzers; buildtransitive 16 | 17 | 18 | 19 | 20 | all 21 | runtime; build; native; contentfiles; analyzers; buildtransitive 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | PreserveNewest 40 | 41 | 42 | PreserveNewest 43 | 44 | 45 | PreserveNewest 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /MarkItDown/Helpers/PathHelper.cs: -------------------------------------------------------------------------------- 1 | // Helpers/PathHelper.cs 2 | 3 | using System; 4 | using System.IO; 5 | 6 | namespace MarkItDownSharp.Helpers 7 | { 8 | public static class PathHelper 9 | { 10 | /// 11 | /// Computes the relative path from one path to another. 12 | /// 13 | /// The base path. 14 | /// The target path. 15 | /// The relative path from the base path to the target path. 16 | public static string GetRelativePath(string relativeTo, string path) 17 | { 18 | if (string.IsNullOrEmpty(relativeTo)) 19 | throw new ArgumentNullException(nameof(relativeTo)); 20 | if (string.IsNullOrEmpty(path)) 21 | throw new ArgumentNullException(nameof(path)); 22 | 23 | var uri1 = new Uri(AppendDirectorySeparatorChar(relativeTo)); 24 | var uri2 = new Uri(path); 25 | var relativeUri = uri1.MakeRelativeUri(uri2); 26 | var relativePath = Uri.UnescapeDataString(relativeUri.ToString()) 27 | .Replace('/', Path.DirectorySeparatorChar); 28 | return relativePath; 29 | } 30 | 31 | /// 32 | /// Ensures the path ends with a directory separator character. 33 | /// 34 | /// The path to check. 
35 | /// The path ending with a directory separator character. 36 | private static string AppendDirectorySeparatorChar(string path) 37 | { 38 | if (!path.EndsWith(Path.DirectorySeparatorChar.ToString())) 39 | return path + Path.DirectorySeparatorChar; 40 | return path; 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /MarkItDown/Converters/DocumentConverter.cs: -------------------------------------------------------------------------------- 1 | // Converters/DocumentConverter.cs 2 | 3 | using System.Collections.Generic; 4 | using System.Threading.Tasks; 5 | using MarkItDownSharp.Models; 6 | 7 | namespace MarkItDownSharp.Converters 8 | { 9 | public abstract class DocumentConverter 10 | { 11 | /// 12 | /// Converts the document at the given path or URL to Markdown. 13 | /// 14 | /// The local file path or URL of the document. 15 | /// Additional options for conversion. 16 | /// A DocumentConverterResult containing the conversion output. 17 | public abstract Task ConvertAsync(string pathOrUrl, ConversionOptions options); 18 | 19 | /// 20 | /// Converts the document to a list of DocumentConverterResult. 21 | /// There is no default implementation: the method is abstract, so each converter must provide one (typically by calling ConvertAsync and wrapping its result in a list). 22 | /// Converters that support multi‑page documents should return all of their results from this method. 23 | /// 24 | public abstract Task> ConvertToListAsync(string pathOrUrl, ConversionOptions options); 25 | 26 | 27 | /// 28 | /// Determines if the converter can handle the given URL. 29 | /// 30 | /// The URL to check. 31 | /// True if it can handle the URL; otherwise, false. 32 | public virtual bool CanConvertUrl(string url) 33 | { 34 | return false; 35 | } 36 | 37 | /// 38 | /// Determines if the converter can handle the given file extension. 39 | /// 40 | /// The file extension (e.g., ".txt"). 
41 | /// True if it can handle the extension; otherwise, false. 
42 | public virtual bool CanConvertFile(string extension) 43 | { 44 | return false; 45 | } 46 | } 47 | } -------------------------------------------------------------------------------- /MarkItDownSharp.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.13.35617.110 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MarkItDownSharp", "MarkItDown\MarkItDownSharp.csproj", "{BEB01153-40FA-446D-8859-DA5B89772C30}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MarkItDownSharpDemo", "MarkItDownDemo\MarkItDownSharpDemo.csproj", "{161D5D72-7AAD-4AA4-A83C-8AFC444AA9E5}" 9 | EndProject 10 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MarkItDownSharp.Tests", "MarkItDown.Tests\MarkItDownSharp.Tests.csproj", "{C53C49DC-9298-401C-9E0D-A9DB01D17C42}" 11 | EndProject 12 | Global 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|Any CPU = Debug|Any CPU 15 | Release|Any CPU = Release|Any CPU 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {BEB01153-40FA-446D-8859-DA5B89772C30}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 19 | {BEB01153-40FA-446D-8859-DA5B89772C30}.Debug|Any CPU.Build.0 = Debug|Any CPU 20 | {BEB01153-40FA-446D-8859-DA5B89772C30}.Release|Any CPU.ActiveCfg = Release|Any CPU 21 | {BEB01153-40FA-446D-8859-DA5B89772C30}.Release|Any CPU.Build.0 = Release|Any CPU 22 | {161D5D72-7AAD-4AA4-A83C-8AFC444AA9E5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {161D5D72-7AAD-4AA4-A83C-8AFC444AA9E5}.Debug|Any CPU.Build.0 = Debug|Any CPU 24 | {161D5D72-7AAD-4AA4-A83C-8AFC444AA9E5}.Release|Any CPU.ActiveCfg = Release|Any CPU 25 | {161D5D72-7AAD-4AA4-A83C-8AFC444AA9E5}.Release|Any CPU.Build.0 = Release|Any CPU 26 | {C53C49DC-9298-401C-9E0D-A9DB01D17C42}.Debug|Any CPU.ActiveCfg = 
Debug|Any CPU 27 | {C53C49DC-9298-401C-9E0D-A9DB01D17C42}.Debug|Any CPU.Build.0 = Debug|Any CPU 28 | {C53C49DC-9298-401C-9E0D-A9DB01D17C42}.Release|Any CPU.ActiveCfg = Release|Any CPU 29 | {C53C49DC-9298-401C-9E0D-A9DB01D17C42}.Release|Any CPU.Build.0 = Release|Any CPU 30 | EndGlobalSection 31 | GlobalSection(SolutionProperties) = preSolution 32 | HideSolutionNode = FALSE 33 | EndGlobalSection 34 | GlobalSection(ExtensibilityGlobals) = postSolution 35 | SolutionGuid = {94521BCE-27CA-4095-8D0D-673EC8C27C17} 36 | EndGlobalSection 37 | EndGlobal 38 | -------------------------------------------------------------------------------- /MarkItDownDemo/Program.cs: -------------------------------------------------------------------------------- 1 | // Example usage 2 | 3 | using System; 4 | using System.Threading.Tasks; 5 | using MarkItDownSharp; 6 | using MarkItDownSharp.Converters; 7 | using MarkItDownSharp.Models; 8 | 9 | internal class Program 10 | { 11 | private static async Task Main(string[] args) 12 | { 13 | var converter = new MarkItDownConverter(); 14 | 15 | // Confluence page example URL. 16 | // For example, using a space overview URL for multiple pages. 17 | var confluenceUrl = "https://yourcompany/wiki/spaces/SPACENAME/overview"; 18 | 19 | // Set up conversion options. 20 | var options = new ConversionOptions 21 | { 22 | ConfluenceBaseUrl = "https://yourcompany/wiki/", 23 | ConfluenceMaxPages = 500000, 24 | ConfluencePageLimit = 50, 25 | ConfluenceExpand = "body.export_view", 26 | ConfluenceUsername = "YOUR_USERNAME", 27 | ConfluenceApiToken = "YOUR_API_TOKEN" 28 | }; 29 | 30 | try 31 | { 32 | // For Confluence URLs, call ConvertToListAsync to receive a list of DocumentConverterResult items. 
33 | var results = await converter.ConvertToListAsync(confluenceUrl, options); 34 | 35 | foreach (var result in results) 36 | { 37 | Console.WriteLine($"Title: {result.Title}"); 38 | Console.WriteLine("Content:"); 39 | Console.WriteLine(result.TextContent); 40 | if (result.MetaData != null) 41 | { 42 | Console.WriteLine("Metadata:"); 43 | foreach (var kv in result.MetaData) 44 | { 45 | if (kv.Value is System.Collections.IEnumerable && !(kv.Value is string)) 46 | { 47 | Console.WriteLine($" {kv.Key}: {string.Join(", ", (System.Collections.IEnumerable)kv.Value)}"); 48 | } 49 | else 50 | { 51 | Console.WriteLine($" {kv.Key}: {kv.Value}"); 52 | } 53 | } 54 | } 55 | Console.WriteLine(new string('-', 20)); 56 | } 57 | } 58 | catch (Exception ex) 59 | { 60 | Console.WriteLine($"Error during conversion: {ex.Message}"); 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /MarkItDown/Converters/HtmlConverter.cs: -------------------------------------------------------------------------------- 1 | // Converters/HtmlConverter.cs 2 | 3 | using System; 4 | using System.Collections.Generic; 5 | using System.IO; 6 | using System.Text.RegularExpressions; 7 | using System.Threading.Tasks; 8 | using HtmlAgilityPack; 9 | using MarkItDownSharp.Helpers; 10 | using MarkItDownSharp.Models; 11 | 12 | namespace MarkItDownSharp.Converters 13 | { 14 | public class HtmlConverter : DocumentConverter 15 | { 16 | private readonly CustomMarkdownConverter _markdownConverter; 17 | 18 | public HtmlConverter() 19 | { 20 | _markdownConverter = new CustomMarkdownConverter(); 21 | } 22 | 23 | public override bool CanConvertUrl(string url) 24 | { 25 | // Exclude Confluence URLs so they can be handled by ConfluenceConverter 26 | if (url.ToLowerInvariant().Contains("confluence") || 27 | url.ToLowerInvariant().Contains("atlassian.net")) 28 | { 29 | return false; 30 | } 31 | return Regex.IsMatch(url, @"^https?://"); 32 | } 33 | 34 | 35 | public override bool 
CanConvertFile(string extension) 36 | { 37 | return extension.Equals(".html", StringComparison.OrdinalIgnoreCase) || 38 | extension.Equals(".htm", StringComparison.OrdinalIgnoreCase); 39 | } 40 | 41 | public override async Task ConvertAsync(string pathOrUrl, ConversionOptions options) 42 | { 43 | if (!CanConvertFile(options.FileExtension)) 44 | return null; 45 | 46 | var htmlContent = await Task.Run(() => File.ReadAllText(pathOrUrl)); 47 | var doc = new HtmlDocument(); 48 | doc.LoadHtml(htmlContent); 49 | 50 | var body = doc.DocumentNode.SelectSingleNode("//body") ?? doc.DocumentNode; 51 | var markdown = _markdownConverter.ConvertToMarkdown(body.InnerHtml); 52 | 53 | var title = doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim(); 54 | 55 | return new DocumentConverterResult 56 | { 57 | Title = title, 58 | TextContent = markdown 59 | }; 60 | } 61 | 62 | public override async Task> ConvertToListAsync(string pathOrUrl, ConversionOptions options) 63 | { 64 | var result = await ConvertAsync(pathOrUrl, options); 65 | return result != null ? new List { result } : new List(); 66 | } 67 | } 68 | } -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 |
--------------------------------------------------------------------------------
/MarkItDown/Converters/WikipediaConverter.cs:
--------------------------------------------------------------------------------
// Converters/WikipediaConverter.cs

using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
using MarkItDownSharp.Helpers;
using MarkItDownSharp.Models;

namespace MarkItDownSharp.Converters
{
    /// <summary>
    /// Converts a downloaded Wikipedia article page to Markdown, keeping only the
    /// main article content when the page exposes it.
    /// </summary>
    public class WikipediaConverter : DocumentConverter
    {
        // Compiled once instead of re-parsing the pattern on every call. URL schemes
        // and host names are case-insensitive, hence IgnoreCase.
        private static readonly Regex WikipediaUrlPattern = new Regex(
            @"^https?://[a-z]{2,3}\.wikipedia\.org/",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

        private readonly CustomMarkdownConverter _markdownConverter;

        public WikipediaConverter()
        {
            _markdownConverter = new CustomMarkdownConverter();
        }

        /// <summary>
        /// Returns true for URLs on a two- or three-letter language subdomain of wikipedia.org.
        /// </summary>
        /// <param name="url">The URL to test; null yields false.</param>
        public override bool CanConvertUrl(string url)
        {
            return url != null && WikipediaUrlPattern.IsMatch(url);
        }

        /// <summary>
        /// Always false: this converter is selected by URL, never by file extension.
        /// </summary>
        public override bool CanConvertFile(string extension)
        {
            return false;
        }

        /// <summary>
        /// Reads the HTML stored at <paramref name="pathOrUrl"/> and converts it to Markdown.
        /// </summary>
        /// <param name="pathOrUrl">
        /// Path of a local file holding the page HTML. UrlConverter downloads the page to a
        /// temporary file and passes that file's path here, so this is not fetched over HTTP.
        /// </param>
        /// <param name="options">Conversion options (not read by this converter).</param>
        /// <returns>The page title (may be null) and the Markdown text.</returns>
        public override async Task<DocumentConverterResult> ConvertAsync(string pathOrUrl, ConversionOptions options)
        {
            var htmlContent = await Task.Run(() => File.ReadAllText(pathOrUrl));
            var doc = new HtmlDocument();
            doc.LoadHtml(htmlContent);

            // Remove script and style tags. SelectNodes returns null when nothing matches.
            var unwantedNodes = doc.DocumentNode.SelectNodes("//script|//style");
            if (unwantedNodes != null)
            {
                foreach (var node in unwantedNodes)
                {
                    node.Remove();
                }
            }

            var contentDiv = doc.DocumentNode.SelectSingleNode("//div[@id='mw-content-text']");
            var titleSpan = doc.DocumentNode.SelectSingleNode("//span[contains(@class, 'mw-page-title-main')]");

            // Prefer the article heading; fall back to the document <title>.
            var title = titleSpan?.InnerText.Trim()
                        ?? doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim();

            string markdownContent;
            if (contentDiv != null)
            {
                var bodyMarkdown = _markdownConverter.ConvertToMarkdown(contentDiv.InnerHtml);

                // Only emit a heading when a title was found; otherwise the output
                // would start with an empty "# " line.
                markdownContent = string.IsNullOrEmpty(title)
                    ? bodyMarkdown
                    : $"# {title}\n\n{bodyMarkdown}";
            }
            else
            {
                // Not the expected article layout: convert the whole document.
                markdownContent = _markdownConverter.ConvertToMarkdown(doc.DocumentNode.InnerHtml);
            }

            return new DocumentConverterResult
            {
                Title = title,
                TextContent = markdownContent
            };
        }

        /// <summary>
        /// Wraps the single <see cref="ConvertAsync"/> result in a list.
        /// </summary>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            var result = await ConvertAsync(pathOrUrl, options);
            return result != null ?
new List<DocumentConverterResult> { result } : new List<DocumentConverterResult>();
        }
    }
}
--------------------------------------------------------------------------------
/MarkItDown/Converters/XlsxConverter.cs:
--------------------------------------------------------------------------------
// Converters/XlsxConverter.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using ClosedXML.Excel;
using MarkItDownSharp.Exceptions;
using MarkItDownSharp.Helpers;
using MarkItDownSharp.Models;

namespace MarkItDownSharp.Converters
{
    /// <summary>
    /// Converts Excel workbooks (.xlsx) to Markdown: one "##" section and one pipe
    /// table per worksheet, using the first used row as the table header.
    /// </summary>
    public class XlsxConverter : DocumentConverter
    {
        private readonly CustomMarkdownConverter _markdownConverter;

        public XlsxConverter()
        {
            _markdownConverter = new CustomMarkdownConverter();
        }

        /// <summary>Always false: workbooks are only read from local files.</summary>
        public override bool CanConvertUrl(string url)
        {
            return false;
        }

        /// <summary>
        /// Returns true for the ".xlsx" extension (case-insensitive); a null extension yields false.
        /// </summary>
        public override bool CanConvertFile(string extension)
        {
            // Static string.Equals tolerates null, unlike extension.Equals(...).
            return string.Equals(extension, ".xlsx", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Converts the workbook at <paramref name="pathOrUrl"/> to Markdown.
        /// </summary>
        /// <param name="pathOrUrl">Path of the .xlsx file.</param>
        /// <param name="options">Conversion options; only FileExtension is read.</param>
        /// <returns>
        /// The result, titled after the first worksheet (or the file name when the sheet name
        /// is blank), or null when the extension is not ".xlsx".
        /// </returns>
        /// <exception cref="ConversionException">The workbook could not be read or converted.</exception>
        public override async Task<DocumentConverterResult> ConvertAsync(string pathOrUrl, ConversionOptions options)
        {
            if (!CanConvertFile(options.FileExtension)) return null;

            try
            {
                using (var workbook = new XLWorkbook(pathOrUrl))
                {
                    var sb = new StringBuilder();
                    var title = Path.GetFileNameWithoutExtension(pathOrUrl);

                    foreach (var worksheet in workbook.Worksheets)
                    {
                        sb.AppendLine($"## {worksheet.Name}\n");

                        var table = worksheet.RangeUsed();
                        if (table == null)
                        {
                            sb.AppendLine("_No data in this sheet._\n");
                            continue;
                        }

                        var rows = table.RowsUsed().ToList();
                        if (rows.Count == 0)
                        {
                            sb.AppendLine("_No data in this sheet._\n");
                            continue;
                        }

                        // Use the first row as headers
                        var headers = rows[0].Cells().Select(c => EscapeCell(c.Value.ToString())).ToList();
                        sb.AppendLine("| " + string.Join(" | ", headers) + " |");
                        sb.AppendLine("|" +
                                      string.Join("|", headers.Select(h => new string('-', Math.Max(3, h.Length)))) +
                                      "|");

                        // Add data rows
                        foreach (var row in rows.Skip(1))
                        {
                            var cells = row.Cells().Select(c => EscapeCell(c.Value.ToString()));
                            sb.AppendLine("| " + string.Join(" | ", cells) + " |");
                        }

                        sb.AppendLine("\n");
                    }

                    // Prefer the first worksheet's name as the title when it has one.
                    var firstSheet = workbook.Worksheets.FirstOrDefault();
                    if (firstSheet != null && !string.IsNullOrWhiteSpace(firstSheet.Name)) title = firstSheet.Name;

                    // NOTE(review): the text built above is already Markdown, yet it is passed
                    // through the same converter the HTML converters use. Confirm that
                    // CustomMarkdownConverter leaves pipe tables intact.
                    var markdownContent = _markdownConverter.ConvertToMarkdown(sb.ToString());

                    return new DocumentConverterResult
                    {
                        Title = title,
                        TextContent = markdownContent.Trim()
                    };
                }
            }
            catch (Exception ex)
            {
                // Surface any failure (e.g. a corrupted file) as a ConversionException,
                // keeping the original exception as the inner exception.
                throw new ConversionException($"Failed to convert Excel file '{pathOrUrl}': {ex.Message}", ex);
            }
        }

        /// <summary>
        /// Makes a cell value safe for a single pipe-table row: escapes "|" and replaces
        /// line breaks with spaces, since a raw line break would end the row early.
        /// </summary>
        private static string EscapeCell(string text)
        {
            return (text ?? string.Empty)
                .Replace("|", "\\|")
                .Replace("\r\n", " ")
                .Replace('\n', ' ')
                .Replace('\r', ' ');
        }

        /// <summary>
        /// Wraps the single <see cref="ConvertAsync"/> result in a list.
        /// </summary>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            var result = await ConvertAsync(pathOrUrl, options);
            return result != null ?
new List<DocumentConverterResult> { result } : new List<DocumentConverterResult>();
        }
    }
}
--------------------------------------------------------------------------------
/MarkItDown/Converters/PDFConverter.cs:
--------------------------------------------------------------------------------
// Converters/PdfConverter.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using MarkItDownSharp.Models;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;

namespace MarkItDownSharp.Converters
{
    /// <summary>
    /// Extracts the text of a PDF file line by line. Words are grouped into lines by
    /// their vertical position, and the first non-empty line becomes the title.
    /// </summary>
    public class PdfConverter : DocumentConverter
    {
        // Maximum difference between two words' bottom edges for them to count as
        // the same line. Adjust as needed for vertical grouping.
        private const double LineTolerance = 2.0;

        /// <summary>Always false: PDFs are only read from local files.</summary>
        public override bool CanConvertUrl(string url)
        {
            return false;
        }

        /// <summary>
        /// Returns true for the ".pdf" extension (case-insensitive); a null extension yields false.
        /// </summary>
        public override bool CanConvertFile(string extension)
        {
            // Static string.Equals tolerates null, unlike extension.Equals(...).
            return string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Converts the PDF at <paramref name="localPath"/> to plain text lines.
        /// </summary>
        /// <param name="localPath">Path of the PDF file.</param>
        /// <param name="options">Conversion options; only FileExtension is read.</param>
        /// <returns>
        /// The result, titled with the first non-empty text line (or the file name when the
        /// document has no text), or null when the extension is not ".pdf".
        /// </returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public override async Task<DocumentConverterResult> ConvertAsync(string localPath, ConversionOptions options)
        {
            if (!CanConvertFile(options.FileExtension)) return null;

            if (!File.Exists(localPath))
                throw new FileNotFoundException($"File not found: {localPath}", localPath);

            var markdownBuilder = new StringBuilder();
            var title = Path.GetFileNameWithoutExtension(localPath); // Default title: document name
            var isTitleExtracted = false;

            using (var document = PdfDocument.Open(localPath))
            {
                foreach (var page in document.GetPages())
                {
                    var words = page.GetWords().ToList();

                    // Group once per page, then order lines top to bottom.
                    var sortedLines = GroupWordsIntoLines(words).OrderByDescending(l => l.Y);

                    foreach (var line in sortedLines)
                    {
                        // Order words left to right before joining them.
                        var lineText = string.Join(
                            " ",
                            line.Words.OrderBy(w => w.BoundingBox.Left).Select(w => w.Text)).Trim();

                        // The title uses the same left-to-right text as the body, so its
                        // words can no longer come out in grouping order.
                        if (!isTitleExtracted && lineText.Length > 0)
                        {
                            title = lineText;
                            isTitleExtracted = true;
                        }

                        markdownBuilder.AppendLine(lineText);
                    }

                    markdownBuilder.AppendLine(); // Blank line after each page for separation
                }
            }

            return new DocumentConverterResult
            {
                Title = title,
                TextContent = markdownBuilder.ToString().Trim()
            };
        }

        /// <summary>
        /// Wraps the single <see cref="ConvertAsync"/> result in a list (empty when it is null).
        /// </summary>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            var result = await ConvertAsync(pathOrUrl, options);
            return result != null
                ? new List<DocumentConverterResult> { result }
                : new List<DocumentConverterResult>();
        }

        /// <summary>
        /// Groups words into lines based on their vertical (Y) positions.
        /// </summary>
        /// <param name="words">The words of one page.</param>
        /// <returns>
        /// The lines in creation order. Words are visited from the highest bottom edge to the
        /// lowest, and each joins the first line whose Y is within <see cref="LineTolerance"/>.
        /// </returns>
        private static List<Line> GroupWordsIntoLines(List<Word> words)
        {
            var lines = new List<Line>();

            foreach (var word in words.OrderByDescending(w => w.BoundingBox.Bottom))
            {
                var bottom = word.BoundingBox.Bottom;
                var matchingLine = lines.FirstOrDefault(l => Math.Abs(l.Y - bottom) <= LineTolerance);

                if (matchingLine != null)
                {
                    matchingLine.Words.Add(word);
                }
                else
                {
                    lines.Add(new Line
                    {
                        Y = bottom,
                        Words = new List<Word> { word }
                    });
                }
            }

            return lines;
        }

        /// <summary>
        /// Represents a line of text composed of multiple words.
        /// </summary>
        private class Line
        {
            // Bottom edge of the first word placed on the line.
            public double Y { get; set; }
            public List<Word> Words { get; set; }
        }
    }
}
--------------------------------------------------------------------------------
/MarkItDownDemo/App.config:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
--------------------------------------------------------------------------------
/MarkItDownDemo/packages.config:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 |
41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
--------------------------------------------------------------------------------
/MarkItDown/Converters/MediaConverters.cs:
--------------------------------------------------------------------------------
// Converters/MediaConverters.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using MarkItDownSharp.Models;
using NAudio.Wave;
using File = System.IO.File;

namespace MarkItDownSharp.Converters
{
    /// <summary>
    /// Base class for audio converters: provides tag metadata extraction and the
    /// (currently unimplemented) transcription step shared by the subclasses.
    /// </summary>
    public abstract class MediaConverter : DocumentConverter
    {
        /// <summary>
        /// Reads title, artist, album and duration from the file's tags.
        /// </summary>
        /// <param name="localPath">Path of the media file.</param>
        /// <returns>
        /// A four-line "Key: value" block, or "Metadata could not be retrieved." when the
        /// tags cannot be read.
        /// </returns>
        protected string GetMetadata(string localPath)
        {
            try
            {
                // TagLib.File is IDisposable; dispose it deterministically rather than
                // leaving it to the finalizer.
                using (var file = TagLib.File.Create(localPath))
                {
                    var metadata = $"Title: {file.Tag.Title}\n" +
                                   $"Artist: {string.Join(", ", file.Tag.Performers)}\n" +
                                   $"Album: {file.Tag.Album}\n" +
                                   $"Duration: {file.Properties.Duration}\n";

                    return metadata.Trim();
                }
            }
            catch
            {
                // Deliberate best effort: unreadable metadata must not fail the conversion.
                return "Metadata could not be retrieved.";
            }
        }

        /// <summary>
        /// Placeholder for speech-to-text. No transcription is performed; the returned
        /// task always yields the fixed "[Audio transcription not implemented]" marker.
        /// </summary>
        /// <param name="localPath">Path of the audio file (currently unused).</param>
        /// <param name="options">Conversion options (currently unused).</param>
        protected Task<string> TranscribeAudioAsync(string localPath, ConversionOptions options)
        {
            // Implement transcription logic using a speech recognition API or library.
            return Task.FromResult("[Audio transcription not implemented]");
        }

        /// <summary>Always false: media is only read from local files.</summary>
        public override bool CanConvertUrl(string url)
        {
            return false;
        }

        /// <summary>Always false here; each subclass claims its own extension.</summary>
        public override bool CanConvertFile(string extension)
        {
            // To be overridden by subclasses
            return false;
        }
    }

    /// <summary>Converts .wav files to a metadata block plus transcript section.</summary>
    public class WavConverter : MediaConverter
    {
        /// <summary>
        /// Returns true for the ".wav" extension (case-insensitive); a null extension yields false.
        /// </summary>
        public override bool CanConvertFile(string extension)
        {
            // Static string.Equals tolerates null, unlike extension.Equals(...).
            return string.Equals(extension, ".wav", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Builds the Markdown for a WAV file: tag metadata followed by the transcript section.
        /// </summary>
        /// <returns>The result (Title is always null), or null when the extension is not ".wav".</returns>
        public override async Task<DocumentConverterResult> ConvertAsync(string localPath, ConversionOptions options)
        {
            if (!CanConvertFile(options.FileExtension)) return null;

            var markdown = "";

            // Extract metadata
            var metadata = GetMetadata(localPath);
            if (!string.IsNullOrEmpty(metadata)) markdown += metadata + "\n";

            // Transcribe audio
            var transcript = await TranscribeAudioAsync(localPath, options);
            if (!string.IsNullOrEmpty(transcript)) markdown += "\n### Audio Transcript:\n" + transcript + "\n";

            return new DocumentConverterResult
            {
                Title = null,
                TextContent = markdown.Trim()
            };
        }

        /// <summary>
        /// Wraps the single <see cref="ConvertAsync"/> result in a list (empty when it is null).
        /// </summary>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            var result = await ConvertAsync(pathOrUrl, options);
            return result != null
                ? new List<DocumentConverterResult> { result }
                : new List<DocumentConverterResult>();
        }
    }

    /// <summary>Converts .mp3 files to a metadata block plus transcript section.</summary>
    public class Mp3Converter : MediaConverter
    {
        /// <summary>
        /// Returns true for the ".mp3" extension (case-insensitive); a null extension yields false.
        /// </summary>
        public override bool CanConvertFile(string extension)
        {
            // Static string.Equals tolerates null, unlike extension.Equals(...).
            return string.Equals(extension, ".mp3", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Builds the Markdown for an MP3 file. The audio is first decoded to a temporary
        /// WAV file for transcription; that file is always deleted afterwards.
        /// </summary>
        /// <returns>The result (Title is always null), or null when the extension is not ".mp3".</returns>
        public override async Task<DocumentConverterResult> ConvertAsync(string localPath, ConversionOptions options)
        {
            if (!CanConvertFile(options.FileExtension)) return null;

            var markdown = "";

            // Extract metadata
            var metadata = GetMetadata(localPath);
            if (!string.IsNullOrEmpty(metadata)) markdown += metadata + "\n";

            // Convert MP3 to WAV for transcription
            var tempWavPath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName() + ".wav");
            try
            {
                using (var reader = new Mp3FileReader(localPath))
                using (var writer = new WaveFileWriter(tempWavPath, reader.WaveFormat))
                {
                    reader.CopyTo(writer);
                }

                // Transcribe audio
                var transcript = await TranscribeAudioAsync(tempWavPath, options);
                if (!string.IsNullOrEmpty(transcript)) markdown += "\n### Audio Transcript:\n" + transcript + "\n";
            }
            catch (Exception)
            {
                // Deliberate best effort: a decoding or transcription failure is reported
                // in the output instead of failing the whole conversion.
                markdown += "\n### Audio Transcript:\nError. Could not transcribe this audio.\n";
            }
            finally
            {
                if (File.Exists(tempWavPath)) File.Delete(tempWavPath);
            }

            return new DocumentConverterResult
            {
                Title = null,
                TextContent = markdown.Trim()
            };
        }

        /// <summary>
        /// Wraps the single <see cref="ConvertAsync"/> result in a list (empty when it is null).
        /// </summary>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            var result = await ConvertAsync(pathOrUrl, options);
            return result != null
                ? new List<DocumentConverterResult> { result }
                : new List<DocumentConverterResult>();
        }
    }

    // Similarly, implement other MediaConverters like ImageConverter if needed
}
--------------------------------------------------------------------------------
/MarkItDown/Converters/ZipConverter.cs:
--------------------------------------------------------------------------------
// Converters/ZipConverter.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using ICSharpCode.SharpZipLib.Core;
using ICSharpCode.SharpZipLib.Zip;
using MarkItDownSharp.Helpers;
using MarkItDownSharp.Models;

namespace MarkItDownSharp.Converters
{
    public class ZipConverter : DocumentConverter
    {
        public override bool CanConvertUrl(string url)
        {
            return false;
        }

        public override bool CanConvertFile(string extension)
        {
            return extension.Equals(".zip", StringComparison.OrdinalIgnoreCase);
        }

        public override async Task<DocumentConverterResult> ConvertAsync(string localPath, ConversionOptions options)
        {
            if (!CanConvertFile(options.FileExtension)) return null;

            if
(options.ParentConverters == null || options.ParentConverters.Count == 0)
                return new DocumentConverterResult
                {
                    Title = null,
                    TextContent = $"[ERROR] No converters available to process zip contents from: {localPath}"
                };

            var extractedFolderName = $"extracted_{Path.GetFileNameWithoutExtension(localPath)}_zip";
            var markdownContent = new System.Text.StringBuilder(
                $"Content from the zip file `{Path.GetFileName(localPath)}`:\n\n");

            // Declared outside the try block so the finally block can clean it up.
            string destinationPath = null;

            try
            {
                // Extract into a sibling folder of the archive. The folder name comes from
                // the archive's own file name and so cannot contain directory separators;
                // the real traversal risk is in the entry names, checked below.
                var parentDirectory = Path.GetDirectoryName(Path.GetFullPath(localPath));
                destinationPath = Path.Combine(parentDirectory, extractedFolderName);
                var destinationRoot =
                    destinationPath.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar) +
                    Path.DirectorySeparatorChar;

                // Extract ZIP
                Directory.CreateDirectory(destinationPath);
                using (var fs = File.OpenRead(localPath))
                using (var zf = new ZipFile(fs))
                {
                    var buffer = new byte[4096];

                    foreach (ZipEntry zipEntry in zf)
                    {
                        if (!zipEntry.IsFile)
                            continue;

                        // Zip Slip guard: an entry named "../x" or an absolute path would
                        // otherwise be written outside the extraction folder. Resolve the
                        // target and reject the archive if it escapes that folder.
                        var fullZipToPath = Path.GetFullPath(Path.Combine(destinationPath, zipEntry.Name));
                        if (!fullZipToPath.StartsWith(destinationRoot, StringComparison.Ordinal))
                            throw new InvalidDataException(
                                $"Zip entry '{zipEntry.Name}' resolves outside the extraction folder.");

                        // CreateDirectory is a no-op when the directory already exists.
                        Directory.CreateDirectory(Path.GetDirectoryName(fullZipToPath));

                        using (var zipStream = zf.GetInputStream(zipEntry))
                        using (var streamWriter = File.Create(fullZipToPath))
                        {
                            StreamUtils.Copy(zipStream, streamWriter, buffer);
                        }
                    }
                }

                // Process extracted files
                foreach (var filePath in Directory.EnumerateFiles(destinationPath, "*", SearchOption.AllDirectories))
                {
                    var relativePath = PathHelper.GetRelativePath(destinationPath, filePath);
                    var fileExtension = Path.GetExtension(filePath).ToLowerInvariant();

                    // Use existing CanConvertFile method
                    var converter = options.ParentConverters.Find(c => c.CanConvertFile(fileExtension));
                    if (converter != null)
                    {
                        // Fresh options per file so the caller's FileExtension is not
                        // overwritten while nested files are converted.
                        var newOptions = new ConversionOptions
                        {
                            FileExtension = fileExtension,
                            ParentConverters = options.ParentConverters,
                            HttpClient = options.HttpClient,
                            LlmClient = options.LlmClient,
                            LlmModel = options.LlmModel,
                            StyleMap = options.StyleMap,
                            CleanupExtracted = options.CleanupExtracted
                        };

                        var result = await converter.ConvertAsync(filePath, newOptions);
                        if (result != null)
                            markdownContent.Append($"## File: {relativePath}\n\n{result.TextContent}\n\n");
                    }
                    else
                    {
                        markdownContent.Append(
                            $"### Unsupported File: {relativePath}\n\n_No converter available for this file type._\n\n");
                    }
                }

                return new DocumentConverterResult
                {
                    Title = null,
                    TextContent = markdownContent.ToString().Trim()
                };
            }
            catch (Exception ex)
            {
                // Failures are reported in the result text rather than thrown.
                return new DocumentConverterResult
                {
                    Title = null,
                    TextContent = $"[ERROR] Failed to process zip file {localPath}: {ex.Message}"
                };
            }
            finally
            {
                // Cleanup runs on failure too, so a broken archive does not leave its
                // partially extracted folder behind.
                if (options.CleanupExtracted && destinationPath != null && Directory.Exists(destinationPath))
                {
                    try
                    {
                        Directory.Delete(destinationPath, true);
                    }
                    catch (IOException)
                    {
                        // Best effort: a locked file must not discard the conversion result.
                    }
                    catch (UnauthorizedAccessException)
                    {
                        // Best effort, as above.
                    }
                }
            }
        }

        /// <summary>
        /// Wraps the single <see cref="ConvertAsync"/> result in a list.
        /// </summary>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            var result = await ConvertAsync(pathOrUrl, options);
            return result != null ?
new List<DocumentConverterResult> { result } : new List<DocumentConverterResult>();
        }
    }
}
--------------------------------------------------------------------------------
/MarkItDown/MarkItDownSharp.cs:
--------------------------------------------------------------------------------
// MarkItDown/MarkItDown.cs

using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;
using MarkItDownSharp.Converters;
using MarkItDownSharp.Exceptions;
using MarkItDownSharp.Helpers;
using MarkItDownSharp.Models;

namespace MarkItDownSharp
{
    /// <summary>
    /// Entry point of the library: picks a registered converter for a local file or
    /// URL and returns its Markdown output.
    /// </summary>
    public class MarkItDownConverter
    {
        // Ordered by priority: index 0 is consulted first.
        private readonly List<DocumentConverter> _pageConverters;

        public MarkItDownConverter()
        {
            // Built-in converters, highest priority first. They are listed directly
            // rather than passed through RegisterPageConverter: that method inserts at
            // the front, which would reverse this order and leave ConfluenceConverter
            // with the lowest priority instead of the highest.
            _pageConverters = new List<DocumentConverter>
            {
                new ConfluenceConverter(), // highest priority for Confluence URLs
                new UrlConverter(),
                new ZipConverter(),
                new PdfConverter(),
                new DocxConverter(),
                new XlsxConverter(),
                new PptxConverter(),
                new PlainTextConverter(),
                new HtmlConverter(),
                new WavConverter(),
                new Mp3Converter()
                // Add other converters as needed
            };
        }

        /// <summary>
        /// Converts a local file or URL to Markdown.
        /// </summary>
        /// <param name="pathOrUrl">The local file path or URL.</param>
        /// <param name="options">Additional conversion options.</param>
        /// <returns>A DocumentConverterResult containing the conversion output.</returns>
        /// <exception cref="FileNotFoundException">The input is not a URL and no such file exists.</exception>
        /// <exception cref="UnsupportedFormatException">No converter produced a result.</exception>
        public async Task<DocumentConverterResult> ConvertLocalAsync(string pathOrUrl, ConversionOptions options = null)
        {
            options = options ?? new ConversionOptions();
            options.ParentConverters = _pageConverters;

            if (UrlHelper.IsValidUrl(pathOrUrl))
            {
                // Delegate to URLConverter
                var urlConverter = _pageConverters.OfType<UrlConverter>()
                    .FirstOrDefault(c => c.CanConvertUrl(pathOrUrl));
                if (urlConverter != null)
                {
                    var result = await urlConverter.ConvertAsync(pathOrUrl, options);
                    if (result != null)
                        return result;
                }
            }
            else
            {
                if (!File.Exists(pathOrUrl))
                    throw new FileNotFoundException($"File not found: {pathOrUrl}");

                var extension = Path.GetExtension(pathOrUrl).ToLowerInvariant();
                options.FileExtension = extension;

                // Try every converter that claims the extension, in priority order.
                foreach (var converter in _pageConverters)
                {
                    if (converter.CanConvertFile(extension))
                    {
                        var result = await converter.ConvertAsync(pathOrUrl, options);
                        if (result != null)
                            return result;
                    }
                }
            }

            throw new UnsupportedFormatException($"Unsupported input: {pathOrUrl}");
        }

        /// <summary>
        /// Converts a local file or URL to a list of DocumentConverterResult items.
        /// This method delegates to the underlying converter's ConvertToListAsync, which may return
        /// multiple results (for example, one for each page in a Confluence space) or a single-item list.
        /// </summary>
        /// <param name="pathOrUrl">The local file path or URL.</param>
        /// <param name="options">Additional conversion options.</param>
        /// <returns>A List of DocumentConverterResult containing the conversion output.</returns>
        /// <exception cref="FileNotFoundException">The input is not a URL and no such file exists.</exception>
        /// <exception cref="UnsupportedFormatException">No converter produced a result.</exception>
        public async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options = null)
        {
            options = options ?? new ConversionOptions();
            options.ParentConverters = _pageConverters;

            if (UrlHelper.IsValidUrl(pathOrUrl))
            {
                // Same assignment UrlConverter.ConvertAsync makes before delegating, so a
                // converter reached directly from here sees the same options.
                options.Url = pathOrUrl;

                // Try every converter that claims the URL, in priority order, mirroring
                // the file branch below, instead of giving up after the first match.
                foreach (var converter in _pageConverters.Where(c => c.CanConvertUrl(pathOrUrl)))
                {
                    var results = await converter.ConvertToListAsync(pathOrUrl, options);
                    if (results != null && results.Count > 0)
                        return results;
                }
            }
            else
            {
                if (!File.Exists(pathOrUrl))
                    throw new FileNotFoundException($"File not found: {pathOrUrl}");

                var extension = Path.GetExtension(pathOrUrl).ToLowerInvariant();
                options.FileExtension = extension;

                foreach (var converter in _pageConverters)
                {
                    if (converter.CanConvertFile(extension))
                    {
                        var results = await converter.ConvertToListAsync(pathOrUrl, options);
                        if (results != null && results.Count > 0)
                            return results;
                    }
                }
            }

            throw new UnsupportedFormatException($"Unsupported input: {pathOrUrl}");
        }

        /// <summary>
        /// Registers a new page converter ahead of all converters registered so far.
        /// </summary>
        /// <param name="converter">The converter to register.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="converter"/> is null.</exception>
        public void RegisterPageConverter(DocumentConverter converter)
        {
            // A null entry would only fail later, inside the conversion loops.
            if (converter == null)
                throw new System.ArgumentNullException(nameof(converter));

            _pageConverters.Insert(0, converter); // Higher priority converters are checked first
        }
    }
}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 
118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. 
Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger 
// Converters/URLConverter.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using MarkItDownSharp.Exceptions;
using MarkItDownSharp.Models;

namespace MarkItDownSharp.Converters
{
    /// <summary>
    /// Entry point for converting HTTP(S) URLs. The URL is offered to each registered
    /// URL-specific converter in order; the first one that accepts it performs the conversion.
    /// Except for Confluence (which receives the URL itself), the resource is downloaded to a
    /// temporary file and the converter is handed that file path.
    /// </summary>
    public class UrlConverter : DocumentConverter
    {
        // Extension assumed when the URL path does not carry one.
        private const string DefaultExtension = ".html";

        private const int FileBufferSize = 4096;

        // A single client is shared by all instances: HttpClient is designed to be reused,
        // and this class never disposed the per-instance client it used to create.
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        private readonly List<DocumentConverter> _specificUrlConverters;

        /// <summary>
        /// Registers the URL-specific converters. Order matters: the first converter whose
        /// <c>CanConvertUrl</c> returns true wins, so the generic HTML converter comes last.
        /// </summary>
        public UrlConverter()
        {
            _specificUrlConverters = new List<DocumentConverter>
            {
                new ConfluenceConverter(),
                new WikipediaConverter(),
                new YouTubeConverter(),
                new BingSerpConverter(),
                new HtmlConverter()
                // Add other specific URL converters here
            };
        }

        /// <summary>Returns true when any registered URL-specific converter accepts <paramref name="url"/>.</summary>
        public override bool CanConvertUrl(string url)
        {
            return GetSpecificConverter(url) != null;
        }

        /// <summary>Always false: this converter does not handle file extensions directly.</summary>
        public override bool CanConvertFile(string extension)
        {
            return false;
        }

        /// <summary>
        /// Converts the resource at <paramref name="pathOrUrl"/> to a single result.
        /// </summary>
        /// <param name="pathOrUrl">Absolute http/https URL.</param>
        /// <param name="options">Conversion options; <c>Url</c> and <c>FileExtension</c> are updated.</param>
        /// <returns>The converter's result, or null when the input is not an absolute http/https URL.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="options"/> is null (for a valid URL).</exception>
        /// <exception cref="ConversionException">Download, temp-file handling or conversion failed.</exception>
        public override async Task<DocumentConverterResult> ConvertAsync(string pathOrUrl, ConversionOptions options)
        {
            if (!IsValidUrl(pathOrUrl))
                return null;

            var converter = SelectConverter(pathOrUrl, options);

            // Confluence is handed the URL itself instead of a downloaded copy.
            if (converter is ConfluenceConverter)
                return await converter.ConvertAsync(pathOrUrl, options);

            return await ConvertDownloadedAsync(
                pathOrUrl,
                options,
                tempFilePath => converter.ConvertAsync(tempFilePath, options),
                $"Failed to convert URL: {pathOrUrl}");
        }

        /// <summary>
        /// Converts the resource at <paramref name="pathOrUrl"/> to a list of results.
        /// </summary>
        /// <param name="pathOrUrl">Absolute http/https URL.</param>
        /// <param name="options">Conversion options; <c>Url</c> and <c>FileExtension</c> are updated.</param>
        /// <returns>The converter's results, or null when the input is not an absolute http/https URL.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="options"/> is null (for a valid URL).</exception>
        /// <exception cref="ConversionException">Download, temp-file handling or conversion failed.</exception>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            if (!IsValidUrl(pathOrUrl))
                return null;

            var converter = SelectConverter(pathOrUrl, options);

            // Confluence is handed the URL itself instead of a downloaded copy.
            if (converter is ConfluenceConverter)
                return await converter.ConvertToListAsync(pathOrUrl, options);

            return await ConvertDownloadedAsync(
                pathOrUrl,
                options,
                tempFilePath => converter.ConvertToListAsync(tempFilePath, options),
                $"Failed to convert URL to list: {pathOrUrl}");
        }

        // Records the URL on the options and picks the converter, falling back to HtmlConverter.
        private DocumentConverter SelectConverter(string url, ConversionOptions options)
        {
            if (options == null)
                throw new ArgumentNullException(nameof(options));

            options.Url = url;
            return GetSpecificConverter(url) ?? new HtmlConverter();
        }

        // Downloads the URL to a temp file, runs `convert` on that file and always removes the file.
        // Any failure is wrapped in a ConversionException carrying `failureMessage`.
        private async Task<TResult> ConvertDownloadedAsync<TResult>(
            string url,
            ConversionOptions options,
            Func<string, Task<TResult>> convert,
            string failureMessage)
        {
            var extension = GetExtensionFromUrl(url);
            var tempFilePath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName() + extension);

            try
            {
                var contentBytes = await DownloadContentAsync(url);

                // Always persist the payload byte-for-byte. Decoding it as UTF-8 and writing it
                // back as text (as was previously done for .docx/.pptx/.xlsx, which are ZIP
                // containers) corrupts binary content and can alter non-UTF-8 text.
                await WriteAllBytesAsync(tempFilePath, contentBytes);

                // Tell the delegate converter which extension the temp file carries.
                options.FileExtension = extension;

                return await convert(tempFilePath);
            }
            catch (Exception ex)
            {
                throw new ConversionException(failureMessage, ex);
            }
            finally
            {
                TryDeleteFile(tempFilePath);
            }
        }

        // Writes the buffer with real asynchronous file I/O instead of parking a pool thread in Task.Run.
        private static async Task WriteAllBytesAsync(string path, byte[] bytes)
        {
            using (var stream = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.None, FileBufferSize, useAsync: true))
            {
                await stream.WriteAsync(bytes, 0, bytes.Length).ConfigureAwait(false);
            }
        }

        // Best-effort cleanup: a temp file that cannot be removed must not mask the conversion outcome.
        private static void TryDeleteFile(string path)
        {
            if (!File.Exists(path))
                return;

            try
            {
                File.Delete(path);
            }
            catch (IOException)
            {
                // File still in use; leave it for the OS temp cleanup.
            }
            catch (UnauthorizedAccessException)
            {
                // No permission to delete; nothing useful to do here.
            }
        }

        // True only for absolute http/https URLs.
        private static bool IsValidUrl(string input)
        {
            return Uri.TryCreate(input, UriKind.Absolute, out var uriResult) &&
                   (uriResult.Scheme == Uri.UriSchemeHttp || uriResult.Scheme == Uri.UriSchemeHttps);
        }

        // First registered converter that accepts the URL, or null when none does.
        private DocumentConverter GetSpecificConverter(string url)
        {
            foreach (var converter in _specificUrlConverters)
            {
                if (converter.CanConvertUrl(url))
                    return converter;
            }

            return null;
        }

        // Fetches the response body; throws HttpRequestException on a non-success status code.
        private static async Task<byte[]> DownloadContentAsync(string url)
        {
            using (var response = await SharedHttpClient.GetAsync(url).ConfigureAwait(false))
            {
                response.EnsureSuccessStatusCode();
                return await response.Content.ReadAsByteArrayAsync().ConfigureAwait(false);
            }
        }

        // Lower-cased extension of the URL path (query string excluded); ".html" when there is none
        // or the URL/path cannot be parsed.
        private static string GetExtensionFromUrl(string url)
        {
            if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
                return DefaultExtension;

            string extension;
            try
            {
                extension = Path.GetExtension(uri.AbsolutePath);
            }
            catch (ArgumentException)
            {
                return DefaultExtension;
            }

            return string.IsNullOrEmpty(extension) ? DefaultExtension : extension.ToLowerInvariant();
        }
    }
}
// Converters/DocxConverter.cs

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using ReverseMarkdown;
using MarkItDownSharp.Models;
using DocumentFormat.OpenXml;
using Wp = DocumentFormat.OpenXml.Wordprocessing;

namespace MarkItDownSharp.Converters
{
    /// <summary>
    /// Converts a .docx file to Markdown. Each top-level paragraph is rendered to a small HTML
    /// fragment and passed through ReverseMarkdown; top-level tables are emitted as pipe tables
    /// whose first row is treated as the header.
    /// </summary>
    public class DocxConverter : DocumentConverter
    {
        private const string HeadingStylePrefix = "Heading";

        // NOTE(review): the HTML tag literals in this file were lost in the copy under review
        // (they appeared as empty strings). The tags below are reconstructed -- confirm the exact
        // spellings against the repository.
        private const string CellLineBreak = "<br>";

        private static readonly Converter MarkdownConverter = new Converter(new Config
        {
            UnknownTags = Config.UnknownTagsOption.Bypass,
            GithubFlavored = true,
            RemoveComments = true,
            SmartHrefHandling = true
        });

        /// <summary>Always false: this converter only works on local files.</summary>
        public override bool CanConvertUrl(string url) => false;

        /// <summary>True for the ".docx" extension (case-insensitive); false for null or anything else.</summary>
        public override bool CanConvertFile(string extension) =>
            string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase);

        /// <summary>
        /// Converts the document at <paramref name="pathOrUrl"/> to Markdown.
        /// </summary>
        /// <param name="pathOrUrl">Path of the .docx file.</param>
        /// <param name="options">Conversion options; only <c>FileExtension</c> is read.</param>
        /// <returns>
        /// Null when the options do not describe a .docx file; otherwise a result whose
        /// <c>TextContent</c> is the Markdown, an empty string for a document without a body, or
        /// the literal "Error processing document" when the file cannot be opened or read.
        /// </returns>
        public override Task<DocumentConverterResult> ConvertAsync(string pathOrUrl, ConversionOptions options)
        {
            if (options == null || !CanConvertFile(options.FileExtension))
                return Task.FromResult<DocumentConverterResult>(null);

            var textBuilder = new StringBuilder();

            try
            {
                using (var doc = WordprocessingDocument.Open(pathOrUrl, false))
                {
                    var mainPart = doc.MainDocumentPart;
                    var body = mainPart?.Document?.Body;

                    if (body == null)
                        return Task.FromResult(new DocumentConverterResult { TextContent = "" });

                    foreach (var element in body.ChildElements)
                    {
                        try
                        {
                            if (element is Wp.Paragraph paragraph)
                            {
                                textBuilder.AppendLine(ProcessParagraph(paragraph, mainPart));
                            }
                            else if (element is Wp.Table table)
                            {
                                textBuilder.AppendLine(ProcessTable(table, mainPart));
                            }
                        }
                        catch (Exception)
                        {
                            // Deliberate best effort: one malformed element is skipped so the
                            // rest of the document is still converted.
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Existing contract: failures to open/read the package are reported as content
                // rather than as an exception.
                return Task.FromResult(new DocumentConverterResult { TextContent = "Error processing document" });
            }

            return Task.FromResult(new DocumentConverterResult
            {
                Title = null,
                TextContent = textBuilder.ToString().Trim()
            });
        }

        /// <summary>
        /// Wraps <see cref="ConvertAsync"/>: a one-element list, or an empty list when the file
        /// is not handled by this converter.
        /// </summary>
        public override async Task<List<DocumentConverterResult>> ConvertToListAsync(string pathOrUrl, ConversionOptions options)
        {
            var result = await ConvertAsync(pathOrUrl, options);
            return result != null
                ? new List<DocumentConverterResult> { result }
                : new List<DocumentConverterResult>();
        }

        // Renders one paragraph as Markdown, prefixed as a heading or as a bullet item.
        private string ProcessParagraph(Wp.Paragraph paragraph, MainDocumentPart mainPart)
        {
            var htmlContent = new StringBuilder();
            ProcessInlineElements(paragraph.Elements(), htmlContent, mainPart);

            var markdown = MarkdownConverter.Convert(htmlContent.ToString().Trim()).Trim();
            if (markdown.Length == 0)
                return string.Empty; // do not emit a bare "#" or "*" for an empty paragraph

            // The heading and list helpers each return the full paragraph text, so they must be
            // alternatives: concatenating their results emitted every paragraph twice.
            return GetHeadingLevel(paragraph) > 0
                ? AddHeadingStyle(paragraph, markdown)
                : AddListPrefix(paragraph, mainPart, markdown);
        }

        // Appends the HTML for runs, hyperlinks and bare text elements; other element kinds are ignored.
        private void ProcessInlineElements(IEnumerable<OpenXmlElement> elements, StringBuilder htmlContent, MainDocumentPart mainPart)
        {
            foreach (var element in elements)
            {
                if (element is Wp.Run run)
                {
                    ProcessRun(run, htmlContent);
                }
                else if (element is Wp.Hyperlink hyperlink)
                {
                    ProcessHyperlink(hyperlink, htmlContent, mainPart);
                }
                else if (element is Wp.Text text)
                {
                    htmlContent.Append(WebUtility.HtmlEncode(text.Text));
                }
            }
        }

        // Appends the run's text wrapped in formatting tags, followed by a single space.
        private void ProcessRun(Wp.Run run, StringBuilder htmlContent)
        {
            var text = string.Concat(run.Elements<Wp.Text>().Select(t => t.Text));
            if (string.IsNullOrEmpty(text)) return;

            var props = run.RunProperties;
            var closingTags = new Stack<string>(); // popped in reverse so the tags nest correctly

            if (props?.Bold != null) { htmlContent.Append("<b>"); closingTags.Push("</b>"); }
            if (props?.Italic != null) { htmlContent.Append("<i>"); closingTags.Push("</i>"); }
            if (props?.Strike != null) { htmlContent.Append("<del>"); closingTags.Push("</del>"); }
            if (props?.Underline != null) { htmlContent.Append("<u>"); closingTags.Push("</u>"); }

            // Document text is data, not markup: escape it so characters such as '<' and '&'
            // are not interpreted as HTML by the Markdown converter.
            htmlContent.Append(WebUtility.HtmlEncode(text));

            while (closingTags.Count > 0)
            {
                htmlContent.Append(closingTags.Pop());
            }

            // NOTE(review): a space is appended after every run, including runs that end
            // mid-word -- confirm this is intended.
            htmlContent.Append(" ");
        }

        // Appends the hyperlink's content, wrapped in an anchor when its target URL can be resolved.
        private void ProcessHyperlink(Wp.Hyperlink hyperlink, StringBuilder htmlContent, MainDocumentPart mainPart)
        {
            var url = GetHyperlinkUrl(hyperlink, mainPart);
            if (url == null)
            {
                // No external target (for example an in-document anchor): keep the visible text
                // instead of dropping it.
                ProcessInlineElements(hyperlink.Elements(), htmlContent, mainPart);
                return;
            }

            htmlContent.Append($"<a href=\"{WebUtility.HtmlEncode(url)}\">");
            ProcessInlineElements(hyperlink.Elements(), htmlContent, mainPart);
            htmlContent.Append("</a> ");
        }

        // Looks the hyperlink's relationship id up in the main part; null when there is no id or no match.
        private string GetHyperlinkUrl(Wp.Hyperlink hyperlink, MainDocumentPart mainPart)
        {
            var relationshipId = hyperlink.Id?.Value;
            if (string.IsNullOrEmpty(relationshipId))
                return null;

            return mainPart.HyperlinkRelationships
                .FirstOrDefault(r => r.Id == relationshipId)?.Uri?.ToString();
        }

        // Heading level 1-6 parsed from a "HeadingN" paragraph style id; 0 when not a heading.
        private static int GetHeadingLevel(Wp.Paragraph paragraph)
        {
            var style = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
            if (style == null || !style.StartsWith(HeadingStylePrefix, StringComparison.OrdinalIgnoreCase))
                return 0;

            var suffix = style.Substring(HeadingStylePrefix.Length);
            return int.TryParse(suffix, NumberStyles.Integer, CultureInfo.InvariantCulture, out var level)
                   && level > 0 && level <= 6
                ? level
                : 0;
        }

        // Prefixes the text with '#' markers for heading paragraphs; returns it unchanged otherwise.
        private string AddHeadingStyle(Wp.Paragraph paragraph, string markdown)
        {
            var headingLevel = GetHeadingLevel(paragraph);
            return headingLevel > 0
                ? $"{new string('#', headingLevel)} {markdown}"
                : markdown;
        }

        // Prefixes the text with an indented bullet when the paragraph carries numbering properties.
        private string AddListPrefix(Wp.Paragraph paragraph, MainDocumentPart mainPart, string markdown)
        {
            var numberingProps = paragraph.ParagraphProperties?.NumberingProperties;
            if (numberingProps == null) return markdown;

            var level = numberingProps.NumberingLevelReference?.Val?.Value ?? 0;
            var indent = new string(' ', level * 2);
            return $"{indent}* {markdown}"; // Simplified: every list item is rendered as a bullet
        }

        // Renders a table as a Markdown pipe table; the first row becomes the header row.
        private string ProcessTable(Wp.Table table, MainDocumentPart mainPart)
        {
            var rows = table.Elements<Wp.TableRow>().ToList();
            if (rows.Count == 0) return "";

            var sb = new StringBuilder();

            var headers = ProcessRow(rows[0], mainPart);
            sb.AppendLine($"| {string.Join(" | ", headers)} |");
            sb.AppendLine($"|{string.Join("|", headers.Select(_ => "---"))}|");

            foreach (var row in rows.Skip(1))
            {
                var cells = ProcessRow(row, mainPart);
                sb.AppendLine($"| {string.Join(" | ", cells)} |");
            }

            return sb.ToString();
        }

        // Converts each cell of the row into single-line Markdown text.
        private List<string> ProcessRow(Wp.TableRow row, MainDocumentPart mainPart)
        {
            return row.Elements<Wp.TableCell>()
                .Select(cell => FormatCell(cell, mainPart))
                .ToList();
        }

        // Joins the cell's paragraphs with line-break tags and escapes pipes so that neither
        // newlines nor '|' characters break the table row.
        private string FormatCell(Wp.TableCell cell, MainDocumentPart mainPart)
        {
            var paragraphs = cell.Elements<Wp.Paragraph>()
                .Select(p => ProcessParagraph(p, mainPart))
                .Where(md => !string.IsNullOrEmpty(md));

            return string.Join("\n", paragraphs)
                .Replace("\r\n", "\n")
                .Trim()
                .Replace("|", "\\|")
                .Replace("\n", CellLineBreak);
        }
    }
}

tags and unwrap them. 43 | FixInvalidParagraphs(doc); 44 | 45 | // Preprocess: remove unnecessary attributes from nodes (to help the converters) 46 | SanitizeNodes(doc); 47 | 48 | // Get the cleaned-up HTML string 49 | var cleanedHtml = doc.DocumentNode.OuterHtml; 50 | 51 | // Configure ReverseMarkdown 52 | var config = new Config 53 | { 54 | UnknownTags = Config.UnknownTagsOption.Drop, 55 | GithubFlavored = true, 56 | RemoveComments = true, 57 | SmartHrefHandling = true, 58 | CleanupUnnecessarySpaces = true 59 | }; 60 | 61 | var converter = new Converter(config); 62 | 63 | // Register custom converters to enhance parsing: 64 | // • Custom list converters so that lists inside table cells get rendered inline. 65 | // • An inline converter so that tags like and

tags. 91 | // We “unwrap” the children so that ReverseMarkdown can process them correctly. 92 | private void FixInvalidParagraphs(HtmlDocument doc) 93 | { 94 | var pNodes = doc.DocumentNode.SelectNodes("//p"); 95 | if (pNodes != null) 96 | { 97 | // Using ToList because we will modify the document. 98 | foreach (var p in pNodes.ToList()) 99 | { 100 | // If the paragraph contains a div or a table... 101 | if (p.SelectSingleNode(".//div|.//table") != null) 102 | { 103 | var parent = p.ParentNode; 104 | // Insert each child of the paragraph before the

itself… 105 | foreach (var child in p.ChildNodes.ToList()) 106 | { 107 | parent.InsertBefore(child, p); 108 | } 109 | // …then remove the (now empty or redundant)

110 | p.Remove(); 111 | } 112 | } 113 | } 114 | } 115 | 116 | // Remove most attributes from nodes (except for some tags like and that require them) 117 | // This way extra classes, styles, data-* attributes etc. do not interfere with conversion. 118 | private void SanitizeNodes(HtmlDocument doc) 119 | { 120 | // Keep attributes for these tags only. 121 | var whitelist = new HashSet(StringComparer.OrdinalIgnoreCase) 122 | { 123 | "a", 124 | "img" 125 | }; 126 | 127 | foreach (var node in doc.DocumentNode.Descendants()) 128 | { 129 | if (node.NodeType == HtmlNodeType.Element && !whitelist.Contains(node.Name)) 130 | { 131 | node.Attributes.RemoveAll(); 132 | } 133 | } 134 | } 135 | } 136 | 137 | // Custom converter for unordered lists (