├── .github └── workflows │ └── lint.yml ├── .gitignore ├── .golangci.yml ├── LICENSE ├── README.md ├── cmd └── markitdown │ └── main.go ├── converter.go ├── converters ├── converter.go ├── html │ └── converter.go ├── optons.go ├── pdf │ └── converter.go └── prompt.md ├── examples └── data │ ├── deepseek-r1.pdf │ ├── il-1040.pdf │ └── quantum_computing.pdf ├── file_type_utils.go ├── go.mod ├── go.sum └── markitdown.go /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | push: 4 | branches: [ main, master ] 5 | pull_request: 6 | branches: [ main, master ] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | golangci: 13 | name: lint 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - uses: actions/setup-go@v5 18 | with: 19 | go-version: '1.23' 20 | cache: true 21 | - name: golangci-lint 22 | uses: golangci/golangci-lint-action@v6 23 | with: 24 | version: latest 25 | args: --timeout=5m 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # env file 25 | .env 26 | 27 | # GoReleaser 28 | dist/ 29 | -------------------------------------------------------------------------------- /.golangci.yml: 
-------------------------------------------------------------------------------- 1 | linters: 2 | enable: 3 | - gofmt 4 | - govet 5 | - errcheck 6 | - staticcheck 7 | - gosimple 8 | - unused 9 | - gocyclo 10 | - gosec 11 | - bodyclose 12 | - unconvert 13 | - misspell 14 | disable: 15 | - typecheck 16 | 17 | run: 18 | timeout: 3m 19 | tests: true 20 | concurrency: 4 21 | 22 | issues: 23 | exclude-rules: 24 | - path: _test\.go 25 | linters: 26 | - gocyclo 27 | - errcheck 28 | - dupl 29 | 30 | linters-settings: 31 | gocyclo: 32 | min-complexity: 15 33 | misspell: 34 | locale: US 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 recally-io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-markitdown 2 | A CLI tool and library written in Go for converting documents to Markdown format. 3 | 4 | ## Features 5 | - Convert PDF, HTML documents to Markdown 6 | - Support for both local files and URLs 7 | - Preserve semantic structure during conversion 8 | - Easy to use CLI interface 9 | - Flexible library integration 10 | 11 | ## Installation 12 | 13 | > **Note**: This tool requires CGO to be enabled. Make sure to set `CGO_ENABLED=1` when installing or building the tool. 14 | 15 | ```bash 16 | CGO_ENABLED=1 go install github.com/recally-io/go-markitdown/cmd/markitdown@latest 17 | ``` 18 | 19 | ## Usage 20 | 21 | ### CLI Usage 22 | 23 | The `markitdown` command line tool provides a simple interface for document conversion: 24 | 25 | #### Required Environment Variables 26 | 27 | Before using the CLI tool, make sure to set the following environment variables: 28 | 29 | ```bash 30 | export OPENAI_BASE_URL="https://api.openai.com/v1" # Or your custom OpenAI API endpoint 31 | export OPENAI_API_KEY="your-api-key-here" # Your OpenAI API key 32 | ``` 33 | 34 | These environment variables are required for PDF text extraction using OpenAI's models. 
35 | 36 | ```bash 37 | # Convert a local file 38 | markitdown document.pdf -o output.md 39 | 40 | # Convert from URL 41 | markitdown https://example.com/document.html -o output.md 42 | 43 | # Specify a different LLM model 44 | markitdown document.pdf -m gpt-4 -o output.md 45 | ``` 46 | 47 | Available flags: 48 | - `-o, --output`: Output file path (if not specified, outputs to stdout) 49 | - `-m, --model`: LLM model to use (default: gpt-4o-mini) 50 | 51 | ### Library Usage 52 | 53 | To use go-markitdown as a library in your Go project: 54 | 55 | ```go 56 | package main 57 | 58 | import ( 59 | "context" 60 | "github.com/recally-io/go-markitdown" 61 | "github.com/recally-io/go-markitdown/converters" 62 | ) 63 | 64 | func main() { 65 | // Create a new MarkitDown instance with options 66 | md := markitdown.NewMarkitDown( 67 | converters.WithLLMModel("gpt-4o-mini"), 68 | // Add other options as needed 69 | ) 70 | 71 | // Convert a local file 72 | markdown, err := md.ConvertLocal(context.Background(), "document.pdf") 73 | if err != nil { 74 | // Handle error 75 | } 76 | 77 | // Convert from URL 78 | markdown, err = md.ConvertURL(context.Background(), "https://example.com/document.html") 79 | if err != nil { 80 | // Handle error 81 | } 82 | 83 | // Generic convert method (auto-detects source type) 84 | markdown, err = md.Convert(context.Background(), "document.pdf") 85 | if err != nil { 86 | // Handle error 87 | } 88 | } 89 | ``` 90 | 91 | ## Supported Formats 92 | 93 | Input formats: 94 | - PDF (.pdf) 95 | - HTML (.html, .htm) 96 | - URLs (http://, https://) 97 | 98 | Output format: 99 | - Markdown (.md) 100 | 101 | ## Development 102 | 103 | ### Linting 104 | This project uses GolangCI-lint for code quality checks. 
To run the linter: 105 | 106 | ```bash 107 | # Install golangci-lint 108 | go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest 109 | 110 | # Run linter 111 | golangci-lint run 112 | ``` 113 | 114 | ### Releasing 115 | We use GoReleaser to automate the release process. To create a new release: 116 | 117 | 1. Tag your release: 118 | ```bash 119 | git tag -a v0.1.0 -m "First release" 120 | git push origin v0.1.0 121 | ``` 122 | 123 | 2. Test the release locally: 124 | ```bash 125 | goreleaser check # Check if the config is valid 126 | goreleaser release --snapshot --clean # Test the release process 127 | ``` 128 | 129 | 3. To perform an actual release: 130 | ```bash 131 | goreleaser release --clean 132 | ``` 133 | 134 | ## License 135 | 136 | [MIT License](./LICENSE) 137 | -------------------------------------------------------------------------------- /cmd/markitdown/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log/slog" 7 | "os" 8 | "path/filepath" 9 | "strings" 10 | "time" 11 | 12 | "slices" 13 | 14 | "github.com/recally-io/go-markitdown" 15 | "github.com/recally-io/go-markitdown/converters" 16 | "github.com/sashabaranov/go-openai" 17 | "github.com/spf13/cobra" 18 | ) 19 | 20 | // Supported input formats and schemes 21 | const ( 22 | defaultModel = "gpt-4o-mini" 23 | ) 24 | 25 | var ( 26 | supportedExtensions = []string{".pdf", ".html", ".htm"} 27 | supportedSchemes = []string{"http://", "https://"} 28 | ) 29 | 30 | // Command line flags 31 | type convertFlags struct { 32 | outputPath string 33 | model string 34 | } 35 | 36 | func main() { 37 | // Initialize slog 38 | slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, nil))) 39 | 40 | if err := newRootCmd().Execute(); err != nil { 41 | slog.Error("command execution failed", "error", err) 42 | os.Exit(1) 43 | } 44 | } 45 | 46 | // newRootCmd creates the root command 47 | func 
newRootCmd() *cobra.Command { 48 | flags := &convertFlags{} 49 | cmd := &cobra.Command{ 50 | Use: "markitdown ", 51 | Short: "Convert documents to markdown", 52 | Long: `A powerful document conversion tool that preserves semantic structure`, 53 | Example: ` # Convert a local PDF file 54 | markitdown document.pdf -o output.md 55 | 56 | # Convert from a URL 57 | markitdown https://example.com/document.html -o output.md 58 | 59 | # Use a specific LLM model 60 | markitdown document.pdf -m gpt-4 -o output.md 61 | 62 | # Output to stdout (no -o flag) 63 | markitdown document.pdf`, 64 | Args: cobra.ExactArgs(1), 65 | RunE: flags.runConvert, 66 | } 67 | 68 | // Define flags 69 | cmd.Flags().StringVarP(&flags.outputPath, "output", "o", "", "Output file path") 70 | cmd.Flags().StringVarP(&flags.model, "model", "m", defaultModel, "LLM model to use") 71 | 72 | return cmd 73 | } 74 | 75 | // runConvert handles the conversion logic 76 | func (f *convertFlags) runConvert(cmd *cobra.Command, args []string) error { 77 | input := args[0] 78 | 79 | if err := validateInput(input); err != nil { 80 | return err 81 | } 82 | 83 | // Show progress 84 | slog.Info("Converting file to markdown", "input", input) 85 | startTime := time.Now() 86 | 87 | // Convert document 88 | text, err := f.convertDocument(input) 89 | if err != nil { 90 | return fmt.Errorf("conversion failed: %w", err) 91 | } 92 | 93 | // Handle output 94 | if err := f.writeOutput(text); err != nil { 95 | return err 96 | } 97 | 98 | slog.Info("Conversion completed", "duration", time.Since(startTime).Round(time.Millisecond)) 99 | return nil 100 | } 101 | 102 | // convertDocument performs the actual document conversion 103 | func (f *convertFlags) convertDocument(input string) (string, error) { 104 | ctx := context.Background() 105 | converter := initializeConverter(f.model) 106 | return converter.Convert(ctx, input) 107 | } 108 | 109 | // initializeConverter creates and configures the converter 110 | func 
initializeConverter(model string) *markitdown.MarkitDown { 111 | llmCfg := openai.DefaultConfig(os.Getenv("OPENAI_API_KEY")) 112 | llmCfg.BaseURL = os.Getenv("OPENAI_BASE_URL") 113 | llmClient := openai.NewClientWithConfig(llmCfg) 114 | 115 | return markitdown.NewMarkitDown( 116 | converters.WithLLMClient(llmClient), 117 | converters.WithLLMModel(model), 118 | ) 119 | } 120 | 121 | // validateInput checks if the input is valid 122 | func validateInput(input string) error { 123 | if isURL(input) { 124 | return validateURL(input) 125 | } 126 | return validateFile(input) 127 | } 128 | 129 | // isURL checks if the input is a URL 130 | func isURL(input string) bool { 131 | for _, scheme := range supportedSchemes { 132 | if strings.HasPrefix(input, scheme) { 133 | return true 134 | } 135 | } 136 | return false 137 | } 138 | 139 | // validateURL checks if the URL scheme is supported 140 | func validateURL(url string) error { 141 | if !isURL(url) { 142 | return fmt.Errorf("invalid URL scheme: must be http or https") 143 | } 144 | return nil 145 | } 146 | 147 | // validateFile checks if the file exists and has a supported extension 148 | func validateFile(path string) error { 149 | if _, err := os.Stat(path); err != nil { 150 | return fmt.Errorf("input file not found: %w", err) 151 | } 152 | 153 | ext := strings.ToLower(filepath.Ext(path)) 154 | if slices.Contains(supportedExtensions, ext) { 155 | return nil 156 | } 157 | return fmt.Errorf("unsupported file type: %s (supported: %v)", ext, supportedExtensions) 158 | } 159 | 160 | // writeOutput writes the converted text to file or stdout 161 | func (f *convertFlags) writeOutput(text string) error { 162 | if f.outputPath == "" { 163 | fmt.Println(text) 164 | return nil 165 | } 166 | 167 | if err := os.WriteFile(f.outputPath, []byte(text), 0600); err != nil { 168 | return fmt.Errorf("failed to write output: %w", err) 169 | } 170 | return nil 171 | } 172 | 
-------------------------------------------------------------------------------- /converter.go: -------------------------------------------------------------------------------- 1 | package markitdown 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/recally-io/go-markitdown/converters" 7 | "github.com/recally-io/go-markitdown/converters/html" 8 | "github.com/recally-io/go-markitdown/converters/pdf" 9 | ) 10 | 11 | func NewConverter(extension string, opts ...converters.Option) (converters.Converter, error) { 12 | switch extension { 13 | case "html": 14 | return html.NewConverter(opts...), nil 15 | case "pdf": 16 | return pdf.NewConverter(opts...), nil 17 | } 18 | return nil, fmt.Errorf("unsupported converter type: %s", extension) 19 | } 20 | -------------------------------------------------------------------------------- /converters/converter.go: -------------------------------------------------------------------------------- 1 | package converters 2 | 3 | import ( 4 | "context" 5 | "io" 6 | ) 7 | 8 | type Converter interface { 9 | Convert(ctx context.Context, reader io.ReadCloser) (string, error) 10 | } 11 | -------------------------------------------------------------------------------- /converters/html/converter.go: -------------------------------------------------------------------------------- 1 | package html 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "log/slog" 8 | 9 | md "github.com/JohannesKaufmann/html-to-markdown" 10 | "github.com/go-shiori/go-readability" 11 | "github.com/recally-io/go-markitdown/converters" 12 | ) 13 | 14 | type Converter struct { 15 | options *converters.Options 16 | } 17 | 18 | func NewConverter(opts ...converters.Option) *Converter { 19 | return &Converter{ 20 | options: converters.NewOptions(opts...), 21 | } 22 | } 23 | 24 | func (c *Converter) Convert(ctx context.Context, reader io.ReadCloser) (string, error) { 25 | defer reader.Close() 26 | 27 | var htmlContent []byte 28 | var err error 29 | 30 | if 
c.options.HtmlReadability { 31 | slog.Info("parsing HTML with readability mode enabled", "host", c.options.HtmlHost) 32 | article, err := readability.FromReader(reader, nil) 33 | if err != nil { 34 | slog.Error("failed to parse HTML content", "error", err) 35 | return "", fmt.Errorf("failed to parse HTML content: %w", err) 36 | } 37 | htmlContent = []byte(article.Content) 38 | } else { 39 | slog.Info("parsing raw HTML content", "host", c.options.HtmlHost) 40 | htmlContent, err = io.ReadAll(reader) 41 | if err != nil { 42 | slog.Error("failed to read HTML content", "error", err) 43 | return "", fmt.Errorf("failed to read HTML content: %w", err) 44 | } 45 | } 46 | 47 | converter := md.NewConverter(c.options.HtmlHost, true, &md.Options{}) 48 | markdown, err := converter.ConvertString(string(htmlContent)) 49 | if err != nil { 50 | slog.Error("markdown conversion failed", "error", err) 51 | return "", fmt.Errorf("webreader markdown converter error: %w", err) 52 | } 53 | 54 | slog.Info("completed HTML conversion", "content_length", len(markdown)) 55 | return markdown, nil 56 | } 57 | -------------------------------------------------------------------------------- /converters/optons.go: -------------------------------------------------------------------------------- 1 | package converters 2 | 3 | import ( 4 | _ "embed" 5 | 6 | "github.com/sashabaranov/go-openai" 7 | ) 8 | 9 | //go:embed prompt.md 10 | var systemMessage string 11 | 12 | // Options represents the configuration options for the converter. 13 | type Options struct { 14 | // LLMClient is the OpenAI client used for language model interactions. 15 | LLMClient *openai.Client 16 | 17 | // LLMPrompt is the system message or prompt used for the language model. 18 | LLMPrompt string 19 | 20 | // LLMModel specifies the language model to be used, default is "gpt-4o-mini". 21 | LLMModel string 22 | 23 | // NumWorkers determines the number of concurrent workers for processing. 
24 | NumWorkers int 25 | 26 | // ImageDPI specifies the DPI for image extraction. 27 | ImageDPI float64 28 | 29 | // HtmlHost specifies the host for the HTML converter. 30 | HtmlHost string 31 | 32 | // HtmlReadability enables HTML readability mode to clean up the html before conversion. 33 | HtmlReadability bool 34 | } 35 | 36 | type Option func(*Options) 37 | 38 | func NewOptions(opts ...Option) *Options { 39 | options := &Options{ 40 | LLMPrompt: systemMessage, 41 | LLMModel: "gpt-4o-mini", 42 | NumWorkers: 10, 43 | ImageDPI: 300, 44 | 45 | HtmlReadability: true, 46 | } 47 | for _, opt := range opts { 48 | opt(options) 49 | } 50 | return options 51 | } 52 | 53 | func WithLLMClient(client *openai.Client) Option { 54 | return func(o *Options) { 55 | o.LLMClient = client 56 | } 57 | } 58 | 59 | func WithLLMPrompt(prompt string) Option { 60 | return func(o *Options) { 61 | o.LLMPrompt = prompt 62 | } 63 | } 64 | 65 | func WithLLMModel(model string) Option { 66 | return func(o *Options) { 67 | o.LLMModel = model 68 | } 69 | } 70 | 71 | func WithNumWorkers(num int) Option { 72 | return func(o *Options) { 73 | o.NumWorkers = num 74 | } 75 | } 76 | 77 | func WithImageDPI(dpi float64) Option { 78 | return func(o *Options) { 79 | o.ImageDPI = dpi 80 | } 81 | } 82 | 83 | func WithHtmlHost(host string) Option { 84 | return func(o *Options) { 85 | o.HtmlHost = host 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /converters/pdf/converter.go: -------------------------------------------------------------------------------- 1 | package pdf 2 | 3 | import ( 4 | "context" 5 | _ "embed" 6 | "encoding/base64" 7 | "fmt" 8 | "io" 9 | "log/slog" 10 | "strings" 11 | "sync" 12 | 13 | "github.com/gen2brain/go-fitz" 14 | "github.com/recally-io/go-markitdown/converters" 15 | "github.com/sashabaranov/go-openai" 16 | "golang.org/x/sync/errgroup" 17 | ) 18 | 19 | type Converter struct { 20 | options *converters.Options 21 | } 22 | 23 | func 
NewConverter(opts ...converters.Option) *Converter { 24 | return &Converter{ 25 | options: converters.NewOptions(opts...), 26 | } 27 | } 28 | 29 | func (c *Converter) Convert(ctx context.Context, reader io.ReadCloser) (string, error) { 30 | defer reader.Close() 31 | doc, err := fitz.NewFromReader(reader) 32 | if err != nil { 33 | return "", fmt.Errorf("failed to open PDF: %w", err) 34 | } 35 | defer doc.Close() 36 | 37 | var texts []string 38 | if c.options.LLMClient != nil { 39 | slog.Info("converting PDF with LLM assistance") 40 | texts, err = c.ConvertPagesWithLLM(ctx, doc) 41 | } else { 42 | slog.Info("converting PDF without LLM") 43 | texts, err = c.ConvertPages(ctx, doc) 44 | } 45 | if err != nil { 46 | slog.Error("failed to convert pages", "error", err) 47 | return "", fmt.Errorf("failed to convert pages: %w", err) 48 | } 49 | 50 | return strings.Join(texts, "\n\n"), nil 51 | } 52 | 53 | func (c *Converter) ConvertPages(ctx context.Context, doc *fitz.Document) ([]string, error) { 54 | totalPages := doc.NumPage() 55 | slog.Info("starting PDF conversion", "pages", totalPages) 56 | 57 | results := make([]string, totalPages) 58 | for i := range results { 59 | slog.Debug("extracting text from page", "page", i+1, "total", totalPages) 60 | pageText, err := doc.Text(i) 61 | if err != nil { 62 | slog.Error("failed to extract text", "page", i+1, "error", err) 63 | return nil, fmt.Errorf("failed to extract text from page %d: %w", i+1, err) 64 | } 65 | results[i] = pageText 66 | } 67 | 68 | slog.Info("completed PDF conversion", "pages", totalPages) 69 | return results, nil 70 | } 71 | 72 | func (c *Converter) ConvertPagesWithLLM(ctx context.Context, doc *fitz.Document) ([]string, error) { 73 | totalPages := doc.NumPage() 74 | slog.Info("starting PDF conversion with LLM", "pages", totalPages, "workers", c.options.NumWorkers) 75 | 76 | g, ctx := errgroup.WithContext(ctx) 77 | results := make([]string, totalPages) 78 | var mu sync.Mutex 79 | 80 | // Process pages in 
parallel with bounded concurrency 81 | sem := make(chan struct{}, c.options.NumWorkers) 82 | for i := range results { 83 | i := i // Create new variable for goroutine 84 | 85 | g.Go(func() error { 86 | sem <- struct{}{} // Acquire semaphore 87 | defer func() { <-sem }() // Release semaphore 88 | 89 | slog.Debug("processing page with LLM", "page", i+1, "total", totalPages) 90 | markdown, err := c.processPage(ctx, doc, i) 91 | if err != nil { 92 | slog.Error("failed to process page", "page", i+1, "error", err) 93 | return fmt.Errorf("failed to process page %d: %w", i, err) 94 | } 95 | 96 | mu.Lock() 97 | results[i] = markdown 98 | mu.Unlock() 99 | return nil 100 | }) 101 | } 102 | 103 | if err := g.Wait(); err != nil { 104 | slog.Error("page processing failed", "error", err) 105 | return nil, fmt.Errorf("page processing failed: %w", err) 106 | } 107 | 108 | slog.Info("completed PDF conversion with LLM", "pages", totalPages) 109 | return results, nil 110 | } 111 | 112 | // processPage handles the conversion of a single PDF page 113 | func (c *Converter) processPage(ctx context.Context, doc *fitz.Document, pageNum int) (string, error) { 114 | // Extract text 115 | slog.Debug("extracting text from page", "page", pageNum) 116 | pageText, err := doc.Text(pageNum) 117 | if err != nil { 118 | slog.Error("failed to extract text", "page", pageNum, "error", err) 119 | return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err) 120 | } 121 | 122 | // Extract image 123 | slog.Debug("extracting image from page", "page", pageNum) 124 | img, err := doc.ImagePNG(pageNum, c.options.ImageDPI) 125 | if err != nil { 126 | slog.Error("failed to extract image", "page", pageNum, "error", err) 127 | return "", fmt.Errorf("failed to extract image from page %d: %w", pageNum, err) 128 | } 129 | 130 | // Create chat completion request 131 | req := openai.ChatCompletionRequest{ 132 | Model: c.options.LLMModel, 133 | Messages: []openai.ChatCompletionMessage{ 134 | { 135 | Role: 
openai.ChatMessageRoleSystem, 136 | Content: c.options.LLMPrompt, 137 | }, 138 | { 139 | Role: openai.ChatMessageRoleUser, 140 | MultiContent: []openai.ChatMessagePart{ 141 | { 142 | Type: openai.ChatMessagePartTypeText, 143 | Text: fmt.Sprintf("%s", pageText), 144 | }, 145 | { 146 | Type: openai.ChatMessagePartTypeImageURL, 147 | ImageURL: &openai.ChatMessageImageURL{ 148 | URL: fmt.Sprintf("data:image/png;base64,%s", 149 | base64.StdEncoding.EncodeToString(img)), 150 | }, 151 | }, 152 | }, 153 | }, 154 | }, 155 | } 156 | 157 | // Get response from API 158 | slog.Debug("calling LLM API", "page", pageNum) 159 | resp, err := c.options.LLMClient.CreateChatCompletion(ctx, req) 160 | if err != nil { 161 | slog.Error("API call failed", "page", pageNum, "error", err) 162 | return "", fmt.Errorf("API call failed for page %d: %w", pageNum, err) 163 | } 164 | 165 | content := resp.Choices[0].Message.Content 166 | content = strings.TrimPrefix(content, "```markdown\n") 167 | content = strings.TrimSuffix(content, "\n```") 168 | 169 | return content, nil 170 | } 171 | -------------------------------------------------------------------------------- /converters/prompt.md: -------------------------------------------------------------------------------- 1 | Act as a meticulous PDF-to-Markdown converter. I will provide you with: 2 | 1. An image of a PDF page (to infer visual structure, tables, images, and formatting). 3 | 2. Extracted text from the PDF (optional; may be incomplete or misformatted). 4 | 5 | Your task is to reconstruct the content into clean, organized Markdown that **exactly mirrors the original PDF's structure**. Prioritize the following: 6 | 7 | ### Requirements: 8 | 1. **Layout Preservation**: 9 | - Maintain headers, sections, bullet points, numbered lists, indentation, and fonts (use `**bold**`, `*italic*`, etc.). 10 | - Replicate spacing, alignment, and text hierarchy (e.g., `## H2` after `# H1`). 11 | 12 | 2. 
**Tables**: 13 | - Convert to Markdown tables with precise alignment. Use `---` and pipes (`|`). 14 | - If the PDF table has complex formatting (merged cells, multi-line text), approximate it creatively using colspan/rowspan syntax or notes. 15 | 16 | 3. **Images/Figures**: 17 | - Identify images in the PDF page and embed them as Markdown links with alt text (e.g., `![Alt Text](image.jpg)`). 18 | - If image filenames are unavailable, label them as `Fig. 1`, `Fig. 2`, etc., and note their position (e.g., ``). 19 | 20 | 4. **Code/Formulas**: 21 | - Wrap code snippets in ` ``` ` blocks with language specifiers (e.g., ` ```python `). 22 | - Render mathematical equations using LaTeX (`$$E=mc^2$$`). 23 | 24 | 5. **Handling Ambiguity**: 25 | - If the extracted text conflicts with the PDF image, **trust the visual structure of the image** to resolve formatting. 26 | - If text is missing, infer content from the image (e.g., handwritten notes, diagrams) and annotate with ``. 27 | 28 | 6. **Output**: 29 | - Return **only the Markdown**, without extra commentary, Ensure it’s ready to render. 
30 | -------------------------------------------------------------------------------- /examples/data/deepseek-r1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/recally-io/go-markitdown/0ebd534b3e1c28d408a5e29909d6731f0be14cf3/examples/data/deepseek-r1.pdf -------------------------------------------------------------------------------- /examples/data/il-1040.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/recally-io/go-markitdown/0ebd534b3e1c28d408a5e29909d6731f0be14cf3/examples/data/il-1040.pdf -------------------------------------------------------------------------------- /examples/data/quantum_computing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/recally-io/go-markitdown/0ebd534b3e1c28d408a5e29909d6731f0be14cf3/examples/data/quantum_computing.pdf -------------------------------------------------------------------------------- /file_type_utils.go: -------------------------------------------------------------------------------- 1 | package markitdown 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "path/filepath" 7 | "strings" 8 | ) 9 | 10 | // getFileType determines the file type from an HTTP response that can be used for conversion. 11 | // It returns a simple file type string (e.g., "html", "pdf") that can be mapped to converters. 12 | // Returns an error if the file type is unsupported or cannot be determined. 
13 | func getFileType(resp *http.Response, url string) (string, error) { 14 | if resp == nil { 15 | return "", fmt.Errorf("http response is nil") 16 | } 17 | 18 | // First try Content-Type header 19 | contentType := resp.Header.Get("Content-Type") 20 | if contentType != "" { 21 | // Strip any charset or boundary information 22 | if idx := strings.Index(contentType, ";"); idx != -1 { 23 | contentType = contentType[:idx] 24 | } 25 | contentType = strings.TrimSpace(contentType) 26 | 27 | // Map MIME types to file types 28 | switch contentType { 29 | case "text/html", "application/xhtml+xml": 30 | return "html", nil 31 | case "application/pdf": 32 | return "pdf", nil 33 | case "application/epub+zip": 34 | return "epub", nil 35 | case "application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document": 36 | return "doc", nil 37 | case "text/markdown": 38 | return "md", nil 39 | case "text/plain": 40 | return "txt", nil 41 | } 42 | } 43 | 44 | // Fallback to URL extension 45 | ext, err := getFileTypeFromPath(url) 46 | if err != nil { 47 | return "", fmt.Errorf("failed to determine file type from URL: %w", err) 48 | } 49 | return ext, nil 50 | } 51 | 52 | // getFileTypeFromPath determines the file type from a file path. 
// getFileTypeFromPath maps the extension of filePath to the short file type
// name used to select a converter ("html", "pdf", "epub", "doc", "md", "txt").
// The extension is matched case-insensitively. An error naming the lowercased
// extension is returned when it is not one of the recognized ones.
func getFileTypeFromPath(filePath string) (string, error) {
	suffix := strings.ToLower(filepath.Ext(filePath))

	var kind string
	switch suffix {
	case ".pdf":
		kind = "pdf"
	case ".htm", ".html":
		kind = "html"
	case ".txt":
		kind = "txt"
	case ".markdown", ".md":
		kind = "md"
	case ".docx", ".doc":
		kind = "doc"
	case ".epub":
		kind = "epub"
	default:
		return "", fmt.Errorf("unsupported or unknown file type: %s", suffix)
	}
	return kind, nil
}
h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k= 2 | github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ= 3 | github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= 4 | github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= 5 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 6 | github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= 7 | github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= 8 | github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA= 9 | github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw= 10 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= 11 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 12 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 13 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 14 | github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+GvvE= 15 | github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= 16 | github.com/gen2brain/go-fitz v1.24.14 h1:09weRkjVtLYNGo7l0J7DyOwBExbwi8SJ9h8YPhw9WEo= 17 | github.com/gen2brain/go-fitz v1.24.14/go.mod h1:0KaZeQgASc20Yp5R/pFzyy7SmP01XcoHKNF842U2/S4= 18 | github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w= 19 | github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= 20 | github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612 h1:BYLNYdZaepitbZreRIa9xeCQZocWmy/wj4cGIH0qyw0= 21 | 
github.com/go-shiori/go-readability v0.0.0-20250217085726-9f5bf5ca7612/go.mod h1:wgqthQa8SAYs0yyljVeCOQlZ027VW5CmLsbi9jWC08c= 22 | github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= 23 | github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= 24 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 25 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 26 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 27 | github.com/jupiterrider/ffi v0.2.0 h1:tMM70PexgYNmV+WyaYhJgCvQAvtTCs3wXeILPutihnA= 28 | github.com/jupiterrider/ffi v0.2.0/go.mod h1:yqYqX5DdEccAsHeMn+6owkoI2llBLySVAF8dwCDZPVs= 29 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 30 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 31 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 32 | github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= 33 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 34 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 35 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 36 | github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= 37 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 38 | github.com/sashabaranov/go-openai v1.37.0 h1:hQQowgYm4OXJ1Z/wTrE+XZaO20BYsL0R3uRPSpfNZkY= 39 | github.com/sashabaranov/go-openai v1.37.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= 40 | github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg= 41 | 
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= 42 | github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= 43 | github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= 44 | github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= 45 | github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= 46 | github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= 47 | github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= 48 | github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= 49 | github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 50 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 51 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 52 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 53 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 54 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 55 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 56 | github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U= 57 | github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= 58 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 59 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 60 | golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= 61 | golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 62 | golang.org/x/crypto v0.22.0/go.mod 
h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= 63 | golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= 64 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= 65 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 66 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 67 | golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 68 | golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 69 | golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 70 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 71 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 72 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 73 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 74 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 75 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= 76 | golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= 77 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 78 | golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= 79 | golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= 80 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 81 | golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= 82 | golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= 83 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 84 | golang.org/x/sync 
v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 85 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 86 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= 87 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 88 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 89 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 90 | golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= 91 | golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 92 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 93 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 94 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 95 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 96 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 97 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 98 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 99 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 100 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 101 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 102 | golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 103 | golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 104 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 105 | golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod 
h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= 106 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 107 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 108 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 109 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 110 | golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= 111 | golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= 112 | golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= 113 | golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= 114 | golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= 115 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= 116 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 117 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 118 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 119 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 120 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 121 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 122 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 123 | golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 124 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 125 | golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= 126 | golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= 127 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 128 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 129 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 130 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 131 | golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= 132 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= 133 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 134 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 135 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 136 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 137 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 138 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 139 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 140 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 141 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 142 | -------------------------------------------------------------------------------- /markitdown.go: -------------------------------------------------------------------------------- 1 | package markitdown 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/http" 7 | "net/url" 8 | "os" 9 | "strings" 10 | 11 | "github.com/recally-io/go-markitdown/converters" 12 | ) 13 | 14 | type MarkitDown struct { 15 | options []converters.Option 16 | } 17 | 18 | func NewMarkitDown(opts ...converters.Option) *MarkitDown { 19 | return &MarkitDown{ 20 | options: opts, 21 | } 22 | } 23 | 24 | func 
(m *MarkitDown) Convert(ctx context.Context, source string) (string, error) { 25 | if strings.HasPrefix(source, "http://") || strings.HasPrefix(source, "https://") || strings.HasPrefix(source, "file://") { 26 | return m.ConvertURL(ctx, source) 27 | } 28 | return m.ConvertLocal(ctx, source) 29 | } 30 | 31 | func (m *MarkitDown) ConvertLocal(ctx context.Context, filePath string) (string, error) { 32 | fileType, err := getFileTypeFromPath(filePath) 33 | if err != nil { 34 | return "", fmt.Errorf("failed to determine file type: %w", err) 35 | } 36 | 37 | file, err := os.Open(filePath) 38 | if err != nil { 39 | return "", fmt.Errorf("failed to open file: %w", err) 40 | } 41 | defer file.Close() 42 | 43 | converter, err := NewConverter(fileType, m.options...) 44 | if err != nil { 45 | return "", fmt.Errorf("failed to create converter for type %s: %w", fileType, err) 46 | } 47 | 48 | markdown, err := converter.Convert(ctx, file) 49 | if err != nil { 50 | return "", fmt.Errorf("failed to convert %s to Markdown: %w", fileType, err) 51 | } 52 | 53 | return markdown, nil 54 | } 55 | 56 | func (m *MarkitDown) ConvertURL(ctx context.Context, uri string) (string, error) { 57 | 58 | u, err := url.Parse(uri) 59 | if err != nil { 60 | return "", fmt.Errorf("failed to parse URL: %w", err) 61 | } 62 | 63 | resp, err := http.Get(u.String()) 64 | if err != nil { 65 | return "", fmt.Errorf("failed to fetch URL: %w", err) 66 | } 67 | defer resp.Body.Close() 68 | 69 | fileType, err := getFileType(resp, u.String()) 70 | if err != nil { 71 | return "", fmt.Errorf("failed to determine file type: %w", err) 72 | } 73 | 74 | options := append(m.options, converters.WithHtmlHost(u.Host)) 75 | converter, err := NewConverter(fileType, options...) 
76 | if err != nil { 77 | return "", fmt.Errorf("failed to create converter for type %s: %w", fileType, err) 78 | } 79 | 80 | markdown, err := converter.Convert(ctx, resp.Body) 81 | if err != nil { 82 | return "", fmt.Errorf("failed to convert %s to Markdown: %w", fileType, err) 83 | } 84 | 85 | return markdown, nil 86 | } 87 | --------------------------------------------------------------------------------