├── mdctl.png
├── main.go
├── action.yml
├── go.mod
├── .gitignore
├── .github
    └── workflows
    │   ├── update-homebrew.yml
    │   ├── release.yml
    │   ├── pr-review.yml
    │   ├── idoc.yml
    │   └── docker-build.yml
├── .goreleaser.yaml
├── LICENSE
├── Dockerfile
├── cmd
    ├── download.go
    ├── root.go
    ├── llmstxt.go
    ├── translate.go
    ├── export.go
    ├── lint.go
    └── upload.go
├── internal
    ├── storage
    │   └── provider.go
    ├── llmstxt
    │   ├── formatter.go
    │   ├── fetcher.go
    │   ├── generator.go
    │   ├── extractor.go
    │   └── sitemap.go
    ├── exporter
    │   ├── sitereader
    │   │   ├── reader.go
    │   │   └── mkdocs.go
    │   ├── heading.go
    │   ├── exporter.go
    │   ├── merger.go
    │   └── pandoc.go
    ├── linter
    │   ├── config.go
    │   ├── fixer.go
    │   ├── linter.go
    │   ├── linter_test.go
    │   └── rules_test.go
    ├── cache
    │   └── cache.go
    ├── markdownfmt
    │   └── formatter.go
    ├── processor
    │   └── processor.go
    ├── config
    │   └── config.go
    └── translator
    │   └── translator.go
├── docs
    ├── DEVELOPMENT.md
    └── features
    │   ├── export.md
    │   └── upload.md
├── README.md
├── go.sum
└── Makefile


/mdctl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samzong/mdctl/HEAD/mdctl.png


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"github.com/samzong/mdctl/cmd"
 5 | )
 6 | 
 7 | func main() {
 8 | 	cmd.Execute()
 9 | }
10 | 


--------------------------------------------------------------------------------
/action.yml:
--------------------------------------------------------------------------------
 1 | name: "mdctl CLI"
 2 | description: "Run mdctl (Markdown docs toolkit) in GitHub Actions via Docker."
 3 | author: "samzong"
 4 | branding:
 5 |   icon: "book"
 6 |   color: "blue"
 7 | 
 8 | inputs:
 9 |   args:
10 |     description: "Arguments to pass to mdctl (e.g., \"export -f README.md -o out.docx\")."
11 |     required: false
12 |     default: "--help"
13 | 
14 | runs:
15 |   using: "docker"
16 |   image: "Dockerfile"
17 |   entrypoint: "/bin/sh"
18 |   args:
19 |     - -c
20 |     - mdctl ${{ inputs.args }}
21 | 
22 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/samzong/mdctl
 2 | 
 3 | go 1.23.4
 4 | 
 5 | require (
 6 | 	github.com/PuerkitoBio/goquery v1.9.1
 7 | 	github.com/aws/aws-sdk-go v1.55.6
 8 | 	github.com/gobwas/glob v0.2.3
 9 | 	github.com/spf13/cobra v1.8.1
10 | 	golang.org/x/text v0.23.0
11 | 	gopkg.in/yaml.v3 v3.0.1
12 | )
13 | 
14 | require (
15 | 	github.com/andybalholm/cascadia v1.3.2 // indirect
16 | 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
17 | 	github.com/jmespath/go-jmespath v0.4.0 // indirect
18 | 	github.com/spf13/pflag v1.0.5 // indirect
19 | 	golang.org/x/net v0.33.0 // indirect
20 | )
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | mdctl
 8 | bin/
 9 | 
10 | # Test binary, built with `go test -c`
11 | *.test
12 | 
13 | # Output of the go coverage tool, specifically when used with LiteIDE
14 | *.out
15 | 
16 | # Dependency directories (remove the comment below to include it)
17 | vendor/
18 | 
19 | # Go workspace file
20 | go.work
21 | 
22 | # IDE specific files
23 | .idea/
24 | .vscode/
25 | *.swp
26 | *.swo
27 | 
28 | # OS generated files
29 | .DS_Store
30 | .DS_Store?
31 | ._*
32 | .Spotlight-V100
33 | .Trashes
34 | ehthumbs.db
35 | Thumbs.db
36 | 
37 | # Project specific
38 | images/ 
39 | dist/
40 | *.docx
41 | *.pdf


--------------------------------------------------------------------------------
/.github/workflows/update-homebrew.yml:
--------------------------------------------------------------------------------
 1 | name: Update Homebrew Tap
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       tag:
 7 |         description: 'Select the tag to update Homebrew'
 8 |         required: true
 9 |         type: string
10 |   repository_dispatch:
11 |     types: [trigger-homebrew-update]
12 | 
13 | jobs:
14 |   update-homebrew:
15 |     runs-on: macos-latest
16 |     steps:
17 |     - name: Set version
18 |       run: |
19 |         if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
20 |           echo "VERSION=${{ inputs.tag }}" >> $GITHUB_ENV
21 |         else
22 |           echo "VERSION=${{ github.event.client_payload.version }}" >> $GITHUB_ENV
23 |         fi
24 | 
25 |     - name: Checkout repository
26 |       uses: actions/checkout@v4
27 |       with:
28 |         fetch-depth: 0
29 | 
30 |     - name: Update Homebrew Formula
31 |       env:
32 |         GH_PAT: ${{ secrets.GH_PAT }}
33 |       run: make update-homebrew


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - 'v*'
 7 | 
 8 | permissions:
 9 |   contents: write
10 |   packages: write
11 | 
12 | jobs:
13 |   goreleaser:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - name: Checkout
17 |         uses: actions/checkout@v4
18 |         with:
19 |           fetch-depth: 0
20 | 
21 |       - name: Set up Go
22 |         uses: actions/setup-go@v4
23 |         with:
24 |           go-version: '>=1.21.0'
25 |           cache: true
26 | 
27 |       - name: Run GoReleaser
28 |         uses: goreleaser/goreleaser-action@v5
29 |         with:
30 |           distribution: goreleaser
31 |           version: latest
32 |           args: release --clean
33 |         env:
34 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 
35 |       
36 |       - name: Trigger Homebrew Update
37 |         if: success()
38 |         uses: peter-evans/repository-dispatch@v2
39 |         with:
40 |           token: ${{ secrets.GH_PAT }}
41 |           event-type: trigger-homebrew-update
42 |           client-payload: '{"version": "${{ env.VERSION }}"}'


--------------------------------------------------------------------------------
/.github/workflows/pr-review.yml:
--------------------------------------------------------------------------------
 1 | name: PR Review
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     types: [opened, synchronize, reopened]
 6 |     paths-ignore:
 7 |       - '**.md'
 8 |       - 'docs/**'
 9 |       - '.gitignore'
10 | 
11 | jobs:
12 |   review:
13 |     name: Build & Test
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - name: Checkout code
17 |         uses: actions/checkout@v4
18 |         with:
19 |           fetch-depth: 0
20 | 
21 |       - name: Set up Go
22 |         uses: actions/setup-go@v5
23 |         with:
24 |           go-version: '1.21'
25 |           cache: true
26 | 
27 |       - name: Install dependencies
28 |         run: make deps
29 | 
30 |       - name: Format check
31 |         run: |
32 |           make fmt
33 |           git diff --exit-code || (echo "Code is not formatted. Please run 'make fmt'" && exit 1)
34 | 
35 |       - name: Run tests
36 |         run: make test
37 | 
38 |       - name: Build
39 |         run: make build
40 | 
41 |       - name: Upload artifact
42 |         uses: actions/upload-artifact@v4
43 |         with:
44 |           name: mdctl
45 |           path: bin/mdctl 


--------------------------------------------------------------------------------
/.goreleaser.yaml:
--------------------------------------------------------------------------------
 1 | before:
 2 |   hooks:
 3 |     - go mod tidy
 4 | 
 5 | builds:
 6 |   - env:
 7 |       - CGO_ENABLED=0
 8 |     goos:
 9 |       - linux
10 |       - windows
11 |       - darwin
12 |     goarch:
13 |       - amd64
14 |       - arm64
15 |     ignore:
16 |       - goos: windows
17 |         goarch: arm64
18 |     ldflags:
19 |       - -s -w -X github.com/samzong/mdctl/cmd.Version={{.Version}} -X github.com/samzong/mdctl/cmd.BuildTime={{.Date}}
20 |     binary: mdctl
21 | 
22 | archives:
23 |   - format: tar.gz
24 |     name_template: >-
25 |       {{ .ProjectName }}_
26 |       {{- title .Os }}_
27 |       {{- if eq .Arch "amd64" }}x86_64
28 |       {{- else if eq .Arch "386" }}i386
29 |       {{- else }}{{ .Arch }}{{ end }}
30 |       {{- if .Arm }}v{{ .Arm }}{{ end }}
31 |     format_overrides:
32 |       - goos: windows
33 |         format: zip
34 | 
35 | changelog:
36 |   sort: asc
37 |   filters:
38 |     exclude:
39 |       - '^docs:'
40 |       - '^test:'
41 |       - '^ci:'
42 |       - '^chore:'
43 | 
44 | checksum:
45 |   name_template: 'checksums.txt'
46 | 
47 | snapshot:
48 |   name_template: "{{ incpatch .Version }}-next" 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 samzong
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE. 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM golang:1.23-alpine AS builder
 2 | 
 3 | WORKDIR /app
 4 | 
 5 | # Copy go mod and sum files
 6 | COPY go.mod go.sum ./
 7 | 
 8 | # Download dependencies
 9 | RUN go mod download
10 | 
11 | # Copy source code
12 | COPY . .
13 | 
14 | # Install git for version information
15 | RUN apk add --no-cache git
16 | 
17 | # Set build arguments with defaults
18 | ARG VERSION=dev
19 | ARG BUILD_TIME
20 | 
21 | # Set default build time if not provided
22 | RUN if [ -z "$BUILD_TIME" ]; then BUILD_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ"); fi && \
23 |     echo "Building version: $VERSION, build time: $BUILD_TIME" && \
24 |     CGO_ENABLED=0 go build -trimpath -ldflags "-s -w -X github.com/samzong/mdctl/cmd.Version=${VERSION} -X github.com/samzong/mdctl/cmd.BuildTime=${BUILD_TIME}" -o /app/bin/mdctl
25 | 
26 | # Use a minimal alpine image for the final stage
27 | FROM alpine:3.19
28 | 
29 | # Install ca-certificates for HTTPS requests
30 | RUN apk --no-cache add ca-certificates
31 | 
32 | WORKDIR /root/
33 | 
34 | # Copy the binary from the builder stage
35 | COPY --from=builder /app/bin/mdctl /usr/local/bin/mdctl
36 | 
37 | # Create config directory
38 | RUN mkdir -p /root/.config/mdctl
39 | 
40 | # Set the entrypoint
41 | ENTRYPOINT ["mdctl"]
42 | 
43 | # Default command
44 | CMD ["--help"]
45 | 


--------------------------------------------------------------------------------
/cmd/download.go:
--------------------------------------------------------------------------------
 1 | package cmd
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/samzong/mdctl/internal/processor"
 7 | 
 8 | 	"github.com/spf13/cobra"
 9 | )
10 | 
11 | var (
12 | 	sourceFile     string
13 | 	sourceDir      string
14 | 	imageOutputDir string
15 | 
16 | 	downloadCmd = &cobra.Command{
17 | 		Use:   "download",
18 | 		Short: "Download remote images in markdown files",
19 | 		Long: `Download remote images in markdown files to local storage and update references.
20 | Examples:
21 |   mdctl download -f post.md
22 |   mdctl download -d content/posts
23 |   mdctl download -f post.md -o assets/images`,
24 | 		RunE: func(cmd *cobra.Command, args []string) error {
25 | 			if sourceFile == "" && sourceDir == "" {
26 | 				return fmt.Errorf("either source file (-f) or source directory (-d) must be specified")
27 | 			}
28 | 			if sourceFile != "" && sourceDir != "" {
29 | 				return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)")
30 | 			}
31 | 
32 | 			p := processor.New(sourceFile, sourceDir, imageOutputDir)
33 | 			return p.Process()
34 | 		},
35 | 	}
36 | )
37 | 
38 | func init() {
39 | 	downloadCmd.Flags().StringVarP(&sourceFile, "file", "f", "", "Source markdown file to process")
40 | 	downloadCmd.Flags().StringVarP(&sourceDir, "dir", "d", "", "Source directory containing markdown files to process")
41 | 	downloadCmd.Flags().StringVarP(&imageOutputDir, "output", "o", "", "Output directory for downloaded images (optional)")
42 | }
43 | 


--------------------------------------------------------------------------------
/cmd/root.go:
--------------------------------------------------------------------------------
 1 | package cmd
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 
 7 | 	"github.com/spf13/cobra"
 8 | )
 9 | 
10 | var (
11 | 	Version     = "dev"
12 | 	BuildTime   = "unknown"
13 | 	verbose     bool
14 | 	veryVerbose bool
15 | 
16 | 	rootCmd = &cobra.Command{
17 | 		Use:   "mdctl",
18 | 		Short: "A CLI tool for markdown file operations",
19 | 		Long: `mdctl is a CLI tool that helps you manage and process markdown files.
20 | Currently supports downloading remote images and more features to come.`,
21 | 		Version: fmt.Sprintf("%s (built at %s)", Version, BuildTime),
22 | 	}
23 | )
24 | 
25 | func Execute() {
26 | 	if err := rootCmd.Execute(); err != nil {
27 | 		fmt.Println(err)
28 | 		os.Exit(1)
29 | 	}
30 | }
31 | 
32 | func init() {
33 | 	// Add commands first
34 | 	rootCmd.AddCommand(translateCmd)
35 | 	rootCmd.AddCommand(downloadCmd)
36 | 	rootCmd.AddCommand(configCmd)
37 | 	rootCmd.AddCommand(uploadCmd)
38 | 	rootCmd.AddCommand(exportCmd)
39 | 	rootCmd.AddCommand(llmstxtCmd)
40 | 	rootCmd.AddCommand(lintCmd)
41 | 
42 | 	// Add global flags
43 | 	rootCmd.PersistentFlags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output")
44 | 	rootCmd.PersistentFlags().BoolVar(&veryVerbose, "vv", false, "Enable very verbose output with detailed information")
45 | 
46 | 	// Then add groups and set group IDs
47 | 	rootCmd.AddGroup(&cobra.Group{
48 | 		ID:    "core",
49 | 		Title: "Core Commands:",
50 | 	})
51 | 	rootCmd.AddGroup(&cobra.Group{
52 | 		ID:    "config",
53 | 		Title: "Configuration Commands:",
54 | 	})
55 | 
56 | 	// Set group for each command
57 | 	translateCmd.GroupID = "core"
58 | 	downloadCmd.GroupID = "core"
59 | 	uploadCmd.GroupID = "core"
60 | 	exportCmd.GroupID = "core"
61 | 	llmstxtCmd.GroupID = "core"
62 | 	lintCmd.GroupID = "core"
63 | 	configCmd.GroupID = "config"
64 | }
65 | 


--------------------------------------------------------------------------------
/internal/storage/provider.go:
--------------------------------------------------------------------------------
 1 | package storage
 2 | 
 3 | import (
 4 | 	"github.com/samzong/mdctl/internal/config"
 5 | )
 6 | 
 7 | // Provider defines the interface for storage providers
 8 | type Provider interface {
 9 | 	// Upload uploads a file to cloud storage
10 | 	Upload(localPath, remotePath string, metadata map[string]string) (string, error)
11 | 
12 | 	// Configure sets up the provider with the given configuration
13 | 	Configure(config config.CloudConfig) error
14 | 
15 | 	// GetPublicURL returns the public URL for a remote path
16 | 	GetPublicURL(remotePath string) string
17 | 
18 | 	// ObjectExists checks if an object exists in the storage
19 | 	ObjectExists(remotePath string) (bool, error)
20 | 
21 | 	// CompareHash compares a local hash with a remote object's hash
22 | 	CompareHash(remotePath, localHash string) (bool, error)
23 | 
24 | 	// SetObjectMetadata sets metadata for an object
25 | 	SetObjectMetadata(remotePath string, metadata map[string]string) error
26 | 
27 | 	// GetObjectMetadata retrieves metadata for an object
28 | 	GetObjectMetadata(remotePath string) (map[string]string, error)
29 | }
30 | 
31 | // ProviderFactory is a function that creates a new storage provider
32 | type ProviderFactory func() Provider
33 | 
34 | var providers = make(map[string]ProviderFactory)
35 | 
36 | // RegisterProvider registers a storage provider factory
37 | func RegisterProvider(name string, factory ProviderFactory) {
38 | 	providers[name] = factory
39 | }
40 | 
41 | // GetProvider returns a storage provider by name
42 | func GetProvider(name string) (Provider, bool) {
43 | 	factory, exists := providers[name]
44 | 	if !exists {
45 | 		return nil, false
46 | 	}
47 | 	return factory(), true
48 | }
49 | 
50 | // ListProviders returns a list of available provider names
51 | func ListProviders() []string {
52 | 	var names []string
53 | 	for name := range providers {
54 | 		names = append(names, name)
55 | 	}
56 | 	return names
57 | }
58 | 


--------------------------------------------------------------------------------
/internal/llmstxt/formatter.go:
--------------------------------------------------------------------------------
 1 | package llmstxt
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 	"unicode"
 6 | )
 7 | 
 8 | // Format to Markdown content
 9 | func (g *Generator) formatContent(sections map[string][]PageInfo) string {
10 | 	var buf strings.Builder
11 | 
12 | 	// Get sorted section list
13 | 	sectionNames := g.getSortedSections(sections)
14 | 
15 | 	// Find root page info
16 | 	var rootPage PageInfo
17 | 	if rootPages, ok := sections["ROOT"]; ok && len(rootPages) > 0 {
18 | 		rootPage = rootPages[0]
19 | 	}
20 | 
21 | 	// Add document title
22 | 	buf.WriteString("# ")
23 | 	buf.WriteString(rootPage.Title)
24 | 	buf.WriteString("\n\n")
25 | 
26 | 	// Add document description
27 | 	buf.WriteString("> ")
28 | 	buf.WriteString(rootPage.Description)
29 | 	buf.WriteString("\n\n")
30 | 
31 | 	// Handle each section
32 | 	for _, section := range sectionNames {
33 | 		// Skip ROOT section, because it's already used for title and description
34 | 		if section == "ROOT" {
35 | 			continue
36 | 		}
37 | 
38 | 		// Add section title
39 | 		buf.WriteString("## ")
40 | 		buf.WriteString(capitalizeString(section))
41 | 		buf.WriteString("\n\n")
42 | 
43 | 		// Add page info for each page in section
44 | 		for _, page := range sections[section] {
45 | 			buf.WriteString("- [")
46 | 			buf.WriteString(page.Title)
47 | 			buf.WriteString("](")
48 | 			buf.WriteString(page.URL)
49 | 			buf.WriteString("): ")
50 | 			buf.WriteString(page.Description)
51 | 			buf.WriteString("\n")
52 | 
53 | 			// Add page content in full mode
54 | 			if g.config.FullMode && page.Content != "" {
55 | 				buf.WriteString("\n")
56 | 				buf.WriteString(page.Content)
57 | 				buf.WriteString("\n")
58 | 			}
59 | 
60 | 			buf.WriteString("\n")
61 | 		}
62 | 	}
63 | 
64 | 	return buf.String()
65 | }
66 | 
67 | // Capitalize first letter, lowercase the rest
68 | func capitalizeString(str string) string {
69 | 	if str == "" {
70 | 		return ""
71 | 	}
72 | 
73 | 	runes := []rune(str)
74 | 	return string(unicode.ToUpper(runes[0])) + strings.ToLower(string(runes[1:]))
75 | }
76 | 


--------------------------------------------------------------------------------
/.github/workflows/idoc.yml:
--------------------------------------------------------------------------------
 1 | # 📖 Simple document generation tool! Dependence Node.js run.
 2 | # https://github.com/jaywcjlove/idoc
 3 | 
 4 | name: idoc
 5 | on:
 6 |   push:
 7 |     branches:
 8 |       - main
 9 | 
10 | jobs:
11 |   build-deploy:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v4
15 |       - uses: actions/setup-node@v4
16 |         with:
17 |           node-version: 20
18 |           registry-url: "https://registry.npmjs.org"
19 | 
20 |       - name: Create idoc config.
21 |         run: |
22 |           cat > idoc.yml << EOF
23 |           site: mdctl
24 |           description: A command-line tool for processing Markdown files. Currently, it supports automatically downloading remote images to local storage and updating the image references in Markdown files, as well as translating markdown files using AI models.
25 |           keywords: Markdown processor,CLI tool,Image downloader,Markdown translator,AI translation,Markdown automation,Remote image handling,Markdown utilities,AI-powered Markdown,Markdown enhancement,Markdown file management
26 |           favicon: assets/favicon.ico
27 |           logo: assets/icon.png
28 | 
29 |           openSource: https://github.com/samzong/mdctl
30 | 
31 |           tocs: false
32 | 
33 |           element:
34 |             wrapper: style=max-width:720px;
35 | 
36 |           menus:
37 |             Home: index.html
38 |             About:
39 |               url: https://github.com/samzong
40 |               target: __blank
41 |           sideEffectFiles:
42 |             - README_zh.md
43 | 
44 |           cacheFileStat: true
45 | 
46 |           footer: |
47 |             Copyright © {{idocYear}} <a href="https://github.com/samzong" target="_blank">samzong</a><br />
48 |           EOF
49 | 
50 |       - run: npm install idoc@1 -g
51 |       - run: idoc
52 | 
53 |       - name: Deploy
54 |         uses: peaceiris/actions-gh-pages@v4
55 |         if: github.ref == 'refs/heads/main'
56 |         with:
57 |           github_token: ${{ secrets.GITHUB_TOKEN }}
58 |           publish_dir: ./dist
59 | 


--------------------------------------------------------------------------------
/internal/exporter/sitereader/reader.go:
--------------------------------------------------------------------------------
 1 | package sitereader
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io"
 6 | 	"log"
 7 | 	"os"
 8 | 	"path/filepath"
 9 | )
10 | 
11 | // SiteReader Define Site Reader Interface
12 | type SiteReader interface {
13 | 	// Detect if given directory is this type of site
14 | 	Detect(dir string) bool
15 | 
16 | 	// Read site structure, return sorted list of files
17 | 	// navPath parameter is used to specify the navigation path to export, empty to export all
18 | 	ReadStructure(dir string, configPath string, navPath string) ([]string, error)
19 | }
20 | 
21 | // GetSiteReader Return the appropriate reader based on site type
22 | func GetSiteReader(siteType string, verbose bool, logger *log.Logger) (SiteReader, error) {
23 | 	// If no logger is provided, create a default one
24 | 	if logger == nil {
25 | 		if verbose {
26 | 			logger = log.New(os.Stdout, "[SITE-READER] ", log.LstdFlags)
27 | 		} else {
28 | 			logger = log.New(io.Discard, "", 0)
29 | 		}
30 | 	}
31 | 
32 | 	logger.Printf("Creating site reader for type: %s", siteType)
33 | 
34 | 	switch siteType {
35 | 	case "mkdocs":
36 | 		logger.Println("Using MkDocs site reader")
37 | 		return &MkDocsReader{Logger: logger}, nil
38 | 	case "hugo":
39 | 		logger.Println("Hugo site type is not yet implemented")
40 | 		return nil, fmt.Errorf("hugo site type is not yet implemented")
41 | 	case "docusaurus":
42 | 		logger.Println("Docusaurus site type is not yet implemented")
43 | 		return nil, fmt.Errorf("docusaurus site type is not yet implemented")
44 | 	default:
45 | 		logger.Printf("Unsupported site type: %s", siteType)
46 | 		return nil, fmt.Errorf("unsupported site type: %s", siteType)
47 | 	}
48 | }
49 | 
50 | // FindConfigFile Find config file in given directory
51 | func FindConfigFile(dir string, configNames []string) (string, error) {
52 | 	// If no config file name is provided, use default values
53 | 	if len(configNames) == 0 {
54 | 		configNames = []string{"config.yml", "config.yaml"}
55 | 	}
56 | 
57 | 	// Find config file
58 | 	for _, name := range configNames {
59 | 		configPath := filepath.Join(dir, name)
60 | 		if _, err := os.Stat(configPath); err == nil {
61 | 			return configPath, nil
62 | 		}
63 | 	}
64 | 
65 | 	return "", fmt.Errorf("no config file found in %s", dir)
66 | }
67 | 


--------------------------------------------------------------------------------
/cmd/llmstxt.go:
--------------------------------------------------------------------------------
 1 | package cmd
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 
 7 | 	"github.com/samzong/mdctl/internal/llmstxt"
 8 | 	"github.com/spf13/cobra"
 9 | )
10 | 
11 | var (
12 | 	includePaths []string
13 | 	excludePaths []string
14 | 	outputPath   string
15 | 	fullMode     bool
16 | 	concurrency  int
17 | 	timeout      int
18 | 	maxPages     int
19 | 
20 | 	llmstxtCmd = &cobra.Command{
21 | 		Use:   "llmstxt [url]",
22 | 		Short: "Generate llms.txt from sitemap.xml",
23 | 		Long: `Generate a llms.txt file from a website's sitemap.xml. This file is a curated 
24 | list of the website's pages in markdown format, perfect for training or fine-tuning 
25 | language models.
26 | 
27 | In standard mode, only title and description are extracted. In full mode (-f flag), 
28 | the content of each page is also extracted.
29 | 
30 | Examples:
31 |   # Standard mode
32 |   mdctl llmstxt https://example.com/sitemap.xml > llms.txt
33 | 
34 |   # Full-content mode
35 |   mdctl llmstxt -f https://example.com/sitemap.xml > llms-full.txt`,
36 | 		Args: cobra.ExactArgs(1),
37 | 		RunE: func(cmd *cobra.Command, args []string) error {
38 | 			sitemapURL := args[0]
39 | 
40 | 			// Create a generator and configure options
41 | 			config := llmstxt.GeneratorConfig{
42 | 				SitemapURL:   sitemapURL,
43 | 				IncludePaths: includePaths,
44 | 				ExcludePaths: excludePaths,
45 | 				FullMode:     fullMode,
46 | 				Concurrency:  concurrency,
47 | 				Timeout:      timeout,
48 | 				UserAgent:    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
49 | 				Verbose:      verbose,
50 | 				VeryVerbose:  veryVerbose,
51 | 				MaxPages:     maxPages,
52 | 			}
53 | 
54 | 			generator := llmstxt.NewGenerator(config)
55 | 
56 | 			// Execute generation
57 | 			content, err := generator.Generate()
58 | 			if err != nil {
59 | 				return err
60 | 			}
61 | 
62 | 			// Output content
63 | 			if outputPath == "" {
64 | 				// Output to standard output
65 | 				fmt.Println(content)
66 | 			} else {
67 | 				// Output to file
68 | 				return os.WriteFile(outputPath, []byte(content), 0644)
69 | 			}
70 | 
71 | 			return nil
72 | 		},
73 | 	}
74 | )
75 | 
76 | func init() {
77 | 	llmstxtCmd.Flags().StringVarP(&outputPath, "output", "o", "", "Output file path (default: stdout)")
78 | 	llmstxtCmd.Flags().StringSliceVarP(&includePaths, "include-path", "i", []string{}, "Glob patterns for paths to include (can be specified multiple times)")
79 | 	llmstxtCmd.Flags().StringSliceVarP(&excludePaths, "exclude-path", "e", []string{}, "Glob patterns for paths to exclude (can be specified multiple times)")
80 | 	llmstxtCmd.Flags().BoolVarP(&fullMode, "full", "f", false, "Enable full-content mode (extract page content)")
81 | 	llmstxtCmd.Flags().IntVarP(&concurrency, "concurrency", "c", 5, "Number of concurrent requests")
82 | 	llmstxtCmd.Flags().IntVar(&timeout, "timeout", 30, "Request timeout in seconds")
83 | 	llmstxtCmd.Flags().IntVar(&maxPages, "max-pages", 0, "Maximum number of pages to process (0 for unlimited)")
84 | 
85 | 	// Add command to core group
86 | 	llmstxtCmd.GroupID = "core"
87 | 
88 | 	rootCmd.AddCommand(llmstxtCmd)
89 | }
90 | 


--------------------------------------------------------------------------------
/internal/llmstxt/fetcher.go:
--------------------------------------------------------------------------------
  1 | package llmstxt
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"net/http"
  6 | 	"sync"
  7 | 	"time"
  8 | )
  9 | 
 10 | // Fetch pages concurrently using a worker pool
 11 | func (g *Generator) fetchPages(urls []string) ([]PageInfo, error) {
 12 | 	g.logger.Printf("Starting to fetch %d pages with concurrency %d", len(urls), g.config.Concurrency)
 13 | 
 14 | 	// Create result and error channels
 15 | 	resultChan := make(chan PageInfo, len(urls))
 16 | 	errorChan := make(chan error, len(urls))
 17 | 
 18 | 	// Create work channel, controlling concurrency
 19 | 	workChan := make(chan string, len(urls))
 20 | 
 21 | 	// Start worker pool
 22 | 	var wg sync.WaitGroup
 23 | 	for i := 0; i < g.config.Concurrency; i++ {
 24 | 		wg.Add(1)
 25 | 		go func() {
 26 | 			defer wg.Done()
 27 | 			for urlStr := range workChan {
 28 | 				pageInfo, err := g.fetchPageContent(urlStr)
 29 | 				if err != nil {
 30 | 					g.logger.Printf("Warning: failed to fetch page %s: %v", urlStr, err)
 31 | 					errorChan <- fmt.Errorf("failed to fetch page %s: %w", urlStr, err)
 32 | 					continue
 33 | 				}
 34 | 				resultChan <- pageInfo
 35 | 			}
 36 | 		}()
 37 | 	}
 38 | 
 39 | 	// Send all URLs to work channel
 40 | 	for _, urlStr := range urls {
 41 | 		workChan <- urlStr
 42 | 	}
 43 | 	close(workChan)
 44 | 
 45 | 	// Wait for all work to finish
 46 | 	wg.Wait()
 47 | 	close(resultChan)
 48 | 	close(errorChan)
 49 | 
 50 | 	// Collect results
 51 | 	var results []PageInfo
 52 | 	for result := range resultChan {
 53 | 		results = append(results, result)
 54 | 		g.logger.Printf("Fetched page: %s", result.URL)
 55 | 	}
 56 | 
 57 | 	// Check for errors (don't interrupt processing, just log warnings)
 58 | 	for err := range errorChan {
 59 | 		g.logger.Printf("Warning: %v", err)
 60 | 	}
 61 | 
 62 | 	g.logger.Printf("Successfully fetched %d/%d pages", len(results), len(urls))
 63 | 
 64 | 	return results, nil
 65 | }
 66 | 
 67 | // Get the content of a single page
 68 | func (g *Generator) fetchPageContent(urlStr string) (PageInfo, error) {
 69 | 	// Set HTTP client
 70 | 	client := &http.Client{
 71 | 		Timeout: time.Duration(g.config.Timeout) * time.Second,
 72 | 	}
 73 | 
 74 | 	// Build request
 75 | 	req, err := http.NewRequest("GET", urlStr, nil)
 76 | 	if err != nil {
 77 | 		return PageInfo{}, fmt.Errorf("failed to create request: %w", err)
 78 | 	}
 79 | 
 80 | 	// Set User-Agent
 81 | 	req.Header.Set("User-Agent", g.config.UserAgent)
 82 | 
 83 | 	// Send request
 84 | 	start := time.Now()
 85 | 	resp, err := client.Do(req)
 86 | 	if err != nil {
 87 | 		return PageInfo{}, fmt.Errorf("failed to fetch page: %w", err)
 88 | 	}
 89 | 	defer resp.Body.Close()
 90 | 
 91 | 	if resp.StatusCode != http.StatusOK {
 92 | 		return PageInfo{}, fmt.Errorf("failed to fetch page, status code: %d", resp.StatusCode)
 93 | 	}
 94 | 
 95 | 	// Extract page information
 96 | 	pageInfo, err := g.extractPageInfo(urlStr, resp)
 97 | 	if err != nil {
 98 | 		return PageInfo{}, fmt.Errorf("failed to extract page info: %w", err)
 99 | 	}
100 | 
101 | 	// Record timing information
102 | 	elapsed := time.Since(start).Round(time.Millisecond)
103 | 	g.logger.Printf("Fetched %s in %v", urlStr, elapsed)
104 | 
105 | 	return pageInfo, nil
106 | }
107 | 


--------------------------------------------------------------------------------
/docs/DEVELOPMENT.md:
--------------------------------------------------------------------------------
  1 | # mdctl 开发者指南
  2 | 
  3 | ## 项目介绍
  4 | 
  5 | mdctl 是一个用于处理 Markdown 文件的命令行工具，主要功能包括：
  6 | 
  7 | 1. **下载功能**：自动下载 Markdown 文件中的远程图片到本地，并更新引用路径
  8 | 2. **翻译功能**：使用 AI 模型将 Markdown 文件翻译成多种语言
  9 | 3. **上传功能**：将本地图片上传到云存储，并更新 Markdown 文件中的引用
 10 | 4. **配置管理**：管理工具的配置信息
 11 | 5. **其他功能**：如导出为其他格式、生成 llms.txt 文件等
 12 | 
 13 | ## 项目结构
 14 | 
 15 | ```bash
 16 | ../mdctl
 17 | ├── cmd
 18 | │   ├── config.go
 19 | │   ├── download.go
 20 | │   ├── export.go
 21 | │   ├── llmstxt.go
 22 | │   ├── root.go
 23 | │   ├── translate.go
 24 | │   └── upload.go
 25 | ├── internal
 26 | │   ├── cache
 27 | │   ├── config
 28 | │   ├── exporter
 29 | │   ├── llmstxt
 30 | │   ├── markdownfmt
 31 | │   ├── processor
 32 | │   ├── storage
 33 | │   ├── translator
 34 | │   └── uploader
 35 | ├── main.go
 36 | ├── go.mod
 37 | ├── go.sum
 38 | ```
 39 | 
 40 | ## 核心模块说明
 41 | 
 42 | ### 命令行模块 (cmd/)
 43 | 
 44 | 使用 [Cobra](https://github.com/spf13/cobra) 库实现命令行界面，主要命令包括：
 45 | 
 46 | - **root**: 根命令，定义基本信息和版本
 47 | - **download**: 下载远程图片到本地
 48 | - **translate**: 翻译 Markdown 文件
 49 | - **upload**: 上传本地图片到云存储
 50 | - **config**: 管理配置信息
 51 | 
 52 | ### 处理器模块 (internal/processor/)
 53 | 
 54 | 负责处理 Markdown 文件中的远程图片下载，主要功能：
 55 | 
 56 | - 解析 Markdown 文件中的图片链接
 57 | - 下载远程图片到本地
 58 | - 更新 Markdown 文件中的图片引用路径
 59 | 
 60 | ### 翻译模块 (internal/translator/)
 61 | 
 62 | 负责翻译 Markdown 文件，主要功能：
 63 | 
 64 | - 支持多种语言翻译
 65 | - 保持 Markdown 格式和 front matter 不变
 66 | - 使用 AI 模型进行翻译
 67 | - 支持目录结构的翻译
 68 | 
 69 | ### 上传模块 (internal/uploader/)
 70 | 
 71 | 负责上传本地图片到云存储，主要功能：
 72 | 
 73 | - 解析 Markdown 文件中的本地图片链接
 74 | - 上传图片到云存储
 75 | - 更新 Markdown 文件中的图片引用路径
 76 | - 支持多种冲突处理策略
 77 | 
 78 | ### 存储模块 (internal/storage/)
 79 | 
 80 | 定义存储提供者接口和实现，主要功能：
 81 | 
 82 | - 提供统一的存储接口
 83 | - 支持 S3 兼容的存储服务
 84 | - 处理文件上传和元数据管理
 85 | 
 86 | ### llms.txt 生成模块 (internal/llmstxt/)
 87 | 
 88 | 负责从网站的 sitemap.xml 生成 llms.txt 文件，主要功能：
 89 | 
 90 | - 解析 sitemap.xml 文件
 91 | - 访问每个 URL 并提取页面内容
 92 | - 生成格式化的 llms.txt 文档
 93 | 
 94 | ### 配置模块 (internal/config/)
 95 | 
 96 | 负责管理配置信息，主要功能：
 97 | 
 98 | - 加载和保存配置文件
 99 | - 管理 AI 模型配置
100 | - 管理云存储配置
101 | 
102 | ## 开发风格和约定
103 | 
104 | ### 代码组织
105 | 
106 | 1. **命令与实现分离**：命令行接口在 `cmd/` 目录，具体实现在 `internal/` 目录
107 | 2. **模块化设计**：每个功能都有独立的模块，如处理器、翻译器、上传器等
108 | 3. **接口定义**：使用接口定义模块间交互，如存储提供者接口
109 | 
110 | ### 错误处理
111 | 
112 | 错误处理采用 Go 语言的标准方式，通过返回错误值进行传递和处理。
113 | 
114 | ### 配置管理
115 | 
116 | 配置文件存储在 `~/.config/mdctl/config.json`，包含：
117 | 
118 | - AI 模型配置（端点、API 密钥、模型名称等）
119 | - 云存储配置（提供者、区域、访问密钥等）
120 | 
121 | ### 日志输出
122 | 
123 | 使用标准输出进行日志记录，提供详细的处理信息和错误信息。
124 | 
125 | ## 添加新功能的步骤
126 | 
127 | 1. **定义命令**：在 `cmd/` 目录下创建新的命令文件，定义命令行接口
128 | 2. **实现功能**：在 `internal/` 目录下创建相应的实现模块
129 | 3. **注册命令**：在 `cmd/root.go` 的 `init()` 函数中注册新命令
130 | 4. **更新文档**：更新 README 文件，添加新功能的说明
131 | 
132 | ## 构建和发布
133 | 
134 | 项目使用 Makefile 和 GoReleaser 进行构建和发布：
135 | 
136 | - **构建**：使用 `make build` 命令构建项目
137 | - **发布**：使用 `make release` 命令发布新版本
138 | 
139 | ## 扩展点
140 | 
141 | ### 添加新的存储提供者
142 | 
143 | 1. 在 `internal/storage/` 目录下创建新的提供者实现
144 | 2. 实现 `Provider` 接口
145 | 3. 在初始化时注册提供者
146 | 
147 | ### 添加新的 AI 模型支持
148 | 
149 | 1. 在 `internal/translator/` 目录下扩展翻译器实现
150 | 2. 添加新模型的 API 调用
151 | 3. 更新配置模块以支持新模型的配置
152 | 
153 | ### 添加新的 Markdown 处理功能
154 | 
155 | 1. 创建新的处理器模块
156 | 2. 实现 Markdown 解析和处理逻辑
157 | 3. 添加新的命令行接口
158 | 


--------------------------------------------------------------------------------
/internal/exporter/heading.go:
--------------------------------------------------------------------------------
  1 | package exporter
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"regexp"
  7 | 	"strings"
  8 | )
  9 | 
 10 | var (
 11 | 	// Match ATX-style headings (those starting with #)
 12 | 	atxHeadingRegex = regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
 13 | 	// Match Setext-style headings (underline style)
 14 | 	setextHeading1Regex = regexp.MustCompile(`^=+\s*$`)
 15 | 	setextHeading2Regex = regexp.MustCompile(`^-+\s*$`)
 16 | )
 17 | 
 18 | // ShiftHeadings Adjust heading levels in Markdown text
 19 | func ShiftHeadings(content string, shiftBy int) string {
 20 | 	if shiftBy == 0 {
 21 | 		return content
 22 | 	}
 23 | 
 24 | 	scanner := bufio.NewScanner(strings.NewReader(content))
 25 | 	var result []string
 26 | 	var prevLine string
 27 | 	var isPrevLineHeading bool
 28 | 
 29 | 	for scanner.Scan() {
 30 | 		line := scanner.Text()
 31 | 
 32 | 		// Handle ATX-style headings
 33 | 		if matches := atxHeadingRegex.FindStringSubmatch(line); matches != nil {
 34 | 			level := len(matches[1]) + shiftBy
 35 | 			heading := matches[2]
 36 | 
 37 | 			if level <= 6 {
 38 | 				// Still valid heading level
 39 | 				result = append(result, fmt.Sprintf("%s %s", strings.Repeat("#", level), heading))
 40 | 			} else {
 41 | 				// Exceeded max heading level, convert to bold text
 42 | 				result = append(result, fmt.Sprintf("**%s**", heading))
 43 | 			}
 44 | 			isPrevLineHeading = false
 45 | 		} else if setextHeading1Regex.MatchString(line) && prevLine != "" {
 46 | 			// Handle Setext-style level 1 headings
 47 | 			level := 1 + shiftBy
 48 | 			if level <= 6 {
 49 | 				result[len(result)-1] = fmt.Sprintf("%s %s", strings.Repeat("#", level), prevLine)
 50 | 			} else {
 51 | 				result[len(result)-1] = fmt.Sprintf("**%s**", prevLine)
 52 | 			}
 53 | 			isPrevLineHeading = true
 54 | 		} else if setextHeading2Regex.MatchString(line) && prevLine != "" {
 55 | 			// Handle Setext-style level 2 headings
 56 | 			level := 2 + shiftBy
 57 | 			if level <= 6 {
 58 | 				result[len(result)-1] = fmt.Sprintf("%s %s", strings.Repeat("#", level), prevLine)
 59 | 			} else {
 60 | 				result[len(result)-1] = fmt.Sprintf("**%s**", prevLine)
 61 | 			}
 62 | 			isPrevLineHeading = true
 63 | 		} else {
 64 | 			// Ordinary line
 65 | 			result = append(result, line)
 66 | 			isPrevLineHeading = false
 67 | 		}
 68 | 
 69 | 		if !isPrevLineHeading {
 70 | 			prevLine = line
 71 | 		}
 72 | 	}
 73 | 
 74 | 	return strings.Join(result, "\n")
 75 | }
 76 | 
 77 | // AddTitleFromFilename Add heading from filename
 78 | func AddTitleFromFilename(content, filename string, level int) string {
 79 | 	// Extract heading from filename (remove extension)
 80 | 	title := strings.TrimSuffix(filename, ".md")
 81 | 	title = strings.TrimSuffix(title, ".markdown")
 82 | 
 83 | 	// Replace underscores and hyphens with spaces, making the heading more readable
 84 | 	title = strings.ReplaceAll(title, "_", " ")
 85 | 	title = strings.ReplaceAll(title, "-", " ")
 86 | 
 87 | 	// Capitalize the first letter of each word
 88 | 	title = strings.Title(title)
 89 | 
 90 | 	// Create heading line
 91 | 	var titleLine string
 92 | 	if level <= 6 {
 93 | 		titleLine = fmt.Sprintf("%s %s\n\n", strings.Repeat("#", level), title)
 94 | 	} else {
 95 | 		titleLine = fmt.Sprintf("**%s**\n\n", title)
 96 | 	}
 97 | 
 98 | 	return titleLine + content
 99 | }
100 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
  1 | name: Docker Image Build
  2 | 
  3 | on:
  4 |   release:
  5 |     types: [published, created]
  6 |   push:
  7 |     tags:
  8 |       - 'v*'
  9 |     paths-ignore:
 10 |       - '**.md'
 11 |       - '.github/**'
 12 |       - '!.github/workflows/docker-build.yml'
 13 |   workflow_dispatch:
 14 | 
 15 | jobs:
 16 |   docker:
 17 |     name: Build and Push Multi-arch Image
 18 |     runs-on: ubuntu-latest
 19 |     permissions:
 20 |       contents: write
 21 |       packages: write
 22 |     steps:
 23 |       - name: Checkout Code
 24 |         uses: actions/checkout@v4
 25 |         with:
 26 |           fetch-depth: 0
 27 | 
 28 |       - name: Set up QEMU
 29 |         uses: docker/setup-qemu-action@v3
 30 | 
 31 |       - name: Setup Docker Buildx
 32 |         uses: docker/setup-buildx-action@v3
 33 | 
 34 |       - name: Login to GitHub Container Registry
 35 |         uses: docker/login-action@v3
 36 |         with:
 37 |           registry: ghcr.io
 38 |           username: ${{ github.actor }}
 39 |           password: ${{ secrets.GITHUB_TOKEN }}
 40 | 
 41 |       - name: Extract Metadata
 42 |         id: meta
 43 |         uses: docker/metadata-action@v5
 44 |         with:
 45 |           images: ghcr.io/${{ github.repository_owner }}/mdctl
 46 |           tags: |
 47 |             type=ref,event=branch
 48 |             type=ref,event=pr
 49 |             type=semver,pattern={{version}}
 50 |             type=semver,pattern={{major}}.{{minor}}
 51 |             type=semver,pattern={{major}}
 52 |             type=sha,format=short
 53 |             type=raw,value=latest,enable=${{ github.ref_type == 'tag' }}
 54 | 
 55 |       - name: Display tags
 56 |         run: |
 57 |           echo "Generated tags: ${{ steps.meta.outputs.tags }}"
 58 |           echo "Ref type: ${{ github.ref_type }}"
 59 |           echo "Ref: ${{ github.ref }}"
 60 | 
 61 |       # Set explicit latest tag for tag events
 62 |       - name: Set explicit latest tag
 63 |         if: startsWith(github.ref, 'refs/tags/')
 64 |         run: echo "EXTRA_TAGS=ghcr.io/${{ github.repository_owner }}/mdctl:latest" >> $GITHUB_ENV
 65 | 
 66 |       # Get version information
 67 |       - name: Get version info
 68 |         id: version_info
 69 |         run: |
 70 |           # Get version from tag or git describe
 71 |           if [[ "$GITHUB_REF_TYPE" == "tag" ]]; then
 72 |             VERSION="${GITHUB_REF_NAME}"
 73 |           else
 74 |             VERSION="$(git describe --tags --always || echo 'dev')"
 75 |           fi
 76 | 
 77 |           # Get build time
 78 |           BUILD_TIME="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
 79 | 
 80 |           # Set outputs
 81 |           echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT
 82 |           echo "BUILD_TIME=${BUILD_TIME}" >> $GITHUB_OUTPUT
 83 | 
 84 |           # Display for debugging
 85 |           echo "Version: ${VERSION}"
 86 |           echo "Build time: ${BUILD_TIME}"
 87 | 
 88 |       - name: Build and Push Multi-arch Image
 89 |         uses: docker/build-push-action@v5
 90 |         with:
 91 |           context: .
 92 |           platforms: linux/amd64,linux/arm64
 93 |           push: true
 94 |           tags: ${{ steps.meta.outputs.tags }}${{ env.EXTRA_TAGS != '' && format(',{0}', env.EXTRA_TAGS) || '' }}
 95 |           labels: ${{ steps.meta.outputs.labels }}
 96 |           build-args: |
 97 |             VERSION=${{ steps.version_info.outputs.VERSION }}
 98 |             BUILD_TIME=${{ steps.version_info.outputs.BUILD_TIME }}
 99 |           cache-from: type=gha
100 |           cache-to: type=gha,mode=max
101 | 


--------------------------------------------------------------------------------
/internal/linter/config.go:
--------------------------------------------------------------------------------
  1 | package linter
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"os"
  6 | 	"path/filepath"
  7 | )
  8 | 
  9 | // ConfigFile represents a markdownlint configuration file
 10 | type ConfigFile struct {
 11 | 	// Default configuration
 12 | 	Default bool `json:"default,omitempty"`
 13 | 
 14 | 	// Extends other configuration files
 15 | 	Extends string `json:"extends,omitempty"`
 16 | 
 17 | 	// Rule-specific configuration
 18 | 	MD001 *RuleConfig `json:"MD001,omitempty"`
 19 | 	MD003 *RuleConfig `json:"MD003,omitempty"`
 20 | 	MD009 *RuleConfig `json:"MD009,omitempty"`
 21 | 	MD010 *RuleConfig `json:"MD010,omitempty"`
 22 | 	MD012 *RuleConfig `json:"MD012,omitempty"`
 23 | 	MD013 *RuleConfig `json:"MD013,omitempty"`
 24 | 	MD018 *RuleConfig `json:"MD018,omitempty"`
 25 | 	MD019 *RuleConfig `json:"MD019,omitempty"`
 26 | 	MD023 *RuleConfig `json:"MD023,omitempty"`
 27 | 	MD032 *RuleConfig `json:"MD032,omitempty"`
 28 | 	MD047 *RuleConfig `json:"MD047,omitempty"`
 29 | }
 30 | 
 31 | // RuleConfig represents configuration for a specific rule
 32 | type RuleConfig struct {
 33 | 	// Whether the rule is enabled
 34 | 	Enabled *bool `json:"enabled,omitempty"`
 35 | 
 36 | 	// Rule-specific options
 37 | 	Options map[string]interface{} `json:"options,omitempty"`
 38 | }
 39 | 
 40 | // LoadConfigFile loads configuration from a file
 41 | func LoadConfigFile(filename string) (*ConfigFile, error) {
 42 | 	// Try to find config file if not specified
 43 | 	if filename == "" {
 44 | 		filename = findConfigFile()
 45 | 	}
 46 | 
 47 | 	if filename == "" {
 48 | 		return &ConfigFile{Default: true}, nil
 49 | 	}
 50 | 
 51 | 	data, err := os.ReadFile(filename)
 52 | 	if err != nil {
 53 | 		return nil, err
 54 | 	}
 55 | 
 56 | 	var config ConfigFile
 57 | 	if err := json.Unmarshal(data, &config); err != nil {
 58 | 		return nil, err
 59 | 	}
 60 | 
 61 | 	return &config, nil
 62 | }
 63 | 
 64 | // ApplyToRuleSet applies the configuration to a rule set
 65 | func (c *ConfigFile) ApplyToRuleSet(rs *RuleSet) {
 66 | 	ruleConfigs := map[string]*RuleConfig{
 67 | 		"MD001": c.MD001,
 68 | 		"MD003": c.MD003,
 69 | 		"MD009": c.MD009,
 70 | 		"MD010": c.MD010,
 71 | 		"MD012": c.MD012,
 72 | 		"MD013": c.MD013,
 73 | 		"MD018": c.MD018,
 74 | 		"MD019": c.MD019,
 75 | 		"MD023": c.MD023,
 76 | 		"MD032": c.MD032,
 77 | 		"MD047": c.MD047,
 78 | 	}
 79 | 
 80 | 	for ruleID, ruleConfig := range ruleConfigs {
 81 | 		if ruleConfig != nil && ruleConfig.Enabled != nil {
 82 | 			if rule, exists := rs.rules[ruleID]; exists {
 83 | 				rule.SetEnabled(*ruleConfig.Enabled)
 84 | 			}
 85 | 		}
 86 | 	}
 87 | }
 88 | 
 89 | // findConfigFile looks for common markdownlint config files
 90 | func findConfigFile() string {
 91 | 	configFiles := []string{
 92 | 		".markdownlint.json",
 93 | 		".markdownlint.jsonc",
 94 | 		".markdownlintrc",
 95 | 		".markdownlintrc.json",
 96 | 		".markdownlintrc.jsonc",
 97 | 	}
 98 | 
 99 | 	for _, filename := range configFiles {
100 | 		if _, err := os.Stat(filename); err == nil {
101 | 			return filename
102 | 		}
103 | 	}
104 | 
105 | 	// Also check in home directory
106 | 	if home, err := os.UserHomeDir(); err == nil {
107 | 		for _, filename := range configFiles {
108 | 			fullPath := filepath.Join(home, filename)
109 | 			if _, err := os.Stat(fullPath); err == nil {
110 | 				return fullPath
111 | 			}
112 | 		}
113 | 	}
114 | 
115 | 	return ""
116 | }
117 | 
118 | // CreateDefaultConfig creates a default configuration file
119 | func CreateDefaultConfig(filename string) error {
120 | 	config := ConfigFile{
121 | 		Default: true,
122 | 		MD001:   &RuleConfig{Enabled: boolPtr(true)},
123 | 		MD003:   &RuleConfig{Enabled: boolPtr(true)},
124 | 		MD009:   &RuleConfig{Enabled: boolPtr(true)},
125 | 		MD010:   &RuleConfig{Enabled: boolPtr(true)},
126 | 		MD012:   &RuleConfig{Enabled: boolPtr(true)},
127 | 		MD013:   &RuleConfig{Enabled: boolPtr(true)},
128 | 		MD018:   &RuleConfig{Enabled: boolPtr(true)},
129 | 		MD019:   &RuleConfig{Enabled: boolPtr(true)},
130 | 		MD023:   &RuleConfig{Enabled: boolPtr(true)},
131 | 		MD032:   &RuleConfig{Enabled: boolPtr(true)},
132 | 		MD047:   &RuleConfig{Enabled: boolPtr(true)},
133 | 	}
134 | 
135 | 	data, err := json.MarshalIndent(config, "", "  ")
136 | 	if err != nil {
137 | 		return err
138 | 	}
139 | 
140 | 	return os.WriteFile(filename, data, 0644)
141 | }
142 | 
143 | // boolPtr returns a pointer to a bool value
144 | func boolPtr(b bool) *bool {
145 | 	return &b
146 | }
147 | 


--------------------------------------------------------------------------------
/internal/cache/cache.go:
--------------------------------------------------------------------------------
  1 | package cache
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 	"sync"
  9 | 	"time"
 10 | )
 11 | 
 12 | // CacheItem represents a single cached file information
 13 | type CacheItem struct {
 14 | 	LocalPath  string    `json:"local_path"`
 15 | 	RemotePath string    `json:"remote_path"`
 16 | 	URL        string    `json:"url"`
 17 | 	Hash       string    `json:"hash"`
 18 | 	UploadTime time.Time `json:"upload_time"`
 19 | }
 20 | 
 21 | // Cache manages information about uploaded files
 22 | type Cache struct {
 23 | 	Items    map[string]CacheItem `json:"items"`
 24 | 	Version  string               `json:"version"`
 25 | 	CacheDir string               `json:"cache_dir,omitempty"`
 26 | 	mutex    sync.RWMutex
 27 | }
 28 | 
 29 | // New creates a new cache instance
 30 | func New(cacheDir string) *Cache {
 31 | 	if cacheDir == "" {
 32 | 		homeDir, err := os.UserHomeDir()
 33 | 		if err == nil {
 34 | 			cacheDir = filepath.Join(homeDir, ".cache", "mdctl")
 35 | 		} else {
 36 | 			// Fallback to temp directory
 37 | 			cacheDir = filepath.Join(os.TempDir(), "mdctl-cache")
 38 | 		}
 39 | 	}
 40 | 
 41 | 	return &Cache{
 42 | 		Items:    make(map[string]CacheItem),
 43 | 		Version:  "1.0",
 44 | 		CacheDir: cacheDir,
 45 | 	}
 46 | }
 47 | 
 48 | // saveWithoutLock writes cache to disk without acquiring the lock
 49 | // This should only be called from methods that already hold a lock
 50 | func (c *Cache) saveWithoutLock() error {
 51 | 	// Ensure cache directory exists
 52 | 	if err := os.MkdirAll(c.CacheDir, 0755); err != nil {
 53 | 		return fmt.Errorf("failed to create cache directory: %v", err)
 54 | 	}
 55 | 
 56 | 	cacheFile := filepath.Join(c.CacheDir, "upload-cache.json")
 57 | 	data, err := json.MarshalIndent(c, "", "  ")
 58 | 	if err != nil {
 59 | 		return fmt.Errorf("failed to marshal cache: %v", err)
 60 | 	}
 61 | 
 62 | 	if err := os.WriteFile(cacheFile, data, 0644); err != nil {
 63 | 		return fmt.Errorf("failed to write cache file: %v", err)
 64 | 	}
 65 | 
 66 | 	return nil
 67 | }
 68 | 
 69 | // Load reads cache from disk
 70 | func (c *Cache) Load() error {
 71 | 	c.mutex.Lock()
 72 | 	defer c.mutex.Unlock()
 73 | 
 74 | 	// Ensure cache directory exists
 75 | 	if err := os.MkdirAll(c.CacheDir, 0755); err != nil {
 76 | 		return fmt.Errorf("failed to create cache directory: %v", err)
 77 | 	}
 78 | 
 79 | 	cacheFile := filepath.Join(c.CacheDir, "upload-cache.json")
 80 | 	if _, err := os.Stat(cacheFile); os.IsNotExist(err) {
 81 | 		// Cache file doesn't exist yet, create a new one
 82 | 		c.Items = make(map[string]CacheItem)
 83 | 		return c.saveWithoutLock()
 84 | 	}
 85 | 
 86 | 	data, err := os.ReadFile(cacheFile)
 87 | 	if err != nil {
 88 | 		return fmt.Errorf("failed to read cache file: %v", err)
 89 | 	}
 90 | 
 91 | 	if err := json.Unmarshal(data, c); err != nil {
 92 | 		// If cache is corrupt, start with a fresh one
 93 | 		c.Items = make(map[string]CacheItem)
 94 | 		return nil
 95 | 	}
 96 | 
 97 | 	return nil
 98 | }
 99 | 
100 | // Save persists the cache to disk
101 | func (c *Cache) Save() error {
102 | 	c.mutex.Lock()
103 | 	defer c.mutex.Unlock()
104 | 
105 | 	return c.saveWithoutLock() // Use the lockless version to avoid deadlock
106 | }
107 | 
108 | // AddItem adds or updates a cache item
109 | func (c *Cache) AddItem(localPath, remotePath, url, hash string) {
110 | 	c.mutex.Lock()
111 | 	defer c.mutex.Unlock()
112 | 
113 | 	c.Items[localPath] = CacheItem{
114 | 		LocalPath:  localPath,
115 | 		RemotePath: remotePath,
116 | 		URL:        url,
117 | 		Hash:       hash,
118 | 		UploadTime: time.Now(),
119 | 	}
120 | }
121 | 
122 | // GetItem retrieves a cache item by local path
123 | func (c *Cache) GetItem(localPath string) (CacheItem, bool) {
124 | 	c.mutex.RLock()
125 | 	defer c.mutex.RUnlock()
126 | 
127 | 	item, exists := c.Items[localPath]
128 | 	return item, exists
129 | }
130 | 
131 | // HasItemWithHash checks if an item with the same hash exists
132 | func (c *Cache) HasItemWithHash(hash string) (CacheItem, bool) {
133 | 	c.mutex.RLock()
134 | 	defer c.mutex.RUnlock()
135 | 
136 | 	for _, item := range c.Items {
137 | 		if item.Hash == hash {
138 | 			return item, true
139 | 		}
140 | 	}
141 | 	return CacheItem{}, false
142 | }
143 | 
144 | // RemoveItem removes an item from the cache
145 | func (c *Cache) RemoveItem(localPath string) {
146 | 	c.mutex.Lock()
147 | 	defer c.mutex.Unlock()
148 | 
149 | 	delete(c.Items, localPath)
150 | }
151 | 


--------------------------------------------------------------------------------
/internal/llmstxt/generator.go:
--------------------------------------------------------------------------------
  1 | package llmstxt
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"log"
  7 | 	"os"
  8 | 	"sort"
  9 | 	"time"
 10 | )
 11 | 
 12 | // GeneratorConfig contains the configuration required to generate llms.txt
 13 | type GeneratorConfig struct {
 14 | 	SitemapURL   string
 15 | 	IncludePaths []string
 16 | 	ExcludePaths []string
 17 | 	FullMode     bool
 18 | 	Concurrency  int
 19 | 	Timeout      int
 20 | 	UserAgent    string
 21 | 	Verbose      bool
 22 | 	VeryVerbose  bool // More detailed log output
 23 | 	MaxPages     int  // Maximum number of pages to process, 0 means no limit
 24 | }
 25 | 
 26 | // PageInfo stores page information
 27 | type PageInfo struct {
 28 | 	Title       string
 29 | 	URL         string
 30 | 	Description string
 31 | 	Content     string // Page content, only filled in full mode
 32 | 	Section     string // First segment of URL path as section
 33 | }
 34 | 
 35 | // Generator is the llms.txt generator
 36 | type Generator struct {
 37 | 	config GeneratorConfig
 38 | 	logger *log.Logger
 39 | }
 40 | 
 41 | // NewGenerator creates a new generator instance
 42 | func NewGenerator(config GeneratorConfig) *Generator {
 43 | 	var logger *log.Logger
 44 | 	if config.Verbose || config.VeryVerbose {
 45 | 		logger = log.New(os.Stdout, "[LLMSTXT] ", log.LstdFlags)
 46 | 	} else {
 47 | 		logger = log.New(io.Discard, "", 0)
 48 | 	}
 49 | 
 50 | 	return &Generator{
 51 | 		config: config,
 52 | 		logger: logger,
 53 | 	}
 54 | }
 55 | 
 56 | // Generate performs the generation process and returns the generated content
 57 | func (g *Generator) Generate() (string, error) {
 58 | 	startTime := time.Now()
 59 | 	g.logger.Printf("Starting generation for sitemap: %s", g.config.SitemapURL)
 60 | 	if g.config.FullMode {
 61 | 		g.logger.Println("Full-content mode enabled")
 62 | 	}
 63 | 
 64 | 	// 1. Parse sitemap.xml to get URL list
 65 | 	urls, err := g.parseSitemap()
 66 | 	if err != nil {
 67 | 		return "", fmt.Errorf("failed to parse sitemap: %w", err)
 68 | 	}
 69 | 	g.logger.Printf("Found %d URLs in sitemap", len(urls))
 70 | 
 71 | 	// 2. Filter URLs (based on include/exclude mode)
 72 | 	urls = g.filterURLs(urls)
 73 | 	g.logger.Printf("%d URLs after filtering", len(urls))
 74 | 
 75 | 	// 2.1. Apply max page limit
 76 | 	if g.config.MaxPages > 0 && len(urls) > g.config.MaxPages {
 77 | 		g.logger.Printf("Limiting to %d pages as requested (--max-pages)", g.config.MaxPages)
 78 | 		urls = urls[:g.config.MaxPages]
 79 | 	}
 80 | 
 81 | 	// 3. Create worker pool and get page info
 82 | 	pages, err := g.fetchPages(urls)
 83 | 	if err != nil {
 84 | 		return "", fmt.Errorf("failed to fetch pages: %w", err)
 85 | 	}
 86 | 
 87 | 	// 4. Group pages by section
 88 | 	sections := g.groupBySections(pages)
 89 | 
 90 | 	// 5. Format to Markdown content
 91 | 	content := g.formatContent(sections)
 92 | 
 93 | 	elapsedTime := time.Since(startTime).Round(time.Millisecond)
 94 | 	g.logger.Printf("Generation completed successfully in %v", elapsedTime)
 95 | 	return content, nil
 96 | }
 97 | 
 98 | // Group pages by section
 99 | func (g *Generator) groupBySections(pages []PageInfo) map[string][]PageInfo {
100 | 	sections := make(map[string][]PageInfo)
101 | 
102 | 	for _, page := range pages {
103 | 		sections[page.Section] = append(sections[page.Section], page)
104 | 	}
105 | 
106 | 	// Sort pages within each section by URL path length
107 | 	for section, sectionPages := range sections {
108 | 		sort.Slice(sectionPages, func(i, j int) bool {
109 | 			return len(sectionPages[i].URL) < len(sectionPages[j].URL)
110 | 		})
111 | 		sections[section] = sectionPages
112 | 	}
113 | 
114 | 	return sections
115 | }
116 | 
117 | // Get sorted section name list, ensuring ROOT section is always first
118 | func (g *Generator) getSortedSections(sections map[string][]PageInfo) []string {
119 | 	sectionNames := make([]string, 0, len(sections))
120 | 
121 | 	// Add ROOT section first (if exists)
122 | 	if _, hasRoot := sections["ROOT"]; hasRoot {
123 | 		sectionNames = append(sectionNames, "ROOT")
124 | 	}
125 | 
126 | 	// Add other sections and sort alphabetically
127 | 	for section := range sections {
128 | 		if section != "ROOT" {
129 | 			sectionNames = append(sectionNames, section)
130 | 		}
131 | 	}
132 | 
133 | 	// Only sort if there are non-ROOT sections
134 | 	if len(sectionNames) > 1 {
135 | 		// Only sort non-ROOT sections
136 | 		nonRootSections := sectionNames[1:]
137 | 		sort.Strings(nonRootSections)
138 | 	}
139 | 
140 | 	return sectionNames
141 | }
142 | 


--------------------------------------------------------------------------------
/cmd/translate.go:
--------------------------------------------------------------------------------
  1 | package cmd
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 	"path/filepath"
  7 | 	"strings"
  8 | 
  9 | 	"github.com/samzong/mdctl/internal/config"
 10 | 	"github.com/samzong/mdctl/internal/translator"
 11 | 	"github.com/spf13/cobra"
 12 | )
 13 | 
 14 | var (
 15 | 	fromPath string
 16 | 	toPath   string
 17 | 	locale   string
 18 | 	force    bool
 19 | 	format   bool
 20 | )
 21 | 
 22 | // Generate target file path
 23 | func generateTargetPath(sourcePath, lang string) string {
 24 | 	dir := filepath.Dir(sourcePath)
 25 | 	base := filepath.Base(sourcePath)
 26 | 	ext := filepath.Ext(base)
 27 | 	nameWithoutExt := strings.TrimSuffix(base, ext)
 28 | 	return filepath.Join(dir, nameWithoutExt+"_"+lang+ext)
 29 | }
 30 | 
 31 | var translateCmd = &cobra.Command{
 32 | 	Use:   "translate",
 33 | 	Short: "Translate markdown files using AI models",
 34 | 	Long: `Translate markdown files or directories to specified language using AI models.
 35 | 
 36 | Supported AI Models:
 37 |   - OpenAI (Current)
 38 |   - DeepSeek R1 (Current)
 39 |   - Llama (Current)
 40 | 	
 41 | Supported Languages:
 42 |   ar (العربية), de (Deutsch), en (English), es (Español), fr (Français),
 43 |   hi (हिन्दी), it (Italiano), ja (日本語), ko (한국어), pt (Português),
 44 |   ru (Русский), th (ไทย), vi (Tiếng Việt), zh (中文)
 45 | 
 46 | Examples:
 47 |   # Translate a single file to Chinese
 48 |   mdctl translate -f README.md -l zh
 49 | 
 50 |   # Translate a directory to Japanese
 51 |   mdctl translate -f docs -l ja
 52 | 
 53 |   # Force translate an already translated file
 54 |   mdctl translate -f README.md -l ko -F
 55 | 
 56 |   # Format markdown content after translation
 57 |   mdctl translate -f README.md -l zh -m
 58 | 
 59 |   # Translate to a specific output path
 60 |   mdctl translate -f docs -l fr -t translated_docs`,
 61 | 	RunE: func(cmd *cobra.Command, args []string) error {
 62 | 		cfg, err := config.LoadConfig()
 63 | 		if err != nil {
 64 | 			return fmt.Errorf("failed to load config: %v", err)
 65 | 		}
 66 | 
 67 | 		// Validate language option
 68 | 		if !translator.IsLanguageSupported(locale) {
 69 | 			return fmt.Errorf("unsupported locale: %s\nSupported languages: %s",
 70 | 				locale,
 71 | 				translator.GetSupportedLanguages())
 72 | 		}
 73 | 
 74 | 		// Check if source path exists
 75 | 		if _, err := os.Stat(fromPath); os.IsNotExist(err) {
 76 | 			return fmt.Errorf("source path does not exist: %s", fromPath)
 77 | 		}
 78 | 
 79 | 		// Get absolute path of source path
 80 | 		srcAbs, err := filepath.Abs(fromPath)
 81 | 		if err != nil {
 82 | 			return fmt.Errorf("failed to get absolute path: %v", err)
 83 | 		}
 84 | 
 85 | 		// Check if it's a file or directory
 86 | 		fi, err := os.Stat(srcAbs)
 87 | 		if err != nil {
 88 | 			return fmt.Errorf("failed to get file info: %v", err)
 89 | 		}
 90 | 
 91 | 		if fi.IsDir() {
 92 | 			// If it's a directory and no target path specified, use the same directory structure
 93 | 			if toPath == "" {
 94 | 				return translator.ProcessDirectory(srcAbs, srcAbs, locale, cfg, force, format)
 95 | 			}
 96 | 			// If target path is specified, use the specified path
 97 | 			dstAbs, err := filepath.Abs(toPath)
 98 | 			if err != nil {
 99 | 				return fmt.Errorf("failed to get absolute path: %v", err)
100 | 			}
101 | 			return translator.ProcessDirectory(srcAbs, dstAbs, locale, cfg, force, format)
102 | 		}
103 | 
104 | 		// Process single file
105 | 		var dstAbs string
106 | 		if toPath == "" {
107 | 			// If no target path specified, generate name_lang.md in the same directory as source
108 | 			dstAbs = generateTargetPath(srcAbs, locale)
109 | 		} else {
110 | 			// If target path specified, use the specified path
111 | 			dstAbs, err = filepath.Abs(toPath)
112 | 			if err != nil {
113 | 				return fmt.Errorf("failed to get absolute path: %v", err)
114 | 			}
115 | 		}
116 | 
117 | 		return translator.ProcessFile(srcAbs, dstAbs, locale, cfg, format, force)
118 | 	},
119 | }
120 | 
121 | func init() {
122 | 	translateCmd.Flags().StringVarP(&fromPath, "from", "f", "", "Source file or directory path")
123 | 	translateCmd.Flags().StringVarP(&toPath, "to", "t", "", "Target file or directory path (optional, default: generate in same directory as source)")
124 | 	translateCmd.Flags().StringVarP(&locale, "locales", "l", "", "Target language code (e.g., zh, en, ja, ko, fr, de, es, etc.)")
125 | 	translateCmd.Flags().BoolVarP(&force, "force", "F", false, "Force translate even if already translated")
126 | 	translateCmd.Flags().BoolVarP(&format, "format", "m", false, "Format markdown content after translation")
127 | 
128 | 	translateCmd.MarkFlagRequired("from")
129 | 	translateCmd.MarkFlagRequired("locales")
130 | }
131 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # mdctl - A CLI Tool for Markdown File Operations
  2 | 
  3 | <div align="center">
  4 |   <img src="./mdctl.png" alt="mdctl logo" width="200" />
  5 |   <br />
  6 |   <p>An AI-powered CLI tool to enhance your Markdown workflow, with auto-image downloading, translation, and more features coming soon!</p>
  7 |   <p>
  8 |     <a href="https://github.com/samzong/mdctl/actions/workflows/docker-build.yml"><img src="https://github.com/samzong/mdctl/actions/workflows/docker-build.yml/badge.svg" alt="Build Status"></a>
  9 |     <a href="https://github.com/samzong/mdctl/releases"><img src="https://img.shields.io/github/v/release/samzong/mdctl" alt="Release Version" /></a>
 10 |     <a href="https://goreportcard.com/report/github.com/samzong/mdctl"><img src="https://goreportcard.com/badge/github.com/samzong/mdctl" alt="go report" /></a>
 11 |     <a href="https://github.com/samzong/mdctl/blob/main/LICENSE"><img src="https://img.shields.io/github/license/samzong/mdctl" alt="MIT License" /></a>
 12 |     <a href="https://deepwiki.com/samzong/mdctl"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"></a>
 13 |   </p>
 14 | </div>
 15 | 
 16 | ## Key Features
 17 | 
 18 | - Automatically downloads remote images to a specified local directory.
 19 | - Translates markdown files using AI models with support for multiple languages.
 20 | - Uploads local images in markdown files to cloud storage services and updates references.
 21 | - Exports markdown files to various document formats (DOCX, PDF, EPUB) with customization options.
 22 | - Generates llms.txt files from website sitemaps for training language models.
 23 | 
 24 | ## Installation
 25 | 
 26 | Use Homebrew to install mdctl. Follow the [Homebrew Installation Guide](https://brew.sh/) to install Homebrew.
 27 | 
 28 | ```bash
 29 | brew tap samzong/tap
 30 | brew install samzong/tap/mdctl
 31 | ```
 32 | 
 33 | Or use go to install mdctl.
 34 | 
 35 | ```bash
 36 | go install github.com/samzong/mdctl@latest
 37 | ```
 38 | 
 39 | ## Usage
 40 | 
 41 | Quick examples for common tasks:
 42 | 
 43 | ### Downloading Images
 44 | 
 45 | ```bash
 46 | # Process a single file
 47 | mdctl download -f path/to/your/file.md
 48 | 
 49 | # Process a directory
 50 | mdctl download -d path/to/your/directory
 51 | ```
 52 | 
 53 | ### Translating I18n
 54 | 
 55 | ```bash
 56 | # Translate to Chinese
 57 | mdctl translate -f README.md -l zh
 58 | 
 59 | # Translate a directory to Japanese
 60 | mdctl translate -d docs/ -l ja
 61 | ```
 62 | 
 63 | ### Uploading Images to Cloud Storage
 64 | 
 65 | ```bash
 66 | # Upload images from a file
 67 | mdctl upload -f post.md
 68 | 
 69 | # Upload images from a directory
 70 | mdctl upload -d docs/
 71 | ```
 72 | 
 73 | ### Exporting Documents to `.docx`
 74 | 
 75 | ```bash
 76 | # Export to DOCX
 77 | mdctl export -f README.md -o output.docx
 78 | 
 79 | # Export to PDF with table of contents
 80 | mdctl export -d docs/ -o documentation.pdf -F pdf --toc
 81 | ```
 82 | 
 83 | ### Generating `llms.txt` from `sitemap.xml`
 84 | 
 85 | ```bash
 86 | # Standard mode (titles and descriptions)
 87 | mdctl llmstxt https://example.com/sitemap.xml > llms.txt
 88 | 
 89 | # Full-content mode
 90 | mdctl llmstxt -f https://example.com/sitemap.xml > llms-full.txt
 91 | ```
 92 | 
 93 | ### GitHub Action
 94 | 
 95 | Use mdctl in your CI with the Docker-based Action in this repo. Example workflow step:
 96 | 
 97 | ```yaml
 98 | jobs:
 99 |   docs:
100 |     runs-on: ubuntu-latest
101 |     steps:
102 |       - uses: actions/checkout@v4
103 |       - name: Export docs to DOCX
104 |         uses: samzong/mdctl@v1
105 |         with:
106 |           args: "export -f README.md -o output.docx"
107 | ```
108 | 
109 | Notes:
110 | - Set `with.args` to any mdctl command and flags (e.g., `download`, `translate`, `upload`, `export`, `llmstxt`).
111 | - Provide necessary credentials via `env` when using cloud features (e.g., S3 for `upload`).
112 | - You can set `working-directory` on the step if needed.
113 | 
114 | ## Developer's Guide
115 | 
116 | If you are interested in contributing, please refer to the [DEVELOPMENT.md](docs/DEVELOPMENT.md) file for a complete technical architecture, component design, and development guide.
117 | 
118 | ## Contributing
119 | 
120 | Welcome to contribute code, report issues, or suggest features! Please follow these steps:
121 | 
122 | 1. Fork this repository
123 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
124 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
125 | 4. Push to the branch (`git push origin feature/amazing-feature`)
126 | 5. Open a Pull Request
127 | 
128 | ## License
129 | 
130 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
131 | 


--------------------------------------------------------------------------------
/cmd/export.go:
--------------------------------------------------------------------------------
  1 | package cmd
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"log"
  7 | 	"os"
  8 | 
  9 | 	"github.com/samzong/mdctl/internal/exporter"
 10 | 	"github.com/spf13/cobra"
 11 | )
 12 | 
 13 | var (
 14 | 	exportFile          string
 15 | 	exportDir           string
 16 | 	siteType            string
 17 | 	exportOutput        string
 18 | 	exportTemplate      string
 19 | 	exportFormat        string
 20 | 	generateToc         bool
 21 | 	shiftHeadingLevelBy int
 22 | 	fileAsTitle         bool
 23 | 	tocDepth            int
 24 | 	navPath             string
 25 | 	logger              *log.Logger
 26 | 
 27 | 	exportCmd = &cobra.Command{
 28 | 		Use:   "export",
 29 | 		Short: "Export markdown files to other formats",
 30 | 		Long: `Export markdown files to other formats like DOCX, PDF, EPUB.
 31 | Uses Pandoc as the underlying conversion tool.
 32 | 
 33 | Examples:
 34 |   mdctl export -f README.md -o output.docx
 35 |   mdctl export -d docs/ -o documentation.docx
 36 |   mdctl export -d docs/ -s mkdocs -o site_docs.docx
 37 |   mdctl export -d docs/ -o report.docx -t templates/corporate.docx
 38 |   mdctl export -d docs/ -o documentation.docx --shift-heading-level-by 2
 39 |   mdctl export -d docs/ -o documentation.docx --toc --toc-depth 4
 40 |   mdctl export -d docs/ -o documentation.pdf -F pdf`,
 41 | 		RunE: func(cmd *cobra.Command, args []string) error {
 42 | 			// Initialize logger
 43 | 			if verbose {
 44 | 				logger = log.New(os.Stdout, "[EXPORT] ", log.LstdFlags)
 45 | 			} else {
 46 | 				logger = log.New(io.Discard, "", 0)
 47 | 			}
 48 | 
 49 | 			logger.Println("Starting export process...")
 50 | 
 51 | 			// Parameter validation
 52 | 			if exportFile == "" && exportDir == "" {
 53 | 				return fmt.Errorf("either source file (-f) or source directory (-d) must be specified")
 54 | 			}
 55 | 			if exportFile != "" && exportDir != "" {
 56 | 				return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)")
 57 | 			}
 58 | 			if exportOutput == "" {
 59 | 				return fmt.Errorf("output file (-o) must be specified")
 60 | 			}
 61 | 
 62 | 			logger.Printf("Validating parameters: file=%s, dir=%s, output=%s, format=%s, site-type=%s",
 63 | 				exportFile, exportDir, exportOutput, exportFormat, siteType)
 64 | 
 65 | 			// Check if Pandoc is available
 66 | 			logger.Println("Checking Pandoc availability...")
 67 | 			if err := exporter.CheckPandocAvailability(); err != nil {
 68 | 				return err
 69 | 			}
 70 | 			logger.Println("Pandoc is available.")
 71 | 
 72 | 			// Create export options
 73 | 			options := exporter.ExportOptions{
 74 | 				Template:            exportTemplate,
 75 | 				GenerateToc:         generateToc,
 76 | 				ShiftHeadingLevelBy: shiftHeadingLevelBy,
 77 | 				FileAsTitle:         fileAsTitle,
 78 | 				Format:              exportFormat,
 79 | 				SiteType:            siteType,
 80 | 				Verbose:             verbose,
 81 | 				Logger:              logger,
 82 | 				TocDepth:            tocDepth,
 83 | 				NavPath:             navPath,
 84 | 			}
 85 | 
 86 | 			logger.Printf("Export options: template=%s, toc=%v, toc-depth=%d, shift-heading=%d, file-as-title=%v",
 87 | 				exportTemplate, generateToc, tocDepth, shiftHeadingLevelBy, fileAsTitle)
 88 | 
 89 | 			// Execute export
 90 | 			exp := exporter.NewExporter()
 91 | 			var err error
 92 | 
 93 | 			if exportFile != "" {
 94 | 				logger.Printf("Exporting single file: %s -> %s", exportFile, exportOutput)
 95 | 				err = exp.ExportFile(exportFile, exportOutput, options)
 96 | 			} else {
 97 | 				logger.Printf("Exporting directory: %s -> %s", exportDir, exportOutput)
 98 | 				err = exp.ExportDirectory(exportDir, exportOutput, options)
 99 | 			}
100 | 
101 | 			if err != nil {
102 | 				logger.Printf("Export failed: %s", err)
103 | 				return err
104 | 			}
105 | 
106 | 			logger.Println("Export completed successfully.")
107 | 			return nil
108 | 		},
109 | 	}
110 | )
111 | 
112 | func init() {
113 | 	exportCmd.Flags().StringVarP(&exportFile, "file", "f", "", "Source markdown file to export")
114 | 	exportCmd.Flags().StringVarP(&exportDir, "dir", "d", "", "Source directory containing markdown files to export")
115 | 	exportCmd.Flags().StringVarP(&siteType, "site-type", "s", "basic", "Site type (basic, mkdocs, hugo, docusaurus)")
116 | 	exportCmd.Flags().StringVarP(&exportOutput, "output", "o", "", "Output file path")
117 | 	exportCmd.Flags().StringVarP(&exportTemplate, "template", "t", "", "Word template file path")
118 | 	exportCmd.Flags().StringVarP(&exportFormat, "format", "F", "docx", "Output format (docx, pdf, epub)")
119 | 	exportCmd.Flags().BoolVar(&generateToc, "toc", false, "Generate table of contents")
120 | 	exportCmd.Flags().IntVar(&shiftHeadingLevelBy, "shift-heading-level-by", 0, "Shift heading level by N")
121 | 	exportCmd.Flags().BoolVar(&fileAsTitle, "file-as-title", false, "Use filename as section title")
122 | 	exportCmd.Flags().IntVar(&tocDepth, "toc-depth", 3, "Depth of table of contents (default 3)")
123 | 	exportCmd.Flags().StringVarP(&navPath, "nav-path", "n", "", "Specify the navigation path to export (e.g. 'Section1/Subsection2')")
124 | }
125 | 


--------------------------------------------------------------------------------
/internal/linter/fixer.go:
--------------------------------------------------------------------------------
  1 | package linter
  2 | 
  3 | import (
  4 | 	"regexp"
  5 | 	"strings"
  6 | )
  7 | 
  8 | // Fixer provides auto-fix functionality for markdown issues
  9 | type Fixer struct {
 10 | 	rules map[string]func([]string) ([]string, int)
 11 | }
 12 | 
 13 | // NewFixer creates a new fixer instance
 14 | func NewFixer() *Fixer {
 15 | 	f := &Fixer{
 16 | 		rules: make(map[string]func([]string) ([]string, int)),
 17 | 	}
 18 | 
 19 | 	// Register fix functions for each rule
 20 | 	f.rules["MD009"] = f.fixTrailingSpaces
 21 | 	f.rules["MD010"] = f.fixHardTabs
 22 | 	f.rules["MD012"] = f.fixMultipleBlankLines
 23 | 	f.rules["MD018"] = f.fixNoSpaceAfterHash
 24 | 	f.rules["MD019"] = f.fixMultipleSpacesAfterHash
 25 | 	f.rules["MD023"] = f.fixHeadingIndentation
 26 | 	f.rules["MD032"] = f.fixListSpacing
 27 | 	f.rules["MD047"] = f.fixFileEndNewline
 28 | 
 29 | 	return f
 30 | }
 31 | 
 32 | // ApplyFixes applies fixes for the given issues
 33 | func (f *Fixer) ApplyFixes(content string, issues []*Issue) (string, int) {
 34 | 	lines := strings.Split(content, "\n")
 35 | 	totalFixed := 0
 36 | 
 37 | 	// Group issues by rule for efficient processing
 38 | 	ruleIssues := make(map[string][]*Issue)
 39 | 	for _, issue := range issues {
 40 | 		ruleIssues[issue.Rule] = append(ruleIssues[issue.Rule], issue)
 41 | 	}
 42 | 
 43 | 	// Apply fixes for each rule
 44 | 	for rule, ruleSpecificIssues := range ruleIssues {
 45 | 		if fixFunc, exists := f.rules[rule]; exists {
 46 | 			var fixed int
 47 | 			lines, fixed = fixFunc(lines)
 48 | 			totalFixed += fixed
 49 | 
 50 | 			// Mark issues as fixed
 51 | 			for _, issue := range ruleSpecificIssues {
 52 | 				issue.Fixed = true
 53 | 			}
 54 | 		}
 55 | 	}
 56 | 
 57 | 	return strings.Join(lines, "\n"), totalFixed
 58 | }
 59 | 
 60 | // fixTrailingSpaces removes trailing spaces from lines
 61 | func (f *Fixer) fixTrailingSpaces(lines []string) ([]string, int) {
 62 | 	fixed := 0
 63 | 	for i, line := range lines {
 64 | 		trimmed := strings.TrimRight(line, " \t")
 65 | 		if trimmed != line {
 66 | 			lines[i] = trimmed
 67 | 			fixed++
 68 | 		}
 69 | 	}
 70 | 	return lines, fixed
 71 | }
 72 | 
 73 | // fixHardTabs replaces hard tabs with spaces
 74 | func (f *Fixer) fixHardTabs(lines []string) ([]string, int) {
 75 | 	fixed := 0
 76 | 	for i, line := range lines {
 77 | 		if strings.Contains(line, "\t") {
 78 | 			lines[i] = strings.ReplaceAll(line, "\t", "    ")
 79 | 			fixed++
 80 | 		}
 81 | 	}
 82 | 	return lines, fixed
 83 | }
 84 | 
 85 | // fixMultipleBlankLines removes consecutive blank lines
 86 | func (f *Fixer) fixMultipleBlankLines(lines []string) ([]string, int) {
 87 | 	var result []string
 88 | 	fixed := 0
 89 | 	prevBlank := false
 90 | 
 91 | 	for _, line := range lines {
 92 | 		isBlank := strings.TrimSpace(line) == ""
 93 | 
 94 | 		if isBlank && prevBlank {
 95 | 			fixed++ // Count removed blank lines
 96 | 			continue
 97 | 		}
 98 | 
 99 | 		result = append(result, line)
100 | 		prevBlank = isBlank
101 | 	}
102 | 
103 | 	return result, fixed
104 | }
105 | 
106 | // fixNoSpaceAfterHash adds space after hash in headings
107 | func (f *Fixer) fixNoSpaceAfterHash(lines []string) ([]string, int) {
108 | 	fixed := 0
109 | 	re := regexp.MustCompile(`^(#+)([^# ])`)
110 | 
111 | 	for i, line := range lines {
112 | 		trimmed := strings.TrimSpace(line)
113 | 		if re.MatchString(trimmed) {
114 | 			lines[i] = re.ReplaceAllString(trimmed, "$1 $2")
115 | 			fixed++
116 | 		}
117 | 	}
118 | 
119 | 	return lines, fixed
120 | }
121 | 
122 | // fixMultipleSpacesAfterHash removes extra spaces after hash in headings
123 | func (f *Fixer) fixMultipleSpacesAfterHash(lines []string) ([]string, int) {
124 | 	fixed := 0
125 | 	re := regexp.MustCompile(`^(#+)\s{2,}`)
126 | 
127 | 	for i, line := range lines {
128 | 		trimmed := strings.TrimSpace(line)
129 | 		if re.MatchString(trimmed) {
130 | 			lines[i] = re.ReplaceAllString(trimmed, "$1 ")
131 | 			fixed++
132 | 		}
133 | 	}
134 | 
135 | 	return lines, fixed
136 | }
137 | 
138 | // fixHeadingIndentation removes leading spaces from headings
139 | func (f *Fixer) fixHeadingIndentation(lines []string) ([]string, int) {
140 | 	fixed := 0
141 | 	re := regexp.MustCompile(`^ +(#.*)`)
142 | 
143 | 	for i, line := range lines {
144 | 		if re.MatchString(line) {
145 | 			lines[i] = re.ReplaceAllString(line, "$1")
146 | 			fixed++
147 | 		}
148 | 	}
149 | 
150 | 	return lines, fixed
151 | }
152 | 
153 | // fixListSpacing adds blank lines around lists
154 | func (f *Fixer) fixListSpacing(lines []string) ([]string, int) {
155 | 	fixed := 0
156 | 	var result []string
157 | 	listRe := regexp.MustCompile(`^(\s*[*+-] )`)
158 | 
159 | 	for i, line := range lines {
160 | 		if listRe.MatchString(line) {
161 | 			// Check if previous line needs a blank line
162 | 			if i > 0 && strings.TrimSpace(lines[i-1]) != "" && len(result) > 0 {
163 | 				result = append(result, "")
164 | 				fixed++
165 | 			}
166 | 		}
167 | 		result = append(result, line)
168 | 	}
169 | 
170 | 	return result, fixed
171 | }
172 | 
173 | // fixFileEndNewline ensures file ends with single newline
174 | func (f *Fixer) fixFileEndNewline(lines []string) ([]string, int) {
175 | 	if len(lines) == 0 {
176 | 		return lines, 0
177 | 	}
178 | 
179 | 	// Remove trailing empty lines
180 | 	for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" {
181 | 		lines = lines[:len(lines)-1]
182 | 	}
183 | 
184 | 	// Add single empty line at the end
185 | 	lines = append(lines, "")
186 | 
187 | 	return lines, 1
188 | }
189 | 


--------------------------------------------------------------------------------
/internal/markdownfmt/formatter.go:
--------------------------------------------------------------------------------
  1 | package markdownfmt
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"regexp"
  6 | 	"strings"
  7 | )
  8 | 
  9 | // Formatter for formatting markdown content
 10 | type Formatter struct {
 11 | 	// Whether formatting is enabled
 12 | 	enabled bool
 13 | }
 14 | 
 15 | // New creates a new formatter
 16 | func New(enabled bool) *Formatter {
 17 | 	return &Formatter{
 18 | 		enabled: enabled,
 19 | 	}
 20 | }
 21 | 
 22 | // Format formats markdown content
 23 | func (f *Formatter) Format(content string) string {
 24 | 	if !f.enabled {
 25 | 		return content
 26 | 	}
 27 | 
 28 | 	// 1. Split content into lines
 29 | 	lines := strings.Split(content, "\n")
 30 | 
 31 | 	// 2. Process each line
 32 | 	var formatted []string
 33 | 	for i := 0; i < len(lines); i++ {
 34 | 		line := lines[i]
 35 | 
 36 | 		// Process headings: ensure there are blank lines before and after
 37 | 		if isHeading(line) {
 38 | 			// If not the first line and previous line is not blank, add a blank line
 39 | 			if i > 0 && len(strings.TrimSpace(lines[i-1])) > 0 {
 40 | 				formatted = append(formatted, "")
 41 | 			}
 42 | 			// Normalize heading format (one space after #)
 43 | 			line = formatHeading(line)
 44 | 			formatted = append(formatted, line)
 45 | 			// If not the last line, add a blank line
 46 | 			if i < len(lines)-1 {
 47 | 				formatted = append(formatted, "")
 48 | 			}
 49 | 			continue
 50 | 		}
 51 | 
 52 | 		// Process spaces in links
 53 | 		line = formatMarkdownLinks(line)
 54 | 
 55 | 		// Process content in parentheses
 56 | 		line = formatParentheses(line)
 57 | 
 58 | 		// Process spaces between Chinese and English text
 59 | 		line = formatChineseEnglishSpace(line)
 60 | 
 61 | 		formatted = append(formatted, line)
 62 | 	}
 63 | 
 64 | 	// 3. Handle consecutive blank lines
 65 | 	formatted = removeConsecutiveBlankLines(formatted)
 66 | 
 67 | 	// 4. Join lines
 68 | 	result := strings.Join(formatted, "\n")
 69 | 
 70 | 	return result
 71 | }
 72 | 
 73 | // isHeading checks if the line is a heading
 74 | func isHeading(line string) bool {
 75 | 	return strings.HasPrefix(strings.TrimSpace(line), "#")
 76 | }
 77 | 
 78 | // formatHeading formats the heading line
 79 | func formatHeading(line string) string {
 80 | 	// Remove leading spaces
 81 | 	line = strings.TrimSpace(line)
 82 | 	// Ensure only one space between # and text
 83 | 	re := regexp.MustCompile(`^(#+)\s*`)
 84 | 	return re.ReplaceAllString(line, "$1 ")
 85 | }
 86 | 
 87 | // formatParentheses processes the format within parentheses
 88 | func formatParentheses(line string) string {
 89 | 	// First handle http/https links by temporarily replacing them
 90 | 	linkPattern := regexp.MustCompile(`\([^)]*https?://[^)]+\)`)
 91 | 	links := linkPattern.FindAllString(line, -1)
 92 | 	for i, link := range links {
 93 | 		line = strings.Replace(line, link, fmt.Sprintf("__LINK_PLACEHOLDER_%d__", i), 1)
 94 | 	}
 95 | 
 96 | 	// Process regular parentheses content
 97 | 	re := regexp.MustCompile(`\(([^)]+)\)`)
 98 | 	line = re.ReplaceAllStringFunc(line, func(match string) string {
 99 | 		// Extract content within parentheses
100 | 		content := match[1 : len(match)-1]
101 | 		// Clean leading and trailing spaces
102 | 		content = strings.TrimSpace(content)
103 | 		// Replace consecutive spaces with a single space
104 | 		content = regexp.MustCompile(`\s+`).ReplaceAllString(content, " ")
105 | 		return fmt.Sprintf("(%s)", content)
106 | 	})
107 | 
108 | 	// Restore links
109 | 	for i, link := range links {
110 | 		line = strings.Replace(line, fmt.Sprintf("__LINK_PLACEHOLDER_%d__", i), link, 1)
111 | 	}
112 | 
113 | 	return line
114 | }
115 | 
116 | // formatMarkdownLinks processes spaces in markdown links
117 | func formatMarkdownLinks(line string) string {
118 | 	// Match markdown link format [text](url), including possible spaces
119 | 	linkPattern := regexp.MustCompile(`\[(.*?)\]\(\s*(.*?)\s*\)`)
120 | 
121 | 	// Process spaces in link text and URL
122 | 	line = linkPattern.ReplaceAllStringFunc(line, func(match string) string {
123 | 		// Extract link text and URL
124 | 		parts := linkPattern.FindStringSubmatch(match)
125 | 		if len(parts) != 3 {
126 | 			return match
127 | 		}
128 | 
129 | 		text := parts[1]
130 | 		url := parts[2]
131 | 
132 | 		// Clean spaces in URL
133 | 		url = strings.TrimSpace(url)
134 | 		// Remove all spaces and invisible characters in URL
135 | 		url = regexp.MustCompile(`[\s\p{Zs}\p{C}]+`).ReplaceAllString(url, "")
136 | 
137 | 		// Keep spaces in link text, but clean leading/trailing spaces and consecutive spaces
138 | 		text = strings.TrimSpace(text)
139 | 		text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
140 | 
141 | 		// Reassemble link
142 | 		return fmt.Sprintf("[%s](%s)", text, url)
143 | 	})
144 | 
145 | 	// Process spaces in heading links
146 | 	headingLinkPattern := regexp.MustCompile(`\]\(#(.*?)\)`)
147 | 	line = headingLinkPattern.ReplaceAllStringFunc(line, func(match string) string {
148 | 		parts := headingLinkPattern.FindStringSubmatch(match)
149 | 		if len(parts) != 2 {
150 | 			return match
151 | 		}
152 | 
153 | 		anchor := parts[1]
154 | 		// Remove all spaces
155 | 		anchor = regexp.MustCompile(`\s+`).ReplaceAllString(anchor, "")
156 | 		return fmt.Sprintf("](#%s)", anchor)
157 | 	})
158 | 
159 | 	return line
160 | }
161 | 
162 | // formatChineseEnglishSpace adds spaces between Chinese and English text
163 | func formatChineseEnglishSpace(line string) string {
164 | 	// Match boundaries between Chinese and English/numbers
165 | 	re := regexp.MustCompile(`([\p{Han}])([A-Za-z0-9])`)
166 | 	line = re.ReplaceAllString(line, "$1 $2")
167 | 
168 | 	re = regexp.MustCompile(`([A-Za-z0-9])([\p{Han}])`)
169 | 	line = re.ReplaceAllString(line, "$1 $2")
170 | 
171 | 	return line
172 | }
173 | 
174 | // removeConsecutiveBlankLines removes consecutive blank lines
175 | func removeConsecutiveBlankLines(lines []string) []string {
176 | 	var result []string
177 | 	isPrevLineBlank := false
178 | 
179 | 	for _, line := range lines {
180 | 		isCurrentLineBlank := len(strings.TrimSpace(line)) == 0
181 | 
182 | 		if !isCurrentLineBlank || !isPrevLineBlank {
183 | 			result = append(result, line)
184 | 		}
185 | 
186 | 		isPrevLineBlank = isCurrentLineBlank
187 | 	}
188 | 
189 | 	return result
190 | }
191 | 


--------------------------------------------------------------------------------
/docs/features/export.md:
--------------------------------------------------------------------------------
  1 | # Export 功能设计文档
  2 | 
  3 | ## 功能概述
  4 | 
  5 | 为 mdctl 工具增加 `export` 子命令，用于将 Markdown 文件导出为其他格式。第一版将优先支持导出为 Word 文档格式（docx），后续可扩展支持更多格式（如 PDF、EPUB 等）。
  6 | 
  7 | 该功能将利用 Pandoc 作为底层导出工具，支持 Pandoc 的模板系统，允许用户配置自定义的导出模板。
  8 | 
  9 | ## 用户需求
 10 | 
 11 | 1. 支持将单个 Markdown 文件导出为 Word 格式
 12 | 2. 支持将多个 Markdown 文件合并后导出为单个 Word 文档
 13 | 3. 支持按照文件夹中的文件名顺序合并文件
 14 | 4. 支持多种文档系统（MkDocs 第一期、Hugo、Docusaurus coming soon）的文件读取方式
 15 | 5. 在合并过程中智能调整标题层级，保持文档结构的清晰性
 16 | 6. 支持自定义 Word 模板，使最终文档具有一致的样式
 17 | 
 18 | ## 命令设计
 19 | 
 20 | ```
 21 | mdctl export [flags]
 22 | ```
 23 | 
 24 | ### 参数设计
 25 | 
 26 | - `-f, --file`: 指定单个 Markdown 文件进行导出
 27 | - `-d, --dir`: 指定包含多个 Markdown 文件的目录
 28 | - `-s, --site-type`: 指定文档站点类型，可选值：mkdocs, hugo, docusaurus（默认：mkdocs）
 29 | - `-o, --output`: 指定输出文件路径
 30 | - `-t, --template`: 指定 Word 模板文件路径
 31 | - `-F, --format`: 指定输出格式，可选值：docx, pdf, epub（默认：docx）
 32 | - `--toc`: 是否生成目录（默认：false）
 33 | - `--shift-heading-level-by`: 标题层级偏移量（默认：0）
 34 | - `--file-as-title`: 是否使用文件名作为章节标题（默认：false）
 35 | 
 36 | ### 使用示例
 37 | 
 38 | ```bash
 39 | # 导出单个文件
 40 | mdctl export -f README.md -o output.docx
 41 | 
 42 | # 导出整个目录
 43 | mdctl export -d docs/ -o documentation.docx
 44 | 
 45 | # 导出 MkDocs 站点
 46 | mdctl export -d docs/ -s mkdocs -o site_docs.docx
 47 | 
 48 | # 导出 Hugo 站点
 49 | mdctl export -d content/ -s hugo -o hugo_docs.docx
 50 | 
 51 | # 使用自定义模板
 52 | mdctl export -d docs/ -o report.docx -t templates/corporate.docx
 53 | 
 54 | # 指定标题层级偏移量
 55 | mdctl export -d docs/ -o documentation.docx --shift-heading-level-by 2
 56 | 
 57 | # 导出为 PDF 格式
 58 | mdctl export -d docs/ -o documentation.pdf -F pdf
 59 | ```
 60 | 
 61 | ## 实现设计
 62 | 
 63 | ### 整体架构
 64 | 
 65 | 按照项目的现有结构，我们将在 `cmd/` 目录下创建 `export.go` 文件定义命令接口，在 `internal/` 目录下创建 `exporter/` 模块实现具体功能。
 66 | 
 67 | ```
 68 | mdctl/
 69 | ├── cmd/
 70 | │   └── export.go        # 新增：export 命令定义
 71 | ├── internal/
 72 | │   └── exporter/        # 新增：导出功能实现
 73 | │       ├── exporter.go  # 导出器接口定义
 74 | │       ├── pandoc.go    # Pandoc 导出实现
 75 | │       ├── merger.go    # Markdown 合并实现
 76 | │       ├── sitereader/  # 新增：不同文档系统的站点结构读取
 77 | │       │   ├── reader.go    # 站点读取器接口
 78 | │       │   ├── mkdocs.go    # MkDocs 站点读取
 79 | │       │   ├── hugo.go      # Hugo 站点读取
 80 | │       │   └── docusaurus.go # Docusaurus 站点读取
 81 | │       └── heading.go   # 标题处理实现
 82 | ```
 83 | 
 84 | ### 核心组件
 85 | 
 86 | #### 1. 命令处理器 (cmd/export.go)
 87 | 
 88 | 负责解析命令行参数并调用导出功能。
 89 | 
 90 | ```go
 91 | var (
 92 |     exportFile         string
 93 |     exportDir          string
 94 |     siteType           string
 95 |     configFile         string
 96 |     exportOutput       string
 97 |     exportTemplate     string
 98 |     exportFormat       string
 99 |     pandocPath         string
100 |     generateToc        bool
101 |     shiftHeadingLevelBy int
102 |     fileAsTitle        bool
103 | 
104 |     exportCmd = &cobra.Command{
105 |         Use:   "export",
106 |         Short: "Export markdown files to other formats",
107 |         Long:  `...`,
108 |         RunE: func(cmd *cobra.Command, args []string) error {
109 |             // 参数验证和处理逻辑
110 |             // 调用 internal/exporter 的功能
111 |         },
112 |     }
113 | )
114 | ```
115 | 
116 | #### 2. 导出器接口 (internal/exporter/exporter.go)
117 | 
118 | 定义导出功能的通用接口，支持扩展其他格式。
119 | 
120 | ```go
121 | type Exporter interface {
122 |     Export(input string, output string, options ExportOptions) error
123 | }
124 | 
125 | type ExportOptions struct {
126 |     Template            string
127 |     GenerateToc         bool
128 |     ShiftHeadingLevelBy int
129 |     FileAsTitle         bool
130 |     Format              string
131 |     // 其他选项
132 | }
133 | ```
134 | 
135 | #### 3. Pandoc 导出实现 (internal/exporter/pandoc.go)
136 | 
137 | 使用 Pandoc 工具实现导出功能。
138 | 
139 | ```go
140 | type PandocExporter struct {
141 |     PandocPath string
142 | }
143 | 
144 | func (e *PandocExporter) Export(input, output string, options ExportOptions) error {
145 |     // 构建并执行 Pandoc 命令
146 |     // 如果 pandoc 不可用，返回明确的错误提示
147 | }
148 | ```
149 | 
150 | #### 4. 站点结构读取器 (internal/exporter/sitereader/)
151 | 
152 | 负责识别和解析不同文档系统的站点结构。
153 | 
154 | ```go
155 | // 站点读取器接口
156 | type SiteReader interface {
157 |     // 检测给定目录是否为此类型的站点
158 |     Detect(dir string) bool
159 |     
160 |     // 读取站点结构，返回按顺序排列的文件列表
161 |     ReadStructure(dir string, configPath string) ([]string, error)
162 | }
163 | 
164 | // 工厂函数，根据站点类型返回相应的读取器
165 | func GetSiteReader(siteType string) (SiteReader, error) {
166 |     // 返回对应类型的读取器实现
167 | }
168 | ```
169 | 
170 | #### 5. Markdown 合并器 (internal/exporter/merger.go)
171 | 
172 | 负责合并多个 Markdown 文件。
173 | 
174 | ```go
175 | type Merger struct {
176 |     ShiftHeadingLevelBy int
177 |     FileAsTitle         bool
178 | }
179 | 
180 | func (m *Merger) Merge(sources []string, target string) error {
181 |     // 合并多个 Markdown 文件的逻辑
182 |     // 自动处理标题层级
183 | }
184 | ```
185 | 
186 | #### 6. 标题处理器 (internal/exporter/heading.go)
187 | 
188 | 处理 Markdown 文件中的标题层级。
189 | 
190 | ```go
191 | func ShiftHeadings(content string, levels int) string {
192 |     // 调整标题层级的逻辑
193 | }
194 | ```
195 | 
196 | ### 工作流程
197 | 
198 | 1. **命令解析**：解析用户提供的命令行参数
199 | 2. **文件收集**：根据参数收集需要处理的 Markdown 文件
200 |    - 单文件模式：直接使用指定文件
201 |    - 目录模式：收集目录中的所有 Markdown 文件并按文件名排序
202 |    - 站点模式：使用相应的站点读取器解析站点结构
203 | 3. **文件合并**：如果有多个文件，将它们合并为一个临时 Markdown 文件
204 |    - 自动调整每个文件的标题层级
205 |    - 可选添加文件名作为章节标题
206 | 4. **格式转换**：使用 Pandoc 将 Markdown 转换为目标格式
207 |    - 应用用户指定的模板（如果有）
208 |    - 生成目录（如果启用）
209 | 5. **输出处理**：将最终结果输出到用户指定的路径
210 | 
211 | ## 标题层级处理策略
212 | 
213 | 为了解决多文件合并时标题层级的问题，系统将自动处理标题层级：
214 | 
215 | 1. 每个文件的标题层级将按照指定的偏移量调整：
216 |    - H1 -> H(1+偏移量)
217 |    - H2 -> H(2+偏移量)
218 |    - ...
219 |    - 如果调整后超过 H6，将转换为加粗文本 (**文本**)
220 | 
221 | 2. 如果启用了文件名作为标题功能，会自动在每个文件内容前添加对应层级的标题
222 | 
223 | 3. 系统会自动处理标题的相对层级关系，确保文档结构的逻辑性
224 | 
225 | ## 依赖条件
226 | 
227 | **Pandoc**：需要系统中安装 Pandoc 工具
228 | - 在执行导出命令时检查 Pandoc 是否可用
229 | - 如果找不到 Pandoc，提供明确的错误信息和安装指导
230 | 
231 | ## 错误处理
232 | 
233 | 1. Pandoc 不可用时提供明确的错误信息和安装指导
234 | 2. 文件不存在或无法访问时的错误处理
235 | 3. 合并过程中可能出现的格式问题处理
236 | 4. 模板文件异常的处理
237 | 5. 不支持的站点类型或配置文件处理
238 | 
239 | ## 未来扩展
240 | 
241 | 1. 增强模板管理功能，支持模板下载和更新
242 | 2. 支持更多的文档站点系统
243 | 3. 支持更复杂的文档结构处理，如自动生成封面、页眉页脚
244 | 4. 集成图表和公式渲染功能


--------------------------------------------------------------------------------
/internal/processor/processor.go:
--------------------------------------------------------------------------------
  1 | package processor
  2 | 
  3 | import (
  4 | 	"crypto/md5"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"net/http"
  8 | 	"os"
  9 | 	"path/filepath"
 10 | 	"regexp"
 11 | 	"strings"
 12 | )
 13 | 
 14 | type Processor struct {
 15 | 	SourceFile     string
 16 | 	SourceDir      string
 17 | 	ImageOutputDir string
 18 | }
 19 | 
 20 | func New(sourceFile, sourceDir, imageOutputDir string) *Processor {
 21 | 	return &Processor{
 22 | 		SourceFile:     sourceFile,
 23 | 		SourceDir:      sourceDir,
 24 | 		ImageOutputDir: imageOutputDir,
 25 | 	}
 26 | }
 27 | 
 28 | func (p *Processor) Process() error {
 29 | 	if p.SourceFile != "" {
 30 | 		return p.processFile(p.SourceFile)
 31 | 	}
 32 | 	return p.processDirectory(p.SourceDir)
 33 | }
 34 | 
 35 | func (p *Processor) processDirectory(dir string) error {
 36 | 	fmt.Printf("Processing directory: %s\n", dir)
 37 | 	return filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
 38 | 		if err != nil {
 39 | 			return err
 40 | 		}
 41 | 		if !info.IsDir() && (strings.HasSuffix(path, ".md") || strings.HasSuffix(path, ".markdown")) {
 42 | 			return p.processFile(path)
 43 | 		}
 44 | 		return nil
 45 | 	})
 46 | }
 47 | 
 48 | func (p *Processor) processFile(filePath string) error {
 49 | 	fmt.Printf("Processing file: %s\n", filePath)
 50 | 	content, err := os.ReadFile(filePath)
 51 | 	if err != nil {
 52 | 		return fmt.Errorf("failed to read file %s: %v", filePath, err)
 53 | 	}
 54 | 
 55 | 	// Determine image output directory
 56 | 	imgDir := p.determineImageDir(filePath)
 57 | 	if err := os.MkdirAll(imgDir, 0755); err != nil {
 58 | 		return fmt.Errorf("failed to create image directory %s: %v", imgDir, err)
 59 | 	}
 60 | 
 61 | 	// Find all image links
 62 | 	imgRegex := regexp.MustCompile(`!\[([^\]]*)\]\(([^)]+)\)`)
 63 | 	matches := imgRegex.FindAllStringSubmatch(string(content), -1)
 64 | 
 65 | 	fmt.Printf("Found %d images in file %s\n", len(matches), filePath)
 66 | 
 67 | 	newContent := string(content)
 68 | 	for _, match := range matches {
 69 | 		imgAlt := match[1]
 70 | 		imgURL := match[2]
 71 | 
 72 | 		// Replace image URL starting with "//" to "https://"
 73 | 		if strings.HasPrefix(imgURL, "//") {
 74 | 			imgURL = strings.Replace(imgURL, "//", "https://", 1)
 75 | 		}
 76 | 		// Skip local images
 77 | 		if !strings.HasPrefix(imgURL, "http://") && !strings.HasPrefix(imgURL, "https://") {
 78 | 			continue
 79 | 		}
 80 | 
 81 | 		// Download and save image
 82 | 		localPath, err := p.downloadImage(imgURL, imgDir)
 83 | 		if err != nil {
 84 | 			fmt.Printf("Warning: Failed to download image %s: %v\n", imgURL, err)
 85 | 			continue
 86 | 		}
 87 | 
 88 | 		// Calculate relative path
 89 | 		relPath, err := filepath.Rel(filepath.Dir(filePath), localPath)
 90 | 		if err != nil {
 91 | 			fmt.Printf("Warning: Failed to calculate relative path: %v\n", err)
 92 | 			continue
 93 | 		}
 94 | 
 95 | 		// Replace image link
 96 | 		oldLink := fmt.Sprintf("![%s](%s)", match[1], match[2])
 97 | 		newLink := fmt.Sprintf("![%s](%s)", imgAlt, relPath)
 98 | 		newContent = strings.Replace(newContent, oldLink, newLink, 1)
 99 | 	}
100 | 
101 | 	// Write back to file
102 | 	if err := os.WriteFile(filePath, []byte(newContent), 0644); err != nil {
103 | 		return fmt.Errorf("failed to write file %s: %v", filePath, err)
104 | 	}
105 | 
106 | 	return nil
107 | }
108 | 
109 | func (p *Processor) determineImageDir(filePath string) string {
110 | 	if p.ImageOutputDir != "" {
111 | 		return p.ImageOutputDir
112 | 	}
113 | 	if p.SourceDir != "" {
114 | 		return filepath.Join(p.SourceDir, "images")
115 | 	}
116 | 	return filepath.Join(filepath.Dir(filePath), "images")
117 | }
118 | 
119 | func (p *Processor) downloadImage(url string, destDir string) (string, error) {
120 | 	resp, err := http.Get(url)
121 | 	if err != nil {
122 | 		return "", err
123 | 	}
124 | 	defer resp.Body.Close()
125 | 
126 | 	// Get filename from URL or Content-Disposition
127 | 	filename := getFilenameFromURL(url, resp)
128 | 
129 | 	// If no extension, try to get from Content-Type
130 | 	if filepath.Ext(filename) == "" {
131 | 		contentType := resp.Header.Get("Content-Type")
132 | 		ext := getExtensionFromContentType(contentType)
133 | 		if ext != "" {
134 | 			filename += ext
135 | 		}
136 | 	}
137 | 
138 | 	// Ensure filename is unique
139 | 	hash := md5.New()
140 | 	io.WriteString(hash, url)
141 | 	urlHash := fmt.Sprintf("%x", hash.Sum(nil))[:8]
142 | 
143 | 	ext := filepath.Ext(filename)
144 | 	basename := strings.TrimSuffix(filename, ext)
145 | 	filename = fmt.Sprintf("%s_%s%s", basename, urlHash, ext)
146 | 
147 | 	localPath := filepath.Join(destDir, filename)
148 | 
149 | 	// Create target file
150 | 	out, err := os.Create(localPath)
151 | 	if err != nil {
152 | 		return "", err
153 | 	}
154 | 	defer out.Close()
155 | 
156 | 	// Write to file
157 | 	_, err = io.Copy(out, resp.Body)
158 | 	if err != nil {
159 | 		return "", err
160 | 	}
161 | 
162 | 	fmt.Printf("Downloaded image to: %s\n", localPath)
163 | 	return localPath, nil
164 | }
165 | 
166 | func getFilenameFromURL(url string, resp *http.Response) string {
167 | 	// First try to get from Content-Disposition
168 | 	if cd := resp.Header.Get("Content-Disposition"); cd != "" {
169 | 		if strings.Contains(cd, "filename=") {
170 | 			parts := strings.Split(cd, "filename=")
171 | 			if len(parts) > 1 {
172 | 				filename := strings.Trim(parts[1], `"'`)
173 | 				if filename != "" {
174 | 					return filename
175 | 				}
176 | 			}
177 | 		}
178 | 	}
179 | 
180 | 	// Get from URL path
181 | 	parts := strings.Split(url, "/")
182 | 	if len(parts) > 0 {
183 | 		filename := parts[len(parts)-1]
184 | 		// Remove URL parameters
185 | 		if idx := strings.Index(filename, "?"); idx != -1 {
186 | 			filename = filename[:idx]
187 | 		}
188 | 		// Remove trailing "@" character
189 | 		if idx := strings.LastIndex(filename, "@"); idx != -1 {
190 | 			if idx > strings.LastIndex(filename, ".") {
191 | 				filename = filename[:idx]
192 | 			}
193 | 		}
194 | 		if filename != "" {
195 | 			return filename
196 | 		}
197 | 	}
198 | 
199 | 	// Use default name
200 | 	return "image"
201 | }
202 | 
203 | func getExtensionFromContentType(contentType string) string {
204 | 	switch contentType {
205 | 	case "image/jpeg", "image/jpg":
206 | 		return ".jpg"
207 | 	case "image/png":
208 | 		return ".png"
209 | 	case "image/gif":
210 | 		return ".gif"
211 | 	case "image/webp":
212 | 		return ".webp"
213 | 	default:
214 | 		return ""
215 | 	}
216 | }
217 | 


--------------------------------------------------------------------------------
/internal/linter/linter.go:
--------------------------------------------------------------------------------
  1 | package linter
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"os"
  7 | 	"strings"
  8 | 
  9 | 	"github.com/samzong/mdctl/internal/markdownfmt"
 10 | )
 11 | 
 12 | // Config holds the linter configuration
 13 | type Config struct {
 14 | 	AutoFix      bool
 15 | 	OutputFormat string
 16 | 	RulesFile    string
 17 | 	EnableRules  []string
 18 | 	DisableRules []string
 19 | 	Verbose      bool
 20 | }
 21 | 
 22 | // Issue represents a linting issue
 23 | type Issue struct {
 24 | 	Line    int    `json:"line"`
 25 | 	Column  int    `json:"column,omitempty"`
 26 | 	Rule    string `json:"rule"`
 27 | 	Message string `json:"message"`
 28 | 	Context string `json:"context,omitempty"`
 29 | 	Fixed   bool   `json:"fixed,omitempty"`
 30 | }
 31 | 
 32 | // Result holds the linting results for a file
 33 | type Result struct {
 34 | 	Filename   string   `json:"filename"`
 35 | 	Issues     []*Issue `json:"issues"`
 36 | 	FixedCount int      `json:"fixed_count"`
 37 | }
 38 | 
 39 | // Linter performs markdown linting
 40 | type Linter struct {
 41 | 	config    *Config
 42 | 	rules     *RuleSet
 43 | 	formatter *markdownfmt.Formatter
 44 | 	fixer     *Fixer
 45 | }
 46 | 
 47 | // New creates a new linter instance
 48 | func New(config *Config) *Linter {
 49 | 	rules := NewRuleSet()
 50 | 
 51 | 	// Load configuration file if specified
 52 | 	if config.RulesFile != "" {
 53 | 		if configFile, err := LoadConfigFile(config.RulesFile); err == nil {
 54 | 			configFile.ApplyToRuleSet(rules)
 55 | 		} else if config.Verbose {
 56 | 			fmt.Printf("Warning: Could not load rules file %s: %v\n", config.RulesFile, err)
 57 | 		}
 58 | 	} else {
 59 | 		// Try to find and load default config file
 60 | 		if configFile, err := LoadConfigFile(""); err == nil {
 61 | 			configFile.ApplyToRuleSet(rules)
 62 | 		}
 63 | 	}
 64 | 
 65 | 	// Apply rule configuration from command line
 66 | 	if len(config.EnableRules) > 0 {
 67 | 		rules.EnableOnly(config.EnableRules)
 68 | 	}
 69 | 
 70 | 	if len(config.DisableRules) > 0 {
 71 | 		rules.Disable(config.DisableRules)
 72 | 	}
 73 | 
 74 | 	return &Linter{
 75 | 		config:    config,
 76 | 		rules:     rules,
 77 | 		formatter: markdownfmt.New(true), // Enable formatter for auto-fix
 78 | 		fixer:     NewFixer(),
 79 | 	}
 80 | }
 81 | 
 82 | // LintFile lints a single markdown file
 83 | func (l *Linter) LintFile(filename string) (*Result, error) {
 84 | 	// Check file size limit (10MB)
 85 | 	const maxFileSize = 10 * 1024 * 1024
 86 | 	if info, err := os.Stat(filename); err == nil {
 87 | 		if info.Size() > maxFileSize {
 88 | 			return nil, fmt.Errorf("file too large: %s (max %d bytes)", filename, maxFileSize)
 89 | 		}
 90 | 	}
 91 | 
 92 | 	content, err := os.ReadFile(filename)
 93 | 	if err != nil {
 94 | 		return nil, fmt.Errorf("failed to read file: %v", err)
 95 | 	}
 96 | 
 97 | 	return l.LintContent(filename, string(content))
 98 | }
 99 | 
100 | // LintContent lints markdown content
101 | func (l *Linter) LintContent(filename, content string) (*Result, error) {
102 | 	result := &Result{
103 | 		Filename: filename,
104 | 		Issues:   []*Issue{},
105 | 	}
106 | 
107 | 	lines := strings.Split(content, "\n")
108 | 
109 | 	// Apply all enabled rules
110 | 	for _, rule := range l.rules.GetEnabledRules() {
111 | 		issues := rule.Check(lines)
112 | 		result.Issues = append(result.Issues, issues...)
113 | 	}
114 | 
115 | 	// Apply auto-fix if requested
116 | 	if l.config.AutoFix && len(result.Issues) > 0 {
117 | 		fixedContent, fixedCount := l.applyFixes(content, result.Issues)
118 | 		result.FixedCount = fixedCount
119 | 
120 | 		// Write fixed content back to file with backup
121 | 		if fixedCount > 0 {
122 | 			// Create backup before modifying the file
123 | 			if err := l.createBackup(filename); err != nil {
124 | 				return nil, fmt.Errorf("failed to create backup: %v", err)
125 | 			}
126 | 
127 | 			if err := os.WriteFile(filename, []byte(fixedContent), 0644); err != nil {
128 | 				return nil, fmt.Errorf("failed to write fixed content: %v", err)
129 | 			}
130 | 
131 | 			// Mark issues as fixed
132 | 			for _, issue := range result.Issues {
133 | 				if issue.Rule != "MD013" { // Don't mark line length issues as fixed automatically
134 | 					issue.Fixed = true
135 | 				}
136 | 			}
137 | 		}
138 | 	}
139 | 
140 | 	return result, nil
141 | }
142 | 
143 | // applyFixes applies automatic fixes to the content
144 | func (l *Linter) applyFixes(content string, issues []*Issue) (string, int) {
145 | 	// Use the dedicated fixer for rule-specific fixes
146 | 	fixedContent, fixedCount := l.fixer.ApplyFixes(content, issues)
147 | 
148 | 	// Then apply general formatting fixes
149 | 	finalContent := l.formatter.Format(fixedContent)
150 | 
151 | 	// If formatter made additional changes, count them
152 | 	if finalContent != fixedContent && fixedCount == 0 {
153 | 		fixedCount = l.countFixableIssues(issues)
154 | 	}
155 | 
156 | 	return finalContent, fixedCount
157 | }
158 | 
159 | // createBackup creates a backup of the file before modification
160 | func (l *Linter) createBackup(filename string) error {
161 | 	backupFilename := filename + ".orig"
162 | 
163 | 	// Open source file
164 | 	src, err := os.Open(filename)
165 | 	if err != nil {
166 | 		return fmt.Errorf("failed to open source file: %v", err)
167 | 	}
168 | 	defer src.Close()
169 | 
170 | 	// Create backup file
171 | 	dst, err := os.Create(backupFilename)
172 | 	if err != nil {
173 | 		return fmt.Errorf("failed to create backup file: %v", err)
174 | 	}
175 | 	defer dst.Close()
176 | 
177 | 	// Copy content
178 | 	_, err = io.Copy(dst, src)
179 | 	if err != nil {
180 | 		return fmt.Errorf("failed to copy content to backup: %v", err)
181 | 	}
182 | 
183 | 	return nil
184 | }
185 | 
186 | // countFixableIssues counts how many issues can be automatically fixed
187 | func (l *Linter) countFixableIssues(issues []*Issue) int {
188 | 	fixableRules := map[string]bool{
189 | 		"MD009": true, // Trailing spaces
190 | 		"MD010": true, // Hard tabs
191 | 		"MD012": true, // Multiple consecutive blank lines
192 | 		"MD018": true, // No space after hash on atx style heading
193 | 		"MD019": true, // Multiple spaces after hash on atx style heading
194 | 		"MD023": true, // Headings must start at the beginning of the line
195 | 		"MD047": true, // Files should end with a single newline character
196 | 	}
197 | 
198 | 	count := 0
199 | 	for _, issue := range issues {
200 | 		if fixableRules[issue.Rule] {
201 | 			count++
202 | 		}
203 | 	}
204 | 	return count
205 | }
206 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI=
 2 | github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
 3 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
 4 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
 5 | github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk=
 6 | github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
 7 | github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 8 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 9 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
10 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
11 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
12 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
13 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
14 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
15 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
16 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
17 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
18 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
19 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
20 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
21 | github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
22 | github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
23 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
24 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
25 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
26 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
27 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
28 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
29 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
30 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
31 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
32 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
33 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
34 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
35 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
36 | golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
37 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
38 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
39 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
40 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
41 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
42 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
43 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
44 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
45 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
46 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
47 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
48 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
49 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
50 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
51 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
52 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
53 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
54 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
55 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
56 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
57 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
58 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
59 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
60 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
61 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
62 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
63 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
64 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
65 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
66 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
67 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
68 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
69 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
70 | 


--------------------------------------------------------------------------------
/internal/linter/linter_test.go:
--------------------------------------------------------------------------------
  1 | package linter
  2 | 
  3 | import (
  4 | 	"os"
  5 | 	"testing"
  6 | )
  7 | 
  8 | func TestLinter_LintContent(t *testing.T) {
  9 | 	tests := []struct {
 10 | 		name        string
 11 | 		content     string
 12 | 		expectRules []string // Expected rule IDs that should trigger
 13 | 		expectCount int      // Expected number of issues
 14 | 	}{
 15 | 		{
 16 | 			name:        "valid markdown",
 17 | 			content:     "# Title\n\nThis is valid markdown.\n",
 18 | 			expectRules: []string{},
 19 | 			expectCount: 0,
 20 | 		},
 21 | 		{
 22 | 			name:        "trailing spaces",
 23 | 			content:     "# Title  \n\nContent with trailing spaces.  \n",
 24 | 			expectRules: []string{"MD009"},
 25 | 			expectCount: 2,
 26 | 		},
 27 | 		{
 28 | 			name:        "hard tabs",
 29 | 			content:     "# Title\n\n\tContent with hard tab.\n",
 30 | 			expectRules: []string{"MD010"},
 31 | 			expectCount: 1,
 32 | 		},
 33 | 		{
 34 | 			name:        "multiple blank lines",
 35 | 			content:     "# Title\n\n\n\nContent after multiple blank lines.\n",
 36 | 			expectRules: []string{"MD012"},
 37 | 			expectCount: 2, // MD012 triggers for each set of consecutive blank lines
 38 | 		},
 39 | 		{
 40 | 			name:        "no space after hash",
 41 | 			content:     "#Title\n\nContent.\n",
 42 | 			expectRules: []string{"MD018"},
 43 | 			expectCount: 1,
 44 | 		},
 45 | 		{
 46 | 			name:        "multiple spaces after hash",
 47 | 			content:     "#  Title\n\nContent.\n",
 48 | 			expectRules: []string{"MD019"},
 49 | 			expectCount: 1,
 50 | 		},
 51 | 		{
 52 | 			name:        "heading not at start of line",
 53 | 			content:     "Some text\n # Title\n\nContent.\n",
 54 | 			expectRules: []string{"MD023"},
 55 | 			expectCount: 1,
 56 | 		},
 57 | 		{
 58 | 			name:        "list without blank line before",
 59 | 			content:     "# Title\nSome text\n- List item\n\nContent.\n",
 60 | 			expectRules: []string{"MD032"},
 61 | 			expectCount: 1,
 62 | 		},
 63 | 		{
 64 | 			name:        "list without blank line after",
 65 | 			content:     "# Title\n\n- List item\nSome text\n",
 66 | 			expectRules: []string{"MD032"},
 67 | 			expectCount: 1,
 68 | 		},
 69 | 		{
 70 | 			name:        "file not ending with newline",
 71 | 			content:     "# Title\n\nContent without final newline",
 72 | 			expectRules: []string{"MD047"},
 73 | 			expectCount: 1,
 74 | 		},
 75 | 		{
 76 | 			name:        "file ending with multiple newlines",
 77 | 			content:     "# Title\n\nContent.\n\n",
 78 | 			expectRules: []string{"MD047", "MD012"},
 79 | 			expectCount: 2, // Both MD047 and MD012 trigger
 80 | 		},
 81 | 	}
 82 | 
 83 | 	for _, tt := range tests {
 84 | 		t.Run(tt.name, func(t *testing.T) {
 85 | 			linter := New(&Config{})
 86 | 			result, err := linter.LintContent("test.md", tt.content)
 87 | 
 88 | 			if err != nil {
 89 | 				t.Fatalf("LintContent failed: %v", err)
 90 | 			}
 91 | 
 92 | 			if len(result.Issues) != tt.expectCount {
 93 | 				t.Errorf("Expected %d issues, got %d", tt.expectCount, len(result.Issues))
 94 | 				for _, issue := range result.Issues {
 95 | 					t.Logf("Issue: %s - %s", issue.Rule, issue.Message)
 96 | 				}
 97 | 			}
 98 | 
 99 | 			// Check that expected rules are triggered
100 | 			foundRules := make(map[string]bool)
101 | 			for _, issue := range result.Issues {
102 | 				foundRules[issue.Rule] = true
103 | 			}
104 | 
105 | 			for _, expectedRule := range tt.expectRules {
106 | 				if !foundRules[expectedRule] {
107 | 					t.Errorf("Expected rule %s to be triggered, but it wasn't", expectedRule)
108 | 				}
109 | 			}
110 | 		})
111 | 	}
112 | }
113 | 
114 | func TestLinter_AutoFix(t *testing.T) {
115 | 	tests := []struct {
116 | 		name           string
117 | 		content        string
118 | 		expectFixed    bool
119 | 		expectFixCount int
120 | 		expectRules    []string
121 | 	}{
122 | 		{
123 | 			name:           "fix trailing spaces",
124 | 			content:        "# Title  \n\nContent with trailing spaces.  \n",
125 | 			expectFixed:    true,
126 | 			expectFixCount: 2,
127 | 			expectRules:    []string{"MD009"},
128 | 		},
129 | 		{
130 | 			name:           "fix hard tabs",
131 | 			content:        "# Title\n\n\tContent with hard tab.\n",
132 | 			expectFixed:    true,
133 | 			expectFixCount: 1,
134 | 			expectRules:    []string{"MD010"},
135 | 		},
136 | 		{
137 | 			name:           "fix multiple blank lines",
138 | 			content:        "# Title\n\n\n\nContent after multiple blank lines.\n",
139 | 			expectFixed:    true,
140 | 			expectFixCount: 2, // MD012 triggers multiple times
141 | 			expectRules:    []string{"MD012"},
142 | 		},
143 | 	}
144 | 
145 | 	for _, tt := range tests {
146 | 		t.Run(tt.name, func(t *testing.T) {
147 | 			// Create a temporary file
148 | 			tmpFile, err := os.CreateTemp("", "test_*.md")
149 | 			if err != nil {
150 | 				t.Fatalf("Failed to create temp file: %v", err)
151 | 			}
152 | 			defer os.Remove(tmpFile.Name())
153 | 			defer os.Remove(tmpFile.Name() + ".orig") // Remove backup file
154 | 
155 | 			// Write content to temp file
156 | 			if _, err := tmpFile.WriteString(tt.content); err != nil {
157 | 				t.Fatalf("Failed to write to temp file: %v", err)
158 | 			}
159 | 			tmpFile.Close()
160 | 
161 | 			// Run linter with auto-fix
162 | 			linter := New(&Config{AutoFix: true})
163 | 			result, err := linter.LintFile(tmpFile.Name())
164 | 
165 | 			if err != nil {
166 | 				t.Fatalf("LintFile failed: %v", err)
167 | 			}
168 | 
169 | 			if tt.expectFixed && result.FixedCount != tt.expectFixCount {
170 | 				t.Errorf("Expected %d fixes, got %d", tt.expectFixCount, result.FixedCount)
171 | 			}
172 | 
173 | 			// Check that backup file was created
174 | 			if tt.expectFixed {
175 | 				if _, err := os.Stat(tmpFile.Name() + ".orig"); os.IsNotExist(err) {
176 | 					t.Error("Expected backup file to be created, but it wasn't")
177 | 				}
178 | 			}
179 | 		})
180 | 	}
181 | }
182 | 
183 | func TestLinter_BackupCreation(t *testing.T) {
184 | 	// Create a temporary file
185 | 	tmpFile, err := os.CreateTemp("", "test_*.md")
186 | 	if err != nil {
187 | 		t.Fatalf("Failed to create temp file: %v", err)
188 | 	}
189 | 	defer os.Remove(tmpFile.Name())
190 | 	defer os.Remove(tmpFile.Name() + ".orig")
191 | 
192 | 	originalContent := "# Title  \n\nContent with trailing spaces.  \n"
193 | 	if _, err := tmpFile.WriteString(originalContent); err != nil {
194 | 		t.Fatalf("Failed to write to temp file: %v", err)
195 | 	}
196 | 	tmpFile.Close()
197 | 
198 | 	// Run linter with auto-fix
199 | 	linter := New(&Config{AutoFix: true})
200 | 	_, err = linter.LintFile(tmpFile.Name())
201 | 
202 | 	if err != nil {
203 | 		t.Fatalf("LintFile failed: %v", err)
204 | 	}
205 | 
206 | 	// Check that backup file exists and contains original content
207 | 	backupContent, err := os.ReadFile(tmpFile.Name() + ".orig")
208 | 	if err != nil {
209 | 		t.Fatalf("Failed to read backup file: %v", err)
210 | 	}
211 | 
212 | 	if string(backupContent) != originalContent {
213 | 		t.Errorf("Backup content doesn't match original.\nExpected: %q\nGot: %q", originalContent, string(backupContent))
214 | 	}
215 | }
216 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | BINARY=mdctl
  2 | VERSION=$(shell git describe --tags || echo "unknown version")
  3 | BUILDTIME=$(shell date -u)
  4 | GOBUILD=CGO_ENABLED=0 go build -trimpath -ldflags '-X "github.com/samzong/mdctl/cmd.Version=$(VERSION)" -X "github.com/samzong/mdctl/cmd.BuildTime=$(BUILDTIME)"'
  5 | 
  6 | # Homebrew related variables
  7 | CLEAN_VERSION=$(shell echo $(VERSION) | sed 's/^v//')
  8 | DOWNLOAD_URL=https://github.com/samzong/mdctl/releases/download/$(VERSION)/mdctl-$(CLEAN_VERSION)-darwin-amd64.tar.gz
  9 | HOMEBREW_TAP_REPO=homebrew-tap
 10 | FORMULA_FILE=Formula/mdctl.rb
 11 | BRANCH_NAME=update-mdctl-$(CLEAN_VERSION)
 12 | 
 13 | # Adjust architecture definitions to match goreleaser output
 14 | SUPPORTED_ARCHS = Darwin_x86_64 Darwin_arm64 Linux_x86_64 Linux_arm64
 15 | 
 16 | .PHONY: deps
 17 | deps:
 18 | 	@echo "Installing Go dependencies..."
 19 | 	go mod download
 20 | 	go mod verify
 21 | 
 22 | .PHONY: build
 23 | build: deps
 24 | 	$(GOBUILD) -o bin/$(BINARY)
 25 | 
 26 | .PHONY: test
 27 | test:
 28 | 	go test -v ./...
 29 | 
 30 | .PHONY: clean
 31 | clean:
 32 | 	rm -rf bin/
 33 | 	go clean -i ./...
 34 | 
 35 | .PHONY: fmt
 36 | fmt:
 37 | 	go fmt ./...
 38 | 	go mod tidy
 39 | 
 40 | .PHONY: all
 41 | all: clean fmt build test
 42 | 
 43 | .PHONY: update-homebrew
 44 | update-homebrew:
 45 | 	@echo "==> Starting Homebrew formula update process..."
 46 | 	@if [ -z "$(GH_PAT)" ]; then \
 47 | 		echo "❌ Error: GH_PAT environment variable is required"; \
 48 | 		exit 1; \
 49 | 	fi
 50 | 
 51 | 	@echo "==> Current version information:"
 52 | 	@echo "    - VERSION: $(VERSION)"
 53 | 	@echo "    - CLEAN_VERSION: $(CLEAN_VERSION)"
 54 | 
 55 | 	@echo "==> Preparing working directory..."
 56 | 	@rm -rf tmp && mkdir -p tmp
 57 | 	
 58 | 	@echo "==> Cloning Homebrew tap repository..."
 59 | 	@cd tmp && git clone https://$(GH_PAT)@github.com/samzong/$(HOMEBREW_TAP_REPO).git
 60 | 	@cd tmp/$(HOMEBREW_TAP_REPO) && echo "    - Creating new branch: $(BRANCH_NAME)" && git checkout -b $(BRANCH_NAME)
 61 | 
 62 | 	@echo "==> Processing architectures and calculating checksums..."
 63 | 	@cd tmp/$(HOMEBREW_TAP_REPO) && \
 64 | 	for arch in $(SUPPORTED_ARCHS); do \
 65 | 		echo "    - Processing $$arch..."; \
 66 | 		if [ "$(DRY_RUN)" = "1" ]; then \
 67 | 			echo "      [DRY_RUN] Would download: https://github.com/samzong/mdctl/releases/download/v$(CLEAN_VERSION)/mdctl_$${arch}.tar.gz"; \
 68 | 			case "$$arch" in \
 69 | 				Darwin_x86_64) DARWIN_AMD64_SHA="fake_sha_amd64" ;; \
 70 | 				Darwin_arm64) DARWIN_ARM64_SHA="fake_sha_arm64" ;; \
 71 | 				Linux_x86_64) LINUX_AMD64_SHA="fake_sha_linux_amd64" ;; \
 72 | 				Linux_arm64) LINUX_ARM64_SHA="fake_sha_linux_arm64" ;; \
 73 | 			esac; \
 74 | 		else \
 75 | 			echo "      - Downloading release archive..."; \
 76 | 			curl -L -sSfO "https://github.com/samzong/mdctl/releases/download/v$(CLEAN_VERSION)/mdctl_$${arch}.tar.gz" || { echo "❌ Failed to download $$arch archive"; exit 1; }; \
 77 | 			echo "      - Calculating SHA256..."; \
 78 | 			sha=$$(shasum -a 256 "mdctl_$${arch}.tar.gz" | cut -d' ' -f1); \
 79 | 			case "$$arch" in \
 80 | 				Darwin_x86_64) DARWIN_AMD64_SHA="$$sha"; echo "      ✓ Darwin AMD64 SHA: $$sha" ;; \
 81 | 				Darwin_arm64) DARWIN_ARM64_SHA="$$sha"; echo "      ✓ Darwin ARM64 SHA: $$sha" ;; \
 82 | 				Linux_x86_64) LINUX_AMD64_SHA="$$sha"; echo "      ✓ Linux AMD64 SHA: $$sha" ;; \
 83 | 				Linux_arm64) LINUX_ARM64_SHA="$$sha"; echo "      ✓ Linux ARM64 SHA: $$sha" ;; \
 84 | 			esac; \
 85 | 		fi; \
 86 | 	done; \
 87 | 	\
 88 | 	if [ "$(DRY_RUN)" = "1" ]; then \
 89 | 		echo "==> [DRY_RUN] Would update formula with:"; \
 90 | 		echo "    - Darwin AMD64 SHA: $$DARWIN_AMD64_SHA"; \
 91 | 		echo "    - Darwin ARM64 SHA: $$DARWIN_ARM64_SHA"; \
 92 | 		echo "    - Linux AMD64 SHA: $$LINUX_AMD64_SHA"; \
 93 | 		echo "    - Linux ARM64 SHA: $$LINUX_ARM64_SHA"; \
 94 | 		echo "    - Would commit and push changes"; \
 95 | 		echo "    - Would create PR"; \
 96 | 	else \
 97 | 		echo "==> Updating formula file..."; \
 98 | 		echo "    - Updating version to $(CLEAN_VERSION)"; \
 99 | 		sed -i '' -e 's|version ".*"|version "$(CLEAN_VERSION)"|' $(FORMULA_FILE); \
100 | 		\
101 | 		echo "    - Updating URLs and checksums"; \
102 | 		sed -i '' \
103 | 			-e '/on_macos/,/end/ { \
104 | 				/if Hardware::CPU.arm?/,/else/ { \
105 | 					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Darwin_arm64.tar.gz"|; \
106 | 					s|sha256 ".*"|sha256 "'"$$DARWIN_ARM64_SHA"'"|; \
107 | 				}; \
108 | 				/else/,/end/ { \
109 | 					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Darwin_x86_64.tar.gz"|; \
110 | 					s|sha256 ".*"|sha256 "'"$$DARWIN_AMD64_SHA"'"|; \
111 | 				}; \
112 | 			}' \
113 | 			-e '/on_linux/,/end/ { \
114 | 				/if Hardware::CPU.arm?/,/else/ { \
115 | 					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Linux_arm64.tar.gz"|; \
116 | 					s|sha256 ".*"|sha256 "'"$$LINUX_ARM64_SHA"'"|; \
117 | 				}; \
118 | 				/else/,/end/ { \
119 | 					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Linux_x86_64.tar.gz"|; \
120 | 					s|sha256 ".*"|sha256 "'"$$LINUX_AMD64_SHA"'"|; \
121 | 				}; \
122 | 			}' $(FORMULA_FILE); \
123 | 		\
124 | 		echo "    - Checking for changes..."; \
125 | 		if ! git diff --quiet $(FORMULA_FILE); then \
126 | 			echo "==> Changes detected, creating pull request..."; \
127 | 			echo "    - Adding changes to git"; \
128 | 			git add $(FORMULA_FILE); \
129 | 			echo "    - Committing changes"; \
130 | 			git commit -m "chore: bump to $(VERSION)"; \
131 | 			echo "    - Pushing to remote"; \
132 | 			git push -u origin $(BRANCH_NAME); \
133 | 			echo "    - Preparing pull request data"; \
134 | 			pr_data=$$(jq -n \
135 | 				--arg title "chore: update mdctl to $(VERSION)" \
136 | 				--arg body "Auto-generated PR\nSHAs:\n- Darwin(amd64): $$DARWIN_AMD64_SHA\n- Darwin(arm64): $$DARWIN_ARM64_SHA" \
137 | 				--arg head "$(BRANCH_NAME)" \
138 | 				--arg base "main" \
139 | 				'{title: $$title, body: $$body, head: $$head, base: $$base}'); \
140 | 			echo "    - Creating pull request"; \
141 | 			curl -X POST \
142 | 				-H "Authorization: token $(GH_PAT)" \
143 | 				-H "Content-Type: application/json" \
144 | 				https://api.github.com/repos/samzong/$(HOMEBREW_TAP_REPO)/pulls \
145 | 				-d "$$pr_data"; \
146 | 			echo "✅ Pull request created successfully"; \
147 | 		else \
148 | 			echo "❌ No changes detected in formula file"; \
149 | 			exit 1; \
150 | 		fi; \
151 | 	fi
152 | 
153 | 	@echo "==> Cleaning up temporary files..."
154 | 	@rm -rf tmp
155 | 	@echo "✅ Homebrew formula update process completed"
156 | 
157 | .PHONY: help
158 | help:
159 | 	@echo "Usage: make <target>"
160 | 	@echo "Targets:"
161 | 	@echo "  deps: Install Go dependencies"
162 | 	@echo "  build: Build the binary"
163 | 	@echo "  test: Run tests"
164 | 	@echo "  clean: Clean up build artifacts"
165 | 	@echo "  fmt: Format the code"
166 | 	@echo "  all: Clean, format, build, and test"
167 | 	@echo "  update-homebrew: Update Homebrew formula (requires GH_PAT)"
168 | 
169 | .DEFAULT_GOAL := help
170 | 


--------------------------------------------------------------------------------
/internal/llmstxt/extractor.go:
--------------------------------------------------------------------------------
  1 | package llmstxt
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"net/http"
  6 | 	"net/url"
  7 | 	"path"
  8 | 	"strings"
  9 | 
 10 | 	"github.com/PuerkitoBio/goquery"
 11 | )
 12 | 
 13 | // Extract page information from HTML content
 14 | func (g *Generator) extractPageInfo(urlStr string, resp *http.Response) (PageInfo, error) {
 15 | 	// Create PageInfo object
 16 | 	pageInfo := PageInfo{
 17 | 		URL:     urlStr,
 18 | 		Section: parseSection(urlStr),
 19 | 	}
 20 | 
 21 | 	// Parse HTML
 22 | 	doc, err := goquery.NewDocumentFromReader(resp.Body)
 23 | 	if err != nil {
 24 | 		return pageInfo, err
 25 | 	}
 26 | 
 27 | 	// Extract title
 28 | 	pageInfo.Title = extractTitle(doc)
 29 | 	if g.config.VeryVerbose {
 30 | 		g.logger.Printf("Extracted title from %s: %s", urlStr, pageInfo.Title)
 31 | 	}
 32 | 
 33 | 	if pageInfo.Title == "" {
 34 | 		// If title cannot be extracted, use the last segment of the URL as the title
 35 | 		pageInfo.Title = extractTitleFromURL(urlStr)
 36 | 		if g.config.VeryVerbose {
 37 | 			g.logger.Printf("Could not extract title, using URL-based title instead: %s", pageInfo.Title)
 38 | 		}
 39 | 	}
 40 | 
 41 | 	// Extract description
 42 | 	pageInfo.Description = extractDescription(doc)
 43 | 	if g.config.VeryVerbose {
 44 | 		g.logger.Printf("Extracted description from %s: %s", urlStr, truncateString(pageInfo.Description, 100))
 45 | 	}
 46 | 
 47 | 	// Extract content in full mode
 48 | 	if g.config.FullMode {
 49 | 		if g.config.VeryVerbose {
 50 | 			g.logger.Printf("Extracting full content from %s", urlStr)
 51 | 		}
 52 | 		pageInfo.Content = extractContent(doc)
 53 | 		if g.config.VeryVerbose {
 54 | 			contentLen := len(pageInfo.Content)
 55 | 			preview := truncateString(pageInfo.Content, 100)
 56 | 			g.logger.Printf("Extracted content from %s (%d chars): %s", urlStr, contentLen, preview)
 57 | 		}
 58 | 	}
 59 | 
 60 | 	return pageInfo, nil
 61 | }
 62 | 
 63 | // Helper function: truncate string and add ellipsis
 64 | func truncateString(s string, maxLen int) string {
 65 | 	s = strings.TrimSpace(s)
 66 | 	if len(s) <= maxLen {
 67 | 		return s
 68 | 	}
 69 | 	return s[:maxLen] + "..."
 70 | }
 71 | 
 72 | // Extract section information from URL
 73 | func parseSection(urlStr string) string {
 74 | 	// Parse URL
 75 | 	parsedURL, err := url.Parse(urlStr)
 76 | 	if err != nil {
 77 | 		return "ROOT"
 78 | 	}
 79 | 
 80 | 	// Split path
 81 | 	pathParts := strings.Split(strings.Trim(parsedURL.Path, "/"), "/")
 82 | 
 83 | 	// If path is empty, return ROOT
 84 | 	if len(pathParts) == 0 || pathParts[0] == "" {
 85 | 		return "ROOT"
 86 | 	}
 87 | 
 88 | 	// Return first segment of path
 89 | 	return pathParts[0]
 90 | }
 91 | 
 92 | // Extract title from HTML document
 93 | func extractTitle(doc *goquery.Document) string {
 94 | 	// Try to extract from title tag
 95 | 	title := doc.Find("title").First().Text()
 96 | 	title = strings.TrimSpace(title)
 97 | 
 98 | 	// If no title tag, try to extract from h1 tag
 99 | 	if title == "" {
100 | 		title = doc.Find("h1").First().Text()
101 | 		title = strings.TrimSpace(title)
102 | 	}
103 | 
104 | 	return title
105 | }
106 | 
107 | // Extract title from URL
108 | func extractTitleFromURL(urlStr string) string {
109 | 	// Parse URL
110 | 	parsedURL, err := url.Parse(urlStr)
111 | 	if err != nil {
112 | 		return urlStr
113 | 	}
114 | 
115 | 	// Get the last segment of the path
116 | 	basename := path.Base(parsedURL.Path)
117 | 
118 | 	// Remove file extension
119 | 	basename = strings.TrimSuffix(basename, path.Ext(basename))
120 | 
121 | 	// If basename is empty or is "/", use hostname
122 | 	if basename == "" || basename == "." || basename == "/" {
123 | 		return parsedURL.Hostname()
124 | 	}
125 | 
126 | 	// Replace hyphens and underscores with spaces, and capitalize
127 | 	basename = strings.ReplaceAll(basename, "-", " ")
128 | 	basename = strings.ReplaceAll(basename, "_", " ")
129 | 
130 | 	return strings.Title(basename)
131 | }
132 | 
133 | // Extract description from HTML document
134 | func extractDescription(doc *goquery.Document) string {
135 | 	var description string
136 | 
137 | 	// Try meta description
138 | 	description, _ = doc.Find("meta[name='description']").Attr("content")
139 | 	if description != "" {
140 | 		return strings.TrimSpace(description)
141 | 	}
142 | 
143 | 	// Try og:description
144 | 	description, _ = doc.Find("meta[property='og:description']").Attr("content")
145 | 	if description != "" {
146 | 		return strings.TrimSpace(description)
147 | 	}
148 | 
149 | 	// Try twitter:description
150 | 	description, _ = doc.Find("meta[name='twitter:description']").Attr("content")
151 | 	if description != "" {
152 | 		return strings.TrimSpace(description)
153 | 	}
154 | 
155 | 	// If none found, extract first text
156 | 	description = doc.Find("p").First().Text()
157 | 	if description != "" {
158 | 		// Limit length
159 | 		if len(description) > 200 {
160 | 			description = description[:197] + "..."
161 | 		}
162 | 		return strings.TrimSpace(description)
163 | 	}
164 | 
165 | 	return "No description available"
166 | }
167 | 
168 | // Extract content from HTML document
169 | func extractContent(doc *goquery.Document) string {
170 | 	var content strings.Builder
171 | 
172 | 	// Try to find main content area
173 | 	mainContent := doc.Find("article, main, #content, .content, .post-content").First()
174 | 
175 | 	// If no specific content area found, use body
176 | 	if mainContent.Length() == 0 {
177 | 		mainContent = doc.Find("body")
178 | 	}
179 | 
180 | 	// Extract all paragraphs
181 | 	mainContent.Find("p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote").Each(func(i int, s *goquery.Selection) {
182 | 		// Get tag name
183 | 		tagName := goquery.NodeName(s)
184 | 		text := strings.TrimSpace(s.Text())
185 | 
186 | 		if text == "" {
187 | 			return
188 | 		}
189 | 
190 | 		// Format according to tag type
191 | 		switch tagName {
192 | 		case "h1":
193 | 			content.WriteString("# " + text + "\n\n")
194 | 		case "h2":
195 | 			content.WriteString("## " + text + "\n\n")
196 | 		case "h3":
197 | 			content.WriteString("### " + text + "\n\n")
198 | 		case "h4":
199 | 			content.WriteString("#### " + text + "\n\n")
200 | 		case "h5":
201 | 			content.WriteString("##### " + text + "\n\n")
202 | 		case "h6":
203 | 			content.WriteString("###### " + text + "\n\n")
204 | 		case "p":
205 | 			content.WriteString(text + "\n\n")
206 | 		case "blockquote":
207 | 			content.WriteString("> " + text + "\n\n")
208 | 		case "ul", "ol":
209 | 			s.Find("li").Each(func(j int, li *goquery.Selection) {
210 | 				liText := strings.TrimSpace(li.Text())
211 | 				if liText != "" {
212 | 					if tagName == "ul" {
213 | 						content.WriteString("- " + liText + "\n")
214 | 					} else {
215 | 						content.WriteString(fmt.Sprintf("%d. %s\n", j+1, liText))
216 | 					}
217 | 				}
218 | 			})
219 | 			content.WriteString("\n")
220 | 		}
221 | 	})
222 | 
223 | 	// Limit content length
224 | 	contentStr := content.String()
225 | 	if len(contentStr) > 10000 {
226 | 		// Find last paragraph end position
227 | 		lastParaEnd := strings.LastIndex(contentStr[:10000], "\n\n")
228 | 		if lastParaEnd == -1 {
229 | 			lastParaEnd = 10000
230 | 		}
231 | 		contentStr = contentStr[:lastParaEnd] + "\n\n... (content truncated)"
232 | 	}
233 | 
234 | 	return contentStr
235 | }
236 | 


--------------------------------------------------------------------------------
/internal/llmstxt/sitemap.go:
--------------------------------------------------------------------------------
  1 | package llmstxt
  2 | 
  3 | import (
  4 | 	"encoding/xml"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"net/http"
  8 | 	"strings"
  9 | 	"time"
 10 | 
 11 | 	"github.com/gobwas/glob"
 12 | )
 13 | 
 14 | // Sitemap XML structure
 15 | type Sitemap struct {
 16 | 	XMLName xml.Name `xml:"urlset"`
 17 | 	URLs    []struct {
 18 | 		Loc        string `xml:"loc"`
 19 | 		LastMod    string `xml:"lastmod,omitempty"`
 20 | 		ChangeFreq string `xml:"changefreq,omitempty"`
 21 | 		Priority   string `xml:"priority,omitempty"`
 22 | 	} `xml:"url"`
 23 | }
 24 | 
 25 | // SitemapIndex XML structure
 26 | type SitemapIndex struct {
 27 | 	XMLName  xml.Name `xml:"sitemapindex"`
 28 | 	Sitemaps []struct {
 29 | 		Loc     string `xml:"loc"`
 30 | 		LastMod string `xml:"lastmod,omitempty"`
 31 | 	} `xml:"sitemap"`
 32 | }
 33 | 
 34 | // Parse sitemap.xml file and return all URLs
 35 | func (g *Generator) parseSitemap() ([]string, error) {
 36 | 	g.logger.Printf("Parsing sitemap from %s", g.config.SitemapURL)
 37 | 
 38 | 	// Set HTTP client
 39 | 	client := &http.Client{
 40 | 		Timeout: time.Duration(g.config.Timeout) * time.Second,
 41 | 	}
 42 | 
 43 | 	// Build request
 44 | 	req, err := http.NewRequest("GET", g.config.SitemapURL, nil)
 45 | 	if err != nil {
 46 | 		return nil, fmt.Errorf("failed to create request: %w", err)
 47 | 	}
 48 | 
 49 | 	// Set User-Agent
 50 | 	req.Header.Set("User-Agent", g.config.UserAgent)
 51 | 
 52 | 	// Send request
 53 | 	resp, err := client.Do(req)
 54 | 	if err != nil {
 55 | 		return nil, fmt.Errorf("failed to fetch sitemap: %w", err)
 56 | 	}
 57 | 	defer resp.Body.Close()
 58 | 
 59 | 	if resp.StatusCode != http.StatusOK {
 60 | 		return nil, fmt.Errorf("failed to fetch sitemap, status code: %d", resp.StatusCode)
 61 | 	}
 62 | 
 63 | 	// Read response body
 64 | 	body, err := io.ReadAll(resp.Body)
 65 | 	if err != nil {
 66 | 		return nil, fmt.Errorf("failed to read sitemap content: %w", err)
 67 | 	}
 68 | 
 69 | 	// Try to parse as standard sitemap
 70 | 	var sitemap Sitemap
 71 | 	if err := xml.Unmarshal(body, &sitemap); err == nil && len(sitemap.URLs) > 0 {
 72 | 		g.logger.Println("Parsed standard sitemap")
 73 | 		return g.extractURLsFromSitemap(sitemap), nil
 74 | 	}
 75 | 
 76 | 	// Try to parse as sitemap index
 77 | 	var sitemapIndex SitemapIndex
 78 | 	if err := xml.Unmarshal(body, &sitemapIndex); err == nil && len(sitemapIndex.Sitemaps) > 0 {
 79 | 		g.logger.Println("Parsed sitemap index, fetching child sitemaps")
 80 | 		return g.fetchSitemapIndex(sitemapIndex, client)
 81 | 	}
 82 | 
 83 | 	// If all parsing fails, try to handle as text sitemap (one URL per line)
 84 | 	lines := string(body)
 85 | 	if len(lines) > 0 {
 86 | 		g.logger.Println("Parsing as text sitemap")
 87 | 		return g.parseTextSitemap(lines), nil
 88 | 	}
 89 | 
 90 | 	return nil, fmt.Errorf("could not parse sitemap, unknown format")
 91 | }
 92 | 
 93 | // Extract URLs from standard sitemap
 94 | func (g *Generator) extractURLsFromSitemap(sitemap Sitemap) []string {
 95 | 	urls := make([]string, 0, len(sitemap.URLs))
 96 | 	for _, urlEntry := range sitemap.URLs {
 97 | 		if urlEntry.Loc != "" {
 98 | 			urls = append(urls, urlEntry.Loc)
 99 | 		}
100 | 	}
101 | 	return urls
102 | }
103 | 
104 | // Get all child sitemap URLs from sitemap index
105 | func (g *Generator) fetchSitemapIndex(index SitemapIndex, client *http.Client) ([]string, error) {
106 | 	var allURLs []string
107 | 
108 | 	for _, sitemapEntry := range index.Sitemaps {
109 | 		if sitemapEntry.Loc == "" {
110 | 			continue
111 | 		}
112 | 
113 | 		g.logger.Printf("Fetching child sitemap: %s", sitemapEntry.Loc)
114 | 
115 | 		// Build request
116 | 		req, err := http.NewRequest("GET", sitemapEntry.Loc, nil)
117 | 		if err != nil {
118 | 			g.logger.Printf("Warning: failed to create request for child sitemap %s: %v", sitemapEntry.Loc, err)
119 | 			continue
120 | 		}
121 | 
122 | 		// Set User-Agent
123 | 		req.Header.Set("User-Agent", g.config.UserAgent)
124 | 
125 | 		// Send request
126 | 		resp, err := client.Do(req)
127 | 		if err != nil {
128 | 			g.logger.Printf("Warning: failed to fetch child sitemap %s: %v", sitemapEntry.Loc, err)
129 | 			continue
130 | 		}
131 | 
132 | 		// Read response body
133 | 		body, err := io.ReadAll(resp.Body)
134 | 		resp.Body.Close()
135 | 		if err != nil {
136 | 			g.logger.Printf("Warning: failed to read child sitemap %s: %v", sitemapEntry.Loc, err)
137 | 			continue
138 | 		}
139 | 
140 | 		// Parse child sitemap
141 | 		var childSitemap Sitemap
142 | 		if err := xml.Unmarshal(body, &childSitemap); err != nil {
143 | 			g.logger.Printf("Warning: failed to parse child sitemap %s: %v", sitemapEntry.Loc, err)
144 | 			continue
145 | 		}
146 | 
147 | 		// Extract URLs
148 | 		childURLs := g.extractURLsFromSitemap(childSitemap)
149 | 		g.logger.Printf("Found %d URLs in child sitemap %s", len(childURLs), sitemapEntry.Loc)
150 | 		allURLs = append(allURLs, childURLs...)
151 | 	}
152 | 
153 | 	return allURLs, nil
154 | }
155 | 
156 | // Parse text sitemap (one URL per line)
157 | func (g *Generator) parseTextSitemap(content string) []string {
158 | 	lines := splitLines(content)
159 | 	var urls []string
160 | 
161 | 	for _, line := range lines {
162 | 		line = normalizeURL(line)
163 | 		if isValidURL(line) {
164 | 			urls = append(urls, line)
165 | 		}
166 | 	}
167 | 
168 | 	return urls
169 | }
170 | 
171 | // Filter URLs based on include/exclude mode
172 | func (g *Generator) filterURLs(urls []string) []string {
173 | 	if len(g.config.IncludePaths) == 0 && len(g.config.ExcludePaths) == 0 {
174 | 		return urls // No filtering rules, return directly
175 | 	}
176 | 
177 | 	// Compile include/exclude mode
178 | 	var includeMatchers, excludeMatchers []glob.Glob
179 | 	for _, pattern := range g.config.IncludePaths {
180 | 		matcher, err := glob.Compile(pattern)
181 | 		if err != nil {
182 | 			g.logger.Printf("Warning: invalid include pattern '%s': %v", pattern, err)
183 | 			continue
184 | 		}
185 | 		includeMatchers = append(includeMatchers, matcher)
186 | 	}
187 | 
188 | 	for _, pattern := range g.config.ExcludePaths {
189 | 		matcher, err := glob.Compile(pattern)
190 | 		if err != nil {
191 | 			g.logger.Printf("Warning: invalid exclude pattern '%s': %v", pattern, err)
192 | 			continue
193 | 		}
194 | 		excludeMatchers = append(excludeMatchers, matcher)
195 | 	}
196 | 
197 | 	var filteredURLs []string
198 | 	for _, url := range urls {
199 | 		// If there are include rules, one of them must match
200 | 		if len(includeMatchers) > 0 {
201 | 			matched := false
202 | 			for _, matcher := range includeMatchers {
203 | 				if matcher.Match(url) {
204 | 					matched = true
205 | 					break
206 | 				}
207 | 			}
208 | 			if !matched {
209 | 				continue
210 | 			}
211 | 		}
212 | 
213 | 		// If any exclude rules match, exclude
214 | 		excluded := false
215 | 		for _, matcher := range excludeMatchers {
216 | 			if matcher.Match(url) {
217 | 				excluded = true
218 | 				break
219 | 			}
220 | 		}
221 | 		if excluded {
222 | 			continue
223 | 		}
224 | 
225 | 		filteredURLs = append(filteredURLs, url)
226 | 	}
227 | 
228 | 	return filteredURLs
229 | }
230 | 
231 | // Helper function: split text by line
232 | func splitLines(s string) []string {
233 | 	return strings.Split(s, "\n")
234 | }
235 | 
236 | // Helper function: normalize URL (remove spaces, etc.)
237 | func normalizeURL(url string) string {
238 | 	return url
239 | }
240 | 
241 | // Helper function: check if URL is valid
242 | func isValidURL(url string) bool {
243 | 	return url != ""
244 | }
245 | 


--------------------------------------------------------------------------------
/internal/linter/rules_test.go:
--------------------------------------------------------------------------------
  1 | package linter
  2 | 
  3 | import (
  4 | 	"testing"
  5 | )
  6 | 
  7 | func TestMD047_FileEndingCheck(t *testing.T) {
  8 | 	tests := []struct {
  9 | 		name        string
 10 | 		lines       []string
 11 | 		expectIssue bool
 12 | 		description string
 13 | 	}{
 14 | 		{
 15 | 			name:        "file ends with single newline",
 16 | 			lines:       []string{"# Title", "Content", ""},
 17 | 			expectIssue: false,
 18 | 			description: "should not trigger issue when file ends with single newline",
 19 | 		},
 20 | 		{
 21 | 			name:        "file does not end with newline",
 22 | 			lines:       []string{"# Title", "Content"},
 23 | 			expectIssue: true,
 24 | 			description: "should trigger issue when file doesn't end with newline",
 25 | 		},
 26 | 		{
 27 | 			name:        "file ends with multiple newlines",
 28 | 			lines:       []string{"# Title", "Content", "", ""},
 29 | 			expectIssue: true,
 30 | 			description: "should trigger issue when file ends with multiple newlines",
 31 | 		},
 32 | 		{
 33 | 			name:        "empty file",
 34 | 			lines:       []string{},
 35 | 			expectIssue: false,
 36 | 			description: "should not trigger issue for empty file",
 37 | 		},
 38 | 	}
 39 | 
 40 | 	rule := &MD047{BaseRule: BaseRule{id: "MD047", description: "Files should end with a single newline character", enabled: true}}
 41 | 
 42 | 	for _, tt := range tests {
 43 | 		t.Run(tt.name, func(t *testing.T) {
 44 | 			issues := rule.Check(tt.lines)
 45 | 			hasIssue := len(issues) > 0
 46 | 
 47 | 			if hasIssue != tt.expectIssue {
 48 | 				t.Errorf("%s: expected issue=%t, got issue=%t", tt.description, tt.expectIssue, hasIssue)
 49 | 				if hasIssue {
 50 | 					for _, issue := range issues {
 51 | 						t.Logf("Issue: %s", issue.Message)
 52 | 					}
 53 | 				}
 54 | 			}
 55 | 		})
 56 | 	}
 57 | }
 58 | 
 59 | func TestMD032_ListBlankLines(t *testing.T) {
 60 | 	tests := []struct {
 61 | 		name        string
 62 | 		lines       []string
 63 | 		expectCount int
 64 | 		description string
 65 | 	}{
 66 | 		{
 67 | 			name: "list with proper blank lines",
 68 | 			lines: []string{
 69 | 				"# Title",
 70 | 				"",
 71 | 				"- Item 1",
 72 | 				"- Item 2",
 73 | 				"",
 74 | 				"Content after list",
 75 | 			},
 76 | 			expectCount: 0,
 77 | 			description: "should not trigger issue when list has proper blank lines",
 78 | 		},
 79 | 		{
 80 | 			name: "list without blank line before",
 81 | 			lines: []string{
 82 | 				"# Title",
 83 | 				"Some text",
 84 | 				"- Item 1",
 85 | 				"",
 86 | 				"Content after list",
 87 | 			},
 88 | 			expectCount: 1,
 89 | 			description: "should trigger issue when list doesn't have blank line before",
 90 | 		},
 91 | 		{
 92 | 			name: "list without blank line after",
 93 | 			lines: []string{
 94 | 				"# Title",
 95 | 				"",
 96 | 				"- Item 1",
 97 | 				"Content after list",
 98 | 			},
 99 | 			expectCount: 1,
100 | 			description: "should trigger issue when list doesn't have blank line after",
101 | 		},
102 | 		{
103 | 			name: "list without blank lines before and after",
104 | 			lines: []string{
105 | 				"# Title",
106 | 				"Some text",
107 | 				"- Item 1",
108 | 				"Content after list",
109 | 			},
110 | 			expectCount: 2,
111 | 			description: "should trigger 2 issues when list doesn't have blank lines before and after",
112 | 		},
113 | 	}
114 | 
115 | 	rule := &MD032{BaseRule: BaseRule{id: "MD032", description: "Lists should be surrounded by blank lines", enabled: true}}
116 | 
117 | 	for _, tt := range tests {
118 | 		t.Run(tt.name, func(t *testing.T) {
119 | 			issues := rule.Check(tt.lines)
120 | 
121 | 			if len(issues) != tt.expectCount {
122 | 				t.Errorf("%s: expected %d issues, got %d issues", tt.description, tt.expectCount, len(issues))
123 | 				for i, issue := range issues {
124 | 					t.Logf("Issue %d: Line %d - %s", i+1, issue.Line, issue.Message)
125 | 				}
126 | 			}
127 | 		})
128 | 	}
129 | }
130 | 
131 | func TestRegexPrecompilation(t *testing.T) {
132 | 	tests := []struct {
133 | 		name string
134 | 		rule Rule
135 | 	}{
136 | 		{"MD018", &MD018{BaseRule: BaseRule{id: "MD018", enabled: true}}},
137 | 		{"MD019", &MD019{BaseRule: BaseRule{id: "MD019", enabled: true}}},
138 | 		{"MD023", &MD023{BaseRule: BaseRule{id: "MD023", enabled: true}}},
139 | 		{"MD032", &MD032{BaseRule: BaseRule{id: "MD032", enabled: true}}},
140 | 	}
141 | 
142 | 	for _, tt := range tests {
143 | 		t.Run(tt.name, func(t *testing.T) {
144 | 			// Call Check method to trigger regex compilation
145 | 			_ = tt.rule.Check([]string{"# Test", "Content"})
146 | 
147 | 			// Check that pattern was compiled for rules that have patterns
148 | 			switch rule := tt.rule.(type) {
149 | 			case *MD018:
150 | 				if rule.pattern == nil {
151 | 					t.Error("MD018 pattern was not compiled")
152 | 				}
153 | 			case *MD019:
154 | 				if rule.pattern == nil {
155 | 					t.Error("MD019 pattern was not compiled")
156 | 				}
157 | 			case *MD023:
158 | 				if rule.pattern == nil {
159 | 					t.Error("MD023 pattern was not compiled")
160 | 				}
161 | 			case *MD032:
162 | 				if rule.pattern == nil {
163 | 					t.Error("MD032 pattern was not compiled")
164 | 				}
165 | 			}
166 | 		})
167 | 	}
168 | }
169 | 
170 | func TestMD018_NoSpaceAfterHash(t *testing.T) {
171 | 	rule := &MD018{BaseRule: BaseRule{id: "MD018", enabled: true}}
172 | 
173 | 	tests := []struct {
174 | 		line        string
175 | 		expectIssue bool
176 | 	}{
177 | 		{"# Proper heading", false},
178 | 		{"#Bad heading", true},
179 | 		{"## Another proper heading", false},
180 | 		{"##Bad heading", true},
181 | 		{"### Yet another proper heading", false},
182 | 		{"###Bad heading", true},
183 | 		{"Not a heading", false},
184 | 		{"", false},
185 | 	}
186 | 
187 | 	for _, tt := range tests {
188 | 		t.Run(tt.line, func(t *testing.T) {
189 | 			issues := rule.Check([]string{tt.line})
190 | 			hasIssue := len(issues) > 0
191 | 
192 | 			if hasIssue != tt.expectIssue {
193 | 				t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue)
194 | 			}
195 | 		})
196 | 	}
197 | }
198 | 
199 | func TestMD019_MultipleSpacesAfterHash(t *testing.T) {
200 | 	rule := &MD019{BaseRule: BaseRule{id: "MD019", enabled: true}}
201 | 
202 | 	tests := []struct {
203 | 		line        string
204 | 		expectIssue bool
205 | 	}{
206 | 		{"# Proper heading", false},
207 | 		{"#  Bad heading", true},
208 | 		{"## Another proper heading", false},
209 | 		{"##  Bad heading", true},
210 | 		{"###   Very bad heading", true},
211 | 		{"Not a heading", false},
212 | 		{"", false},
213 | 	}
214 | 
215 | 	for _, tt := range tests {
216 | 		t.Run(tt.line, func(t *testing.T) {
217 | 			issues := rule.Check([]string{tt.line})
218 | 			hasIssue := len(issues) > 0
219 | 
220 | 			if hasIssue != tt.expectIssue {
221 | 				t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue)
222 | 			}
223 | 		})
224 | 	}
225 | }
226 | 
227 | func TestMD023_HeadingAtStartOfLine(t *testing.T) {
228 | 	rule := &MD023{BaseRule: BaseRule{id: "MD023", enabled: true}}
229 | 
230 | 	tests := []struct {
231 | 		line        string
232 | 		expectIssue bool
233 | 	}{
234 | 		{"# Proper heading", false},
235 | 		{" # Bad heading", true},
236 | 		{"  ## Very bad heading", true},
237 | 		{"Not a heading", false},
238 | 		{"", false},
239 | 	}
240 | 
241 | 	for _, tt := range tests {
242 | 		t.Run(tt.line, func(t *testing.T) {
243 | 			issues := rule.Check([]string{tt.line})
244 | 			hasIssue := len(issues) > 0
245 | 
246 | 			if hasIssue != tt.expectIssue {
247 | 				t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue)
248 | 			}
249 | 		})
250 | 	}
251 | }
252 | 


--------------------------------------------------------------------------------
/internal/config/config.go:
--------------------------------------------------------------------------------
  1 | package config
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | )
  9 | 
 10 | type CloudConfig struct {
 11 | 	Provider       string            `json:"provider"`
 12 | 	Region         string            `json:"region"`
 13 | 	Endpoint       string            `json:"endpoint"`
 14 | 	AccessKey      string            `json:"access_key"`
 15 | 	SecretKey      string            `json:"secret_key"`
 16 | 	Bucket         string            `json:"bucket"`
 17 | 	AccountID      string            `json:"account_id,omitempty"`
 18 | 	CustomDomain   string            `json:"custom_domain,omitempty"`
 19 | 	PathPrefix     string            `json:"path_prefix,omitempty"`
 20 | 	ProviderOpts   map[string]string `json:"provider_opts,omitempty"`
 21 | 	Concurrency    int               `json:"concurrency"`
 22 | 	SkipVerify     bool              `json:"skip_verify"`
 23 | 	CACertPath     string            `json:"ca_cert_path,omitempty"`
 24 | 	ConflictPolicy string            `json:"conflict_policy"`
 25 | 	CacheDir       string            `json:"cache_dir,omitempty"`
 26 | }
 27 | 
 28 | type Config struct {
 29 | 	TranslatePrompt   string                 `json:"translate_prompt"`
 30 | 	OpenAIEndpointURL string                 `json:"endpoint"`
 31 | 	OpenAIAPIKey      string                 `json:"api_key"`
 32 | 	ModelName         string                 `json:"model"`
 33 | 	Temperature       float64                `json:"temperature"`
 34 | 	TopP              float64                `json:"top_p"`
 35 | 	CloudStorages     map[string]CloudConfig `json:"cloud_storages,omitempty"`
 36 | 	DefaultStorage    string                 `json:"default_storage,omitempty"`
 37 | }
 38 | 
 39 | var DefaultCloudConfig = CloudConfig{
 40 | 	Provider:       "",
 41 | 	Region:         "auto",
 42 | 	Endpoint:       "",
 43 | 	AccessKey:      "",
 44 | 	SecretKey:      "",
 45 | 	Bucket:         "",
 46 | 	Concurrency:    5,
 47 | 	SkipVerify:     false,
 48 | 	ConflictPolicy: "rename",
 49 | }
 50 | 
 51 | var DefaultConfig = Config{
 52 | 	TranslatePrompt:   "Translate the markdown to {TARGET_LANG} as a native speaker - preserve code/YAML/links/cli commands (e.g. `kubectl apply` or `pip install langchain`) and tech terms (CRDs, Helm charts, RAG). Output ONLY fluently localized text with natural technical phrasing that doesn't read machine-generated.",
 53 | 	OpenAIEndpointURL: "https://api.openai.com/v1",
 54 | 	OpenAIAPIKey:      "",
 55 | 	ModelName:         "gpt-3.5-turbo",
 56 | 	Temperature:       0.0,
 57 | 	TopP:              1.0,
 58 | 	CloudStorages:     make(map[string]CloudConfig),
 59 | }
 60 | 
 61 | func GetConfigPath() string {
 62 | 	homeDir, err := os.UserHomeDir()
 63 | 	if err != nil {
 64 | 		return ""
 65 | 	}
 66 | 	return filepath.Join(homeDir, ".config", "mdctl", "config.json")
 67 | }
 68 | 
 69 | func LoadConfig() (*Config, error) {
 70 | 	configPath := GetConfigPath()
 71 | 	if configPath == "" {
 72 | 		return &DefaultConfig, nil
 73 | 	}
 74 | 
 75 | 	if _, err := os.Stat(configPath); os.IsNotExist(err) {
 76 | 		if err := SaveConfig(&DefaultConfig); err != nil {
 77 | 			return &DefaultConfig, fmt.Errorf("failed to create default config: %v", err)
 78 | 		}
 79 | 		return &DefaultConfig, nil
 80 | 	}
 81 | 
 82 | 	data, err := os.ReadFile(configPath)
 83 | 	if err != nil {
 84 | 		return &DefaultConfig, fmt.Errorf("failed to read config file: %v", err)
 85 | 	}
 86 | 
 87 | 	var config Config
 88 | 	if err := json.Unmarshal(data, &config); err != nil {
 89 | 		os.Remove(configPath)
 90 | 		if err := SaveConfig(&DefaultConfig); err != nil {
 91 | 			return &DefaultConfig, fmt.Errorf("failed to create new config after invalid file: %v", err)
 92 | 		}
 93 | 		return &DefaultConfig, fmt.Errorf("invalid config file (recreated with defaults): %v", err)
 94 | 	}
 95 | 
 96 | 	if config.TranslatePrompt == "" {
 97 | 		config.TranslatePrompt = DefaultConfig.TranslatePrompt
 98 | 	}
 99 | 	if config.OpenAIEndpointURL == "" {
100 | 		config.OpenAIEndpointURL = DefaultConfig.OpenAIEndpointURL
101 | 	}
102 | 	if config.ModelName == "" {
103 | 		config.ModelName = DefaultConfig.ModelName
104 | 	}
105 | 
106 | 	// Ensure CloudStorages is non-nil
107 | 	if config.CloudStorages == nil {
108 | 		config.CloudStorages = make(map[string]CloudConfig)
109 | 	}
110 | 
111 | 	// Check if default storage exists
112 | 	if config.DefaultStorage != "" {
113 | 		if _, exists := config.CloudStorages[config.DefaultStorage]; !exists {
114 | 			// If specified default storage doesn't exist, use the first available one
115 | 			if len(config.CloudStorages) > 0 {
116 | 				for name := range config.CloudStorages {
117 | 					config.DefaultStorage = name
118 | 					break
119 | 				}
120 | 			} else {
121 | 				config.DefaultStorage = ""
122 | 			}
123 | 		}
124 | 	} else if len(config.CloudStorages) > 0 {
125 | 		// If no default storage is set but there are storage configurations, set the first one as default
126 | 		for name := range config.CloudStorages {
127 | 			config.DefaultStorage = name
128 | 			break
129 | 		}
130 | 	}
131 | 
132 | 	return &config, nil
133 | }
134 | 
135 | func SaveConfig(config *Config) error {
136 | 	configPath := GetConfigPath()
137 | 	if configPath == "" {
138 | 		return fmt.Errorf("failed to get config path")
139 | 	}
140 | 
141 | 	configDir := filepath.Dir(configPath)
142 | 	if err := os.MkdirAll(configDir, 0755); err != nil {
143 | 		return fmt.Errorf("failed to create config directory: %v", err)
144 | 	}
145 | 
146 | 	data, err := json.MarshalIndent(config, "", "  ")
147 | 	if err != nil {
148 | 		return fmt.Errorf("failed to marshal config: %v", err)
149 | 	}
150 | 
151 | 	if err := os.WriteFile(configPath, data, 0644); err != nil {
152 | 		return fmt.Errorf("failed to write config file: %v", err)
153 | 	}
154 | 
155 | 	return nil
156 | }
157 | 
158 | // ApplyCloudConfig applies platform-specific settings to the cloud configuration
159 | func (c *Config) ApplyCloudConfig() {
160 | 	// Ensure CloudStorages is non-nil
161 | 	if c.CloudStorages == nil {
162 | 		c.CloudStorages = make(map[string]CloudConfig)
163 | 	}
164 | 
165 | 	// Check if default storage exists
166 | 	if c.DefaultStorage != "" {
167 | 		if _, exists := c.CloudStorages[c.DefaultStorage]; !exists {
168 | 			// If specified default storage doesn't exist, use the first available one
169 | 			for name := range c.CloudStorages {
170 | 				c.DefaultStorage = name
171 | 				break
172 | 			}
173 | 		}
174 | 	}
175 | 
176 | 	// If no default storage is set but there are storage configurations, set the first one as default
177 | 	if c.DefaultStorage == "" && len(c.CloudStorages) > 0 {
178 | 		for name := range c.CloudStorages {
179 | 			c.DefaultStorage = name
180 | 			break
181 | 		}
182 | 	}
183 | }
184 | 
185 | // GetActiveCloudConfig returns the current active cloud storage configuration
186 | // The storageName parameter can specify which configuration to use, if empty the default configuration is used
187 | func (c *Config) GetActiveCloudConfig(storageName string) CloudConfig {
188 | 	// If a storage name is specified, try to get that configuration
189 | 	if storageName != "" {
190 | 		if storage, exists := c.CloudStorages[storageName]; exists {
191 | 			return storage
192 | 		}
193 | 	}
194 | 
195 | 	// If there's a default configuration, use that
196 | 	if c.DefaultStorage != "" {
197 | 		if storage, exists := c.CloudStorages[c.DefaultStorage]; exists {
198 | 			return storage
199 | 		}
200 | 	}
201 | 
202 | 	// If any configuration is available, return the first one found
203 | 	if len(c.CloudStorages) > 0 {
204 | 		for _, storage := range c.CloudStorages {
205 | 			return storage
206 | 		}
207 | 	}
208 | 
209 | 	// Return default empty configuration
210 | 	return DefaultCloudConfig
211 | }
212 | 


--------------------------------------------------------------------------------
/cmd/lint.go:
--------------------------------------------------------------------------------
  1 | package cmd
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 	"strings"
  9 | 
 10 | 	"github.com/samzong/mdctl/internal/linter"
 11 | 	"github.com/spf13/cobra"
 12 | )
 13 | 
 14 | var (
 15 | 	autoFix      bool
 16 | 	configRules  []string
 17 | 	outputFormat string
 18 | 	rulesFile    string
 19 | 	enableRules  []string
 20 | 	disableRules []string
 21 | 	initConfig   bool
 22 | 	configOutput string
 23 | )
 24 | 
 25 | var lintCmd = &cobra.Command{
 26 | 	Use:   "lint [files...]",
 27 | 	Short: "Lint markdown files for syntax issues",
 28 | 	Long: `Lint markdown files using markdownlint rules to find syntax issues.
 29 | 
 30 | This command will scan markdown files and report any syntax issues found.
 31 | It can also automatically fix issues when --fix flag is used.
 32 | 
 33 | Examples:
 34 |   # Lint a single file
 35 |   mdctl lint README.md
 36 | 
 37 |   # Lint multiple files
 38 |   mdctl lint docs/*.md
 39 | 
 40 |   # Lint with auto-fix
 41 |   mdctl lint --fix README.md
 42 | 
 43 |   # Lint with custom rules configuration
 44 |   mdctl lint --config .markdownlint.json README.md
 45 | 
 46 |   # Enable specific rules
 47 |   mdctl lint --enable MD001,MD003 README.md
 48 | 
 49 |   # Disable specific rules
 50 |   mdctl lint --disable MD013,MD033 README.md
 51 | 
 52 |   # Create a default configuration file
 53 |   mdctl lint --init
 54 | 
 55 |   # Create a configuration file with custom name
 56 |   mdctl lint --init --init-config my-rules.json`,
 57 | 	RunE: func(cmd *cobra.Command, args []string) error {
 58 | 		// Handle config initialization
 59 | 		if initConfig {
 60 | 			configFile := configOutput
 61 | 			if configFile == "" {
 62 | 				configFile = ".markdownlint.json"
 63 | 			}
 64 | 
 65 | 			if err := linter.CreateDefaultConfig(configFile); err != nil {
 66 | 				return fmt.Errorf("failed to create config file: %v", err)
 67 | 			}
 68 | 
 69 | 			fmt.Printf("Created markdownlint configuration file: %s\n", configFile)
 70 | 			return nil
 71 | 		}
 72 | 
 73 | 		if len(args) == 0 {
 74 | 			return fmt.Errorf("at least one markdown file must be specified")
 75 | 		}
 76 | 
 77 | 		// Expand file patterns
 78 | 		var files []string
 79 | 		for _, arg := range args {
 80 | 			// Basic security validation - prevent path traversal
 81 | 			if strings.Contains(arg, "..") {
 82 | 				return fmt.Errorf("path traversal not allowed: %s", arg)
 83 | 			}
 84 | 
 85 | 			matches, err := filepath.Glob(arg)
 86 | 			if err != nil {
 87 | 				return fmt.Errorf("invalid file pattern %s: %v", arg, err)
 88 | 			}
 89 | 			if len(matches) == 0 {
 90 | 				// If no glob matches, check if it's a direct file
 91 | 				if _, err := os.Stat(arg); err == nil {
 92 | 					files = append(files, arg)
 93 | 				} else {
 94 | 					fmt.Printf("Warning: No files found matching pattern: %s\n", arg)
 95 | 				}
 96 | 			} else {
 97 | 				files = append(files, matches...)
 98 | 			}
 99 | 		}
100 | 
101 | 		// Filter for markdown files
102 | 		var markdownFiles []string
103 | 		for _, file := range files {
104 | 			if strings.HasSuffix(strings.ToLower(file), ".md") || strings.HasSuffix(strings.ToLower(file), ".markdown") {
105 | 				markdownFiles = append(markdownFiles, file)
106 | 			}
107 | 		}
108 | 
109 | 		if len(markdownFiles) == 0 {
110 | 			return fmt.Errorf("no markdown files found")
111 | 		}
112 | 
113 | 		// Create linter configuration
114 | 		config := &linter.Config{
115 | 			AutoFix:      autoFix,
116 | 			OutputFormat: outputFormat,
117 | 			RulesFile:    rulesFile,
118 | 			EnableRules:  enableRules,
119 | 			DisableRules: disableRules,
120 | 			Verbose:      verbose,
121 | 		}
122 | 
123 | 		// Create linter instance
124 | 		mdLinter := linter.New(config)
125 | 
126 | 		// Process files
127 | 		var totalIssues int
128 | 		var totalFixed int
129 | 
130 | 		for _, file := range markdownFiles {
131 | 			if verbose {
132 | 				fmt.Printf("Linting: %s\n", file)
133 | 			}
134 | 
135 | 			result, err := mdLinter.LintFile(file)
136 | 			if err != nil {
137 | 				fmt.Printf("Error linting %s: %v\n", file, err)
138 | 				continue
139 | 			}
140 | 
141 | 			totalIssues += len(result.Issues)
142 | 			totalFixed += result.FixedCount
143 | 
144 | 			// Display results based on output format
145 | 			if err := displayResults(file, result, config); err != nil {
146 | 				return fmt.Errorf("error displaying results: %v", err)
147 | 			}
148 | 		}
149 | 
150 | 		// Summary
151 | 		if verbose || len(markdownFiles) > 1 {
152 | 			fmt.Printf("\nSummary:\n")
153 | 			fmt.Printf("  Files processed: %d\n", len(markdownFiles))
154 | 			fmt.Printf("  Total issues: %d\n", totalIssues)
155 | 			if autoFix {
156 | 				fmt.Printf("  Issues fixed: %d\n", totalFixed)
157 | 			}
158 | 		}
159 | 
160 | 		// Exit with error code if issues found and not in fix mode
161 | 		if totalIssues > 0 && !autoFix {
162 | 			os.Exit(1)
163 | 		}
164 | 
165 | 		return nil
166 | 	},
167 | }
168 | 
169 | func displayResults(filename string, result *linter.Result, config *linter.Config) error {
170 | 	switch config.OutputFormat {
171 | 	case "json":
172 | 		return displayJSONResults(filename, result)
173 | 	case "github":
174 | 		return displayGitHubResults(filename, result)
175 | 	default:
176 | 		return displayDefaultResults(filename, result, config)
177 | 	}
178 | }
179 | 
180 | func displayDefaultResults(filename string, result *linter.Result, config *linter.Config) error {
181 | 	if len(result.Issues) == 0 {
182 | 		if config.Verbose {
183 | 			fmt.Printf("✓ %s: No issues found\n", filename)
184 | 		}
185 | 		return nil
186 | 	}
187 | 
188 | 	fmt.Printf("%s:\n", filename)
189 | 	for _, issue := range result.Issues {
190 | 		status := "✗"
191 | 		if issue.Fixed {
192 | 			status = "✓"
193 | 		}
194 | 
195 | 		fmt.Printf("  %s Line %d: %s (%s)\n",
196 | 			status, issue.Line, issue.Message, issue.Rule)
197 | 
198 | 		if config.Verbose && issue.Context != "" {
199 | 			fmt.Printf("    Context: %s\n", issue.Context)
200 | 		}
201 | 	}
202 | 
203 | 	if config.AutoFix && result.FixedCount > 0 {
204 | 		fmt.Printf("  Fixed %d issues\n", result.FixedCount)
205 | 	}
206 | 
207 | 	return nil
208 | }
209 | 
210 | func displayJSONResults(filename string, result *linter.Result) error {
211 | 	output := map[string]interface{}{
212 | 		"filename":    result.Filename,
213 | 		"issues":      result.Issues,
214 | 		"fixed_count": result.FixedCount,
215 | 	}
216 | 
217 | 	data, err := json.MarshalIndent(output, "", "  ")
218 | 	if err != nil {
219 | 		return err
220 | 	}
221 | 
222 | 	fmt.Println(string(data))
223 | 	return nil
224 | }
225 | 
226 | func displayGitHubResults(filename string, result *linter.Result) error {
227 | 	// GitHub Actions workflow commands format
228 | 	for _, issue := range result.Issues {
229 | 		level := "error"
230 | 		if issue.Fixed {
231 | 			level = "notice"
232 | 		}
233 | 
234 | 		fmt.Printf("::%s file=%s,line=%d::%s (%s)\n",
235 | 			level, filename, issue.Line, issue.Message, issue.Rule)
236 | 	}
237 | 	return nil
238 | }
239 | 
240 | func init() {
241 | 	lintCmd.Flags().BoolVar(&autoFix, "fix", false, "Automatically fix issues where possible")
242 | 	lintCmd.Flags().StringVar(&outputFormat, "format", "default", "Output format: default, json, github")
243 | 	lintCmd.Flags().StringVar(&rulesFile, "config", "", "Path to markdownlint configuration file")
244 | 	lintCmd.Flags().StringSliceVar(&enableRules, "enable", []string{}, "Enable specific rules (comma-separated)")
245 | 	lintCmd.Flags().StringSliceVar(&disableRules, "disable", []string{}, "Disable specific rules (comma-separated)")
246 | 	lintCmd.Flags().BoolVar(&initConfig, "init", false, "Create a default .markdownlint.json configuration file")
247 | 	lintCmd.Flags().StringVar(&configOutput, "init-config", "", "Path for the configuration file when using --init (default: .markdownlint.json)")
248 | 
249 | 	lintCmd.GroupID = "core"
250 | }
251 | 


--------------------------------------------------------------------------------
/cmd/upload.go:
--------------------------------------------------------------------------------
  1 | package cmd
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"strings"
  6 | 
  7 | 	"github.com/samzong/mdctl/internal/config"
  8 | 	"github.com/samzong/mdctl/internal/uploader"
  9 | 	"github.com/spf13/cobra"
 10 | )
 11 | 
 12 | var (
 13 | 	// Upload command flags
 14 | 	uploadSourceFile     string
 15 | 	uploadSourceDir      string
 16 | 	uploadProvider       string
 17 | 	uploadBucket         string
 18 | 	uploadCustomDomain   string
 19 | 	uploadPathPrefix     string
 20 | 	uploadDryRun         bool
 21 | 	uploadConcurrency    int
 22 | 	uploadForceUpload    bool
 23 | 	uploadSkipVerify     bool
 24 | 	uploadCACertPath     string
 25 | 	uploadConflictPolicy string
 26 | 	uploadCacheDir       string
 27 | 	uploadIncludeExts    string
 28 | 	uploadStorageName    string
 29 | 
 30 | 	uploadCmd = &cobra.Command{
 31 | 		Use:   "upload",
 32 | 		Short: "Upload local images in markdown files to cloud storage",
 33 | 		Long: `Upload local images in markdown files to cloud storage and rewrite URLs.
 34 | Supports multiple cloud storage providers with S3-compatible APIs.
 35 | 
 36 | Examples:
 37 |   mdctl upload -d docs/
 38 |   mdctl upload -f post.md
 39 |   mdctl upload -f post.md --storage my-s3`,
 40 | 		RunE: func(cmd *cobra.Command, args []string) error {
 41 | 			if uploadSourceFile == "" && uploadSourceDir == "" {
 42 | 				return fmt.Errorf("either source file (-f) or source directory (-d) must be specified")
 43 | 			}
 44 | 			if uploadSourceFile != "" && uploadSourceDir != "" {
 45 | 				return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)")
 46 | 			}
 47 | 
 48 | 			// Load configuration file first
 49 | 			cfg, err := config.LoadConfig()
 50 | 			if err != nil {
 51 | 				return fmt.Errorf("failed to load config: %v", err)
 52 | 			}
 53 | 
 54 | 			// Get active cloud storage configuration
 55 | 			cloudConfig := cfg.GetActiveCloudConfig(uploadStorageName)
 56 | 
 57 | 			// Command line parameters take precedence over configuration
 58 | 			if uploadProvider == "" {
 59 | 				uploadProvider = cloudConfig.Provider
 60 | 			}
 61 | 
 62 | 			if uploadBucket == "" {
 63 | 				uploadBucket = cloudConfig.Bucket
 64 | 			}
 65 | 
 66 | 			// Check for empty values after using configuration file values
 67 | 			if uploadProvider == "" {
 68 | 				return fmt.Errorf("provider (-p) must be specified or set in configuration file")
 69 | 			}
 70 | 
 71 | 			if uploadBucket == "" {
 72 | 				return fmt.Errorf("bucket (-b) must be specified or set in configuration file")
 73 | 			}
 74 | 
 75 | 			// Set default region for S3-compatible services
 76 | 			// If region is not set or empty, set default region
 77 | 			if cloudConfig.Region == "" {
 78 | 				switch strings.ToLower(uploadProvider) {
 79 | 				case "s3":
 80 | 					// For AWS S3, default to us-east-1
 81 | 					cloudConfig.Region = "us-east-1"
 82 | 				case "r2", "minio", "b2":
 83 | 					// For S3-compatible services, region can be any value but must be provided
 84 | 					cloudConfig.Region = "auto"
 85 | 				}
 86 | 			}
 87 | 
 88 | 			// If not specified in command line, get other configuration parameters
 89 | 			if uploadCustomDomain == "" {
 90 | 				uploadCustomDomain = cloudConfig.CustomDomain
 91 | 			}
 92 | 
 93 | 			if uploadPathPrefix == "" {
 94 | 				uploadPathPrefix = cloudConfig.PathPrefix
 95 | 			}
 96 | 
 97 | 			if uploadConcurrency == 5 && cloudConfig.Concurrency != 0 { // 5 is default value
 98 | 				uploadConcurrency = cloudConfig.Concurrency
 99 | 			}
100 | 
101 | 			if uploadCACertPath == "" {
102 | 				uploadCACertPath = cloudConfig.CACertPath
103 | 			}
104 | 
105 | 			if uploadSkipVerify == false && cloudConfig.SkipVerify {
106 | 				uploadSkipVerify = true
107 | 			}
108 | 
109 | 			if uploadConflictPolicy == "rename" && cloudConfig.ConflictPolicy != "" {
110 | 				uploadConflictPolicy = cloudConfig.ConflictPolicy
111 | 			}
112 | 
113 | 			if uploadCacheDir == "" {
114 | 				uploadCacheDir = cloudConfig.CacheDir
115 | 			}
116 | 
117 | 			// Parse include extensions
118 | 			var exts []string
119 | 			if uploadIncludeExts != "" {
120 | 				exts = strings.Split(uploadIncludeExts, ",")
121 | 				for i, ext := range exts {
122 | 					exts[i] = strings.TrimSpace(ext)
123 | 				}
124 | 			}
125 | 
126 | 			// Validate conflict policy
127 | 			var conflictPolicy uploader.ConflictPolicy
128 | 			switch strings.ToLower(uploadConflictPolicy) {
129 | 			case "rename":
130 | 				conflictPolicy = uploader.ConflictPolicyRename
131 | 			case "version":
132 | 				conflictPolicy = uploader.ConflictPolicyVersion
133 | 			case "overwrite":
134 | 				conflictPolicy = uploader.ConflictPolicyOverwrite
135 | 			case "":
136 | 				conflictPolicy = uploader.ConflictPolicyRename // Default
137 | 			default:
138 | 				return fmt.Errorf("invalid conflict policy: %s (must be rename, version, or overwrite)", uploadConflictPolicy)
139 | 			}
140 | 
141 | 			// For R2, use account ID from configuration file
142 | 			if strings.ToLower(uploadProvider) == "r2" && cloudConfig.AccountID == "" {
143 | 				fmt.Printf("Note: R2 account ID not found in configuration, please set account_id in config file if you want to use r2.dev public URLs\n")
144 | 			}
145 | 
146 | 			// Create uploader
147 | 			up, err := uploader.New(uploader.UploaderConfig{
148 | 				SourceFile:     uploadSourceFile,
149 | 				SourceDir:      uploadSourceDir,
150 | 				Provider:       uploadProvider,
151 | 				Bucket:         uploadBucket,
152 | 				CustomDomain:   uploadCustomDomain,
153 | 				PathPrefix:     uploadPathPrefix,
154 | 				DryRun:         uploadDryRun,
155 | 				Concurrency:    uploadConcurrency,
156 | 				ForceUpload:    uploadForceUpload,
157 | 				SkipVerify:     uploadSkipVerify,
158 | 				CACertPath:     uploadCACertPath,
159 | 				ConflictPolicy: conflictPolicy,
160 | 				CacheDir:       uploadCacheDir,
161 | 				FileExtensions: exts,
162 | 			})
163 | 			if err != nil {
164 | 				return fmt.Errorf("failed to create uploader: %v", err)
165 | 			}
166 | 
167 | 			// Process files
168 | 			stats, err := up.Process()
169 | 			if err != nil {
170 | 				return fmt.Errorf("failed to process files: %v", err)
171 | 			}
172 | 
173 | 			// Print statistics
174 | 			fmt.Printf("\nUpload Statistics:\n")
175 | 			fmt.Printf("  Total Files Processed: %d\n", stats.ProcessedFiles)
176 | 			fmt.Printf("  Images Uploaded: %d\n", stats.UploadedImages)
177 | 			fmt.Printf("  Images Skipped: %d\n", stats.SkippedImages)
178 | 			fmt.Printf("  Failed Uploads: %d\n", stats.FailedImages)
179 | 			fmt.Printf("  Files Changed: %d\n", stats.ChangedFiles)
180 | 
181 | 			return nil
182 | 		},
183 | 	}
184 | )
185 | 
186 | func init() {
187 | 	// Add flags
188 | 	uploadCmd.Flags().StringVarP(&uploadSourceFile, "file", "f", "", "Source markdown file to process")
189 | 	uploadCmd.Flags().StringVarP(&uploadSourceDir, "dir", "d", "", "Source directory containing markdown files to process")
190 | 	uploadCmd.Flags().StringVarP(&uploadProvider, "provider", "p", "", "Cloud storage provider (s3, r2, minio)")
191 | 	uploadCmd.Flags().StringVarP(&uploadBucket, "bucket", "b", "", "Cloud storage bucket name")
192 | 	uploadCmd.Flags().StringVarP(&uploadCustomDomain, "custom-domain", "c", "", "Custom domain for generated URLs")
193 | 	uploadCmd.Flags().StringVar(&uploadPathPrefix, "prefix", "", "Path prefix for uploaded files")
194 | 	uploadCmd.Flags().BoolVar(&uploadDryRun, "dry-run", false, "Preview changes without uploading")
195 | 	uploadCmd.Flags().IntVar(&uploadConcurrency, "concurrency", 5, "Number of concurrent uploads")
196 | 	uploadCmd.Flags().BoolVarP(&uploadForceUpload, "force", "F", false, "Force upload even if file exists")
197 | 	uploadCmd.Flags().BoolVar(&uploadSkipVerify, "skip-verify", false, "Skip SSL verification")
198 | 	uploadCmd.Flags().StringVar(&uploadCACertPath, "ca-cert", "", "Path to CA certificate")
199 | 	uploadCmd.Flags().StringVar(&uploadConflictPolicy, "conflict", "rename", "Conflict policy (rename, version, overwrite)")
200 | 	uploadCmd.Flags().StringVar(&uploadCacheDir, "cache-dir", "", "Cache directory path")
201 | 	uploadCmd.Flags().StringVar(&uploadIncludeExts, "include", "", "Comma-separated list of file extensions to include")
202 | 	uploadCmd.Flags().StringVar(&uploadStorageName, "storage", "", "Storage name to use")
203 | }
204 | 


--------------------------------------------------------------------------------
/docs/features/upload.md:
--------------------------------------------------------------------------------
  1 | ## Design Document: Image Upload Feature for mdctl
  2 | 
  3 | ### Overview
  4 | 
  5 | Add a new feature to mdctl that uploads local images in markdown files to cloud storage services (S3-compatible APIs like Cloudflare R2, AWS S3, etc.) and rewrites the URLs in the markdown content.
  6 | 
  7 | ### Goals
  8 | 
  9 | 1. Upload local images to cloud storage services
 10 | 2. Support multiple storage providers with S3-compatible APIs
 11 | 3. Rewrite image URLs in markdown files to point to the cloud storage
 12 | 4. Maintain the existing design patterns and code structure
 13 | 5. Implement idempotent operations with content verification
 14 | 6. Support concurrent uploads for performance optimization
 15 | 7. Handle custom SSL certificates for various cloud providers
 16 | 
 17 | ### Architecture
 18 | 
 19 | Following the existing architecture pattern of mdctl, the upload feature will be implemented with these components:
 20 | 
 21 | #### 1. Command Layer (`cmd/upload.go`)
 22 | 
 23 | - Define CLI parameters:
 24 |   - Source file/directory (`-f/--file` or `-d/--dir`)
 25 |   - Cloud provider (`-p/--provider`)
 26 |   - Bucket name (`-b/--bucket`)
 27 |   - Custom domain (optional, `-c/--custom-domain`)
 28 |   - Path prefix (optional, `--prefix`)
 29 |   - File extensions to include (optional, `--include`)
 30 |   - Dry run mode (optional, `--dry-run`)
 31 |   - Concurrency level (optional, `--concurrency`)
 32 |   - Force upload (optional, `-F/--force`)
 33 |   - Skip SSL verification (optional, `--skip-verify`)
 34 |   - CA certificate path (optional, `--ca-cert`)
 35 |   - Conflict policy (optional, `--conflict=rename|version|overwrite`)
 36 |   - Cache directory (optional, `--cache-dir`)
 37 | 
 38 | - Validate input parameters
 39 | - Create and configure uploader component
 40 | - Add to the "core" command group alongside download and translate
 41 | 
 42 | #### 2. Uploader Module (`internal/uploader/uploader.go`)
 43 | 
 44 | - Core business logic for uploading files
 45 | - Methods for:
 46 |   - Processing single files or directories recursively
 47 |   - Identifying local images in markdown
 48 |   - Uploading files to cloud storage
 49 |   - Rewriting URLs in markdown content
 50 |   - Generating appropriate cloud storage paths
 51 |   - Managing the worker pool for concurrent uploads
 52 |   - Tracking upload progress with statistics
 53 |   - Calculating and verifying content hashes
 54 |   - Handling conflict resolution
 55 |   - Managing the local cache of uploaded files
 56 | 
 57 | #### 3. Storage Provider Interface (`internal/storage/provider.go`)
 58 | 
 59 | - Define a provider interface with methods:
 60 |   - `Upload(localPath, remotePath string, metadata map[string]string) (url string, err error)`
 61 |   - `Configure(config CloudConfig) error`
 62 |   - `GetPublicURL(remotePath string) string`
 63 |   - `ObjectExists(remotePath string) (bool, error)`
 64 |   - `CompareHash(remotePath, localHash string) (bool, error)`
 65 |   - `SetObjectMetadata(remotePath string, metadata map[string]string) error`
 66 |   - `GetObjectMetadata(remotePath string) (map[string]string, error)`
 67 | 
 68 | #### 4. Storage Provider Implementations
 69 | 
 70 | - S3-compatible provider (`internal/storage/s3.go`):
 71 |   - Implementation for AWS S3, Cloudflare R2, Minio, etc.
 72 |   - Configure region, endpoint, credentials
 73 |   - Handle authentication and uploads
 74 |   - Support custom certificates and SSL verification options
 75 |   - Implement content verification with ETag/MD5 hash comparison
 76 |   - Support object tagging for metadata
 77 | 
 78 | #### 5. Cache Management (`internal/cache/cache.go`)
 79 | 
 80 | - Maintain record of uploaded files with their hash values
 81 | - Cache structure with file path, remote URL, and hash
 82 | - Support for serializing/deserializing cache to disk
 83 | - Methods for lookup, update, and verification
 84 | 
 85 | #### 6. Configuration Extensions (`internal/config/config.go`)
 86 | 
 87 | Add new configuration fields:
 88 | ```go
 89 | type CloudConfig struct {
 90 |     Provider       string            `json:"provider"`
 91 |     Region         string            `json:"region"`
 92 |     Endpoint       string            `json:"endpoint"`
 93 |     AccessKey      string            `json:"access_key"`
 94 |     SecretKey      string            `json:"secret_key"`
 95 |     Bucket         string            `json:"bucket"`
 96 |     CustomDomain   string            `json:"custom_domain,omitempty"`
 97 |     PathPrefix     string            `json:"path_prefix,omitempty"`
 98 |     ProviderOpts   map[string]string `json:"provider_opts,omitempty"`
 99 |     Concurrency    int               `json:"concurrency"`
100 |     SkipVerify     bool              `json:"skip_verify"`
101 |     CACertPath     string            `json:"ca_cert_path,omitempty"`
102 |     ConflictPolicy string            `json:"conflict_policy"`
103 |     CacheDir       string            `json:"cache_dir,omitempty"`
104 | }
105 | 
106 | // Add to Config struct
107 | type Config struct {
108 |     // Existing fields...
109 |     CloudStorage CloudConfig `json:"cloud_storage"`
110 | }
111 | ```
112 | 
113 | ### Implementation Plan
114 | 
115 | 1. Add cloud storage config section to config.go
116 | 2. Implement cache management module
117 | 3. Create storage provider interface 
118 | 4. Implement S3-compatible provider with SSL handling
119 | 5. Create worker pool for concurrent uploads
120 | 6. Create uploader module implementation with verification logic
121 | 7. Implement idempotency and conflict resolution strategies  
122 | 8. Add upload command to cmd package
123 | 9. Create comprehensive tests
124 | 10. Update help text and documentation
125 | 11. Add sample usage to README
126 | 
127 | ### Command Usage Examples
128 | 
129 | ```bash
130 | # Upload images from a single file
131 | mdctl upload -f path/to/file.md -p s3 -b my-bucket
132 | 
133 | # Upload images from a directory
134 | mdctl upload -d path/to/dir -p r2 -b my-images --prefix blog/
135 | 
136 | # Use with a custom domain
137 | mdctl upload -f post.md -p s3 -b media-bucket -c assets.example.com
138 | 
139 | # Use custom concurrency setting
140 | mdctl upload -f blog-post.md -p s3 -b my-bucket --concurrency 10
141 | 
142 | # Force upload (bypass hash verification)
143 | mdctl upload -f readme.md -p r2 -b my-images -F
144 | 
145 | # Specify conflict resolution strategy
146 | mdctl upload -d docs/ -p s3 -b media --conflict=version
147 | 
148 | # Use custom SSL certificate
149 | mdctl upload -f doc.md -p s3 -b media --ca-cert /path/to/cert.pem
150 | 
151 | # Skip SSL verification for self-signed certificates
152 | mdctl upload -f doc.md -p minio -b local --skip-verify
153 | 
154 | # Configure cloud provider
155 | mdctl config set -k cloud_storage.provider -v "r2"
156 | mdctl config set -k cloud_storage.endpoint -v "https://xxxx.r2.cloudflarestorage.com"
157 | mdctl config set -k cloud_storage.access_key -v "YOUR_ACCESS_KEY"
158 | mdctl config set -k cloud_storage.secret_key -v "YOUR_SECRET_KEY"
159 | mdctl config set -k cloud_storage.bucket -v "my-images"
160 | mdctl config set -k cloud_storage.concurrency -v 5
161 | mdctl config set -k cloud_storage.conflict_policy -v "rename"
162 | ```
163 | 
164 | ### Technical Considerations
165 | 
166 | 1. **S3 SDK**: Use the AWS SDK for Go to interact with S3-compatible APIs
167 | 2. **Image Processing**: Optional compression/resizing before upload
168 | 3. **Error Handling**: Provide detailed error messages for failed uploads
169 | 4. **URL Generation**:
170 |    - Support both direct S3 URLs or custom domain URLs
171 |    - Handle path prefixing correctly
172 | 5. **Idempotency & Verification**:
173 |    - Calculate content hashes (MD5/SHA) for each file
174 |    - Store metadata in the object tags for verification
175 |    - Skip uploads for identical content (check hash before upload)
176 |    - Optional force upload flag to override verification
177 |    - Maintain a local cache of uploaded files with their hashes
178 | 6. **Concurrency & Reliability**:
179 |    - Implement worker pool for parallel uploads
180 |    - Configurable concurrency level (default: 5)
181 |    - Progress tracking for concurrent operations
182 |    - Built-in retry mechanism for failed uploads (hardcoded 3 retry attempts)
183 |    - Exponential backoff between retries (starting at 1s, doubling each retry)
184 |    - Standard timeout for upload operations
185 | 7. **SSL/Certificate Handling**:
186 |    - Support custom CA certificates
187 |    - Option to skip verification for self-signed certificates
188 |    - Configurable TLS settings per provider
189 | 8. **Conflict Resolution**:
190 |    - Strategies for handling name collisions (rename, version, overwrite)
191 |    - Option to preserve original filenames or use hashed names
192 | 9. **Incremental Uploads**:
193 |    - Track already uploaded files to avoid redundant operations
194 |    - Support for resuming interrupted batch uploads
195 | 
196 | ### Testing Strategy
197 | 
198 | 1. Unit tests for URL parsing and rewriting
199 | 2. Mocked storage provider for testing upload logic
200 | 3. Verification tests for hash calculation and comparison
201 | 4. Concurrency tests to ensure worker pool functions correctly
202 | 5. SSL/TLS configuration tests with mock certificates
203 | 6. Cache management tests for serialization/deserialization
204 | 7. Conflict resolution strategy tests
205 | 8. Integration tests with a local MinIO server
206 | 9. End-to-end tests with actual markdown files
207 | 10. Idempotency tests to verify repeated executions
208 | 11. Performance benchmarks for concurrent uploads


--------------------------------------------------------------------------------
/internal/exporter/sitereader/mkdocs.go:
--------------------------------------------------------------------------------
  1 | package sitereader
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"log"
  7 | 	"os"
  8 | 	"path/filepath"
  9 | 	"regexp"
 10 | 	"strings"
 11 | 
 12 | 	"gopkg.in/yaml.v3"
 13 | )
 14 | 
 15 | type MkDocsReader struct {
 16 | 	Logger *log.Logger
 17 | }
 18 | 
 19 | type MkDocsConfig struct {
 20 | 	Docs    []string `yaml:"nav"`
 21 | 	DocsDir string   `yaml:"docs_dir"`
 22 | 	Inherit string   `yaml:"INHERIT"`
 23 | }
 24 | 
 25 | func (r *MkDocsReader) Detect(dir string) bool {
 26 | 	// Setting up the Logger
 27 | 	if r.Logger == nil {
 28 | 		r.Logger = log.New(io.Discard, "", 0)
 29 | 	}
 30 | 
 31 | 	// Check if mkdocs.yml file exists
 32 | 	mkdocsPath := filepath.Join(dir, "mkdocs.yml")
 33 | 	if _, err := os.Stat(mkdocsPath); os.IsNotExist(err) {
 34 | 		// Try mkdocs.yaml
 35 | 		mkdocsPath = filepath.Join(dir, "mkdocs.yaml")
 36 | 		if _, err := os.Stat(mkdocsPath); os.IsNotExist(err) {
 37 | 			r.Logger.Printf("No mkdocs.yml or mkdocs.yaml found in %s", dir)
 38 | 			return false
 39 | 		}
 40 | 	}
 41 | 
 42 | 	r.Logger.Printf("Found MkDocs configuration file: %s", mkdocsPath)
 43 | 	return true
 44 | }
 45 | 
 46 | func (r *MkDocsReader) ReadStructure(dir string, configPath string, navPath string) ([]string, error) {
 47 | 	// Setting up the Logger
 48 | 	if r.Logger == nil {
 49 | 		r.Logger = log.New(io.Discard, "", 0)
 50 | 	}
 51 | 
 52 | 	r.Logger.Printf("Reading MkDocs site structure from: %s", dir)
 53 | 	if navPath != "" {
 54 | 		r.Logger.Printf("Filtering by navigation path: %s", navPath)
 55 | 	}
 56 | 
 57 | 	// Find config file
 58 | 	if configPath == "" {
 59 | 		configNames := []string{"mkdocs.yml", "mkdocs.yaml"}
 60 | 		var err error
 61 | 		configPath, err = FindConfigFile(dir, configNames)
 62 | 		if err != nil {
 63 | 			r.Logger.Printf("Failed to find MkDocs config file: %s", err)
 64 | 			return nil, fmt.Errorf("failed to find MkDocs config file: %s", err)
 65 | 		}
 66 | 	}
 67 | 	r.Logger.Printf("Using config file: %s", configPath)
 68 | 
 69 | 	// Read and parse config file, including handling INHERIT
 70 | 	config, err := r.readAndMergeConfig(configPath, dir)
 71 | 	if err != nil {
 72 | 		r.Logger.Printf("Failed to read config file: %s", err)
 73 | 		return nil, fmt.Errorf("failed to read config file: %s", err)
 74 | 	}
 75 | 
 76 | 	// Get docs directory
 77 | 	docsDir := "docs"
 78 | 	if docsDirValue, ok := config["docs_dir"]; ok {
 79 | 		if docsDirStr, ok := docsDirValue.(string); ok {
 80 | 			docsDir = docsDirStr
 81 | 		}
 82 | 	}
 83 | 	docsDir = filepath.Join(dir, docsDir)
 84 | 	r.Logger.Printf("Using docs directory: %s", docsDir)
 85 | 
 86 | 	// Parse navigation structure
 87 | 	var nav interface{}
 88 | 	if navValue, ok := config["nav"]; ok {
 89 | 		nav = navValue
 90 | 	} else {
 91 | 		// If no navigation config, try to find all Markdown files
 92 | 		r.Logger.Println("No navigation configuration found, searching for all markdown files")
 93 | 		return getAllMarkdownFiles(docsDir)
 94 | 	}
 95 | 
 96 | 	// Parse navigation structure, get file list
 97 | 	files, err := parseNavigation(nav, docsDir, navPath)
 98 | 	if err != nil {
 99 | 		r.Logger.Printf("Failed to parse navigation: %s", err)
100 | 		return nil, fmt.Errorf("failed to parse navigation: %s", err)
101 | 	}
102 | 
103 | 	r.Logger.Printf("Found %d files in navigation", len(files))
104 | 	return files, nil
105 | }
106 | 
107 | // readAndMergeConfig Read and merge MkDocs config file, handling INHERIT directive
108 | func (r *MkDocsReader) readAndMergeConfig(configPath string, baseDir string) (map[string]interface{}, error) {
109 | 	r.Logger.Printf("Reading and merging config file: %s", configPath)
110 | 
111 | 	// Read main config file
112 | 	configData, err := os.ReadFile(configPath)
113 | 	if err != nil {
114 | 		r.Logger.Printf("Failed to read MkDocs config file: %s", err)
115 | 		return nil, fmt.Errorf("failed to read MkDocs config file: %s", err)
116 | 	}
117 | 
118 | 	// Parse config file
119 | 	var config map[string]interface{}
120 | 	if err := yaml.Unmarshal(configData, &config); err != nil {
121 | 		r.Logger.Printf("Failed to parse MkDocs config file: %s", err)
122 | 		return nil, fmt.Errorf("failed to parse MkDocs config file: %s", err)
123 | 	}
124 | 
125 | 	// Check if there's an INHERIT directive
126 | 	inheritValue, hasInherit := config["INHERIT"]
127 | 	if !hasInherit {
128 | 		// No inherit, return current config
129 | 		return config, nil
130 | 	}
131 | 
132 | 	// Handle INHERIT directive
133 | 	inheritPath, ok := inheritValue.(string)
134 | 	if !ok {
135 | 		r.Logger.Printf("Invalid INHERIT value, expected string but got: %T", inheritValue)
136 | 		return nil, fmt.Errorf("invalid INHERIT value, expected string")
137 | 	}
138 | 
139 | 	r.Logger.Printf("Found INHERIT directive pointing to: %s", inheritPath)
140 | 
141 | 	// Parse inherit path, may be relative to current config file
142 | 	configDir := filepath.Dir(configPath)
143 | 	inheritFullPath := filepath.Join(configDir, inheritPath)
144 | 
145 | 	// Read inherited config file
146 | 	inheritConfig, err := r.readAndMergeConfig(inheritFullPath, baseDir)
147 | 	if err != nil {
148 | 		return nil, fmt.Errorf("failed to read inherited config file %s: %s", inheritFullPath, err)
149 | 	}
150 | 
151 | 	// Merge config, current config takes precedence
152 | 	mergedConfig := make(map[string]interface{})
153 | 
154 | 	// Copy inherit config first
155 | 	for k, v := range inheritConfig {
156 | 		mergedConfig[k] = v
157 | 	}
158 | 
159 | 	// Override current config
160 | 	for k, v := range config {
161 | 		if k != "INHERIT" { // Don't copy INHERIT directive
162 | 			mergedConfig[k] = v
163 | 		}
164 | 	}
165 | 
166 | 	r.Logger.Printf("Successfully merged config with inherited file")
167 | 	return mergedConfig, nil
168 | }
169 | 
170 | // preprocessMarkdownFile Preprocess Markdown file, remove YAML front matter that may cause problems
171 | func preprocessMarkdownFile(filePath string) error {
172 | 	// Read file content
173 | 	content, err := os.ReadFile(filePath)
174 | 	if err != nil {
175 | 		return err
176 | 	}
177 | 
178 | 	// Check if there's YAML front matter
179 | 	contentStr := string(content)
180 | 	yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)
181 | 
182 | 	// If there's YAML front matter, remove it
183 | 	if yamlFrontMatterRegex.MatchString(contentStr) {
184 | 		// Create temp file
185 | 		tempFile, err := os.CreateTemp("", "mdctl-*.md")
186 | 		if err != nil {
187 | 			return err
188 | 		}
189 | 		tempFilePath := tempFile.Name()
190 | 		tempFile.Close()
191 | 
192 | 		// Remove YAML front matter
193 | 		processedContent := yamlFrontMatterRegex.ReplaceAllString(contentStr, "")
194 | 
195 | 		// Write processed content to temp file
196 | 		if err := os.WriteFile(tempFilePath, []byte(processedContent), 0644); err != nil {
197 | 			os.Remove(tempFilePath)
198 | 			return err
199 | 		}
200 | 
201 | 		// Replace original file
202 | 		if err := os.Rename(tempFilePath, filePath); err != nil {
203 | 			os.Remove(tempFilePath)
204 | 			return err
205 | 		}
206 | 	}
207 | 
208 | 	return nil
209 | }
210 | 
211 | // parseNavigation Parse MkDocs navigation structure
212 | func parseNavigation(nav interface{}, docsDir string, navPath string) ([]string, error) {
213 | 	var files []string
214 | 
215 | 	switch v := nav.(type) {
216 | 	case []interface{}:
217 | 		// Navigation is a list
218 | 		for _, item := range v {
219 | 			itemFiles, err := parseNavigation(item, docsDir, navPath)
220 | 			if err != nil {
221 | 				return nil, err
222 | 			}
223 | 			files = append(files, itemFiles...)
224 | 		}
225 | 	case map[string]interface{}:
226 | 		// Navigation is a map
227 | 		for title, value := range v {
228 | 			// If nav path is specified, check if current node title matches
229 | 			if navPath != "" {
230 | 				// Support simple path matching, e.g. "Section1/Subsection2"
231 | 				navParts := strings.Split(navPath, "/")
232 | 				if strings.TrimSpace(title) == strings.TrimSpace(navParts[0]) {
233 | 					// If it's a multi-level path, continue matching the next level
234 | 					if len(navParts) > 1 {
235 | 						subNavPath := strings.Join(navParts[1:], "/")
236 | 						itemFiles, err := parseNavigation(value, docsDir, subNavPath)
237 | 						if err != nil {
238 | 							return nil, err
239 | 						}
240 | 						files = append(files, itemFiles...)
241 | 						continue
242 | 					} else {
243 | 						// If it's a single-level path and matches, only handle this node
244 | 						itemFiles, err := parseNavigation(value, docsDir, "")
245 | 						if err != nil {
246 | 							return nil, err
247 | 						}
248 | 						files = append(files, itemFiles...)
249 | 						continue
250 | 					}
251 | 				} else {
252 | 					// Title doesn't match, skip this node
253 | 					continue
254 | 				}
255 | 			}
256 | 
257 | 			// If no nav path is specified or already matched the path, handle normally
258 | 			itemFiles, err := parseNavigation(value, docsDir, "")
259 | 			if err != nil {
260 | 				return nil, err
261 | 			}
262 | 			files = append(files, itemFiles...)
263 | 		}
264 | 	case string:
265 | 		// Navigation item is a file path
266 | 		if strings.HasSuffix(v, ".md") {
267 | 			filePath := filepath.Join(docsDir, v)
268 | 			if _, err := os.Stat(filePath); err == nil {
269 | 				// If no nav path is specified or already handled in nav path filtering, add file
270 | 				if navPath == "" {
271 | 					files = append(files, filePath)
272 | 				}
273 | 			}
274 | 		}
275 | 	}
276 | 
277 | 	return files, nil
278 | }
279 | 
280 | // getAllMarkdownFiles Get all Markdown files in a directory
281 | func getAllMarkdownFiles(dir string) ([]string, error) {
282 | 	var files []string
283 | 
284 | 	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
285 | 		if err != nil {
286 | 			return err
287 | 		}
288 | 		if !info.IsDir() {
289 | 			ext := strings.ToLower(filepath.Ext(path))
290 | 			if ext == ".md" || ext == ".markdown" {
291 | 				files = append(files, path)
292 | 			}
293 | 		}
294 | 		return nil
295 | 	})
296 | 
297 | 	if err != nil {
298 | 		return nil, fmt.Errorf("failed to walk directory %s: %s", dir, err)
299 | 	}
300 | 
301 | 	return files, nil
302 | }
303 | 


--------------------------------------------------------------------------------
/internal/exporter/exporter.go:
--------------------------------------------------------------------------------
  1 | package exporter
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"log"
  7 | 	"os"
  8 | 	"path/filepath"
  9 | 	"sort"
 10 | 	"strings"
 11 | 
 12 | 	"github.com/samzong/mdctl/internal/exporter/sitereader"
 13 | )
 14 | 
 15 | // ExportOptions defines export options
 16 | type ExportOptions struct {
 17 | 	Template            string      // Word template file path
 18 | 	GenerateToc         bool        // Whether to generate table of contents
 19 | 	ShiftHeadingLevelBy int         // Heading level offset
 20 | 	FileAsTitle         bool        // Whether to use filename as section title
 21 | 	Format              string      // Output format (docx, pdf, epub)
 22 | 	SiteType            string      // Site type (mkdocs, hugo, docusaurus)
 23 | 	Verbose             bool        // Whether to enable verbose logging
 24 | 	Logger              *log.Logger // Logger
 25 | 	SourceDirs          []string    // List of source directories for processing image paths
 26 | 	TocDepth            int         // Table of contents depth, default is 3
 27 | 	NavPath             string      // Specified navigation path to export
 28 | }
 29 | 
 30 | // Exporter defines exporter interface
 31 | type Exporter interface {
 32 | 	Export(input string, output string, options ExportOptions) error
 33 | }
 34 | 
 35 | // DefaultExporter is the default exporter implementation
 36 | type DefaultExporter struct {
 37 | 	pandocPath string
 38 | 	logger     *log.Logger
 39 | }
 40 | 
 41 | // NewExporter creates a new exporter
 42 | func NewExporter() *DefaultExporter {
 43 | 	return &DefaultExporter{
 44 | 		pandocPath: "pandoc", // Default to pandoc in system PATH
 45 | 		logger:     log.New(os.Stdout, "[EXPORTER] ", log.LstdFlags),
 46 | 	}
 47 | }
 48 | 
 49 | // ExportFile exports a single Markdown file
 50 | func (e *DefaultExporter) ExportFile(input, output string, options ExportOptions) error {
 51 | 	// Set logger
 52 | 	if options.Logger != nil {
 53 | 		e.logger = options.Logger
 54 | 	} else if !options.Verbose {
 55 | 		e.logger = log.New(io.Discard, "", 0)
 56 | 	}
 57 | 
 58 | 	e.logger.Printf("Exporting file: %s -> %s", input, output)
 59 | 
 60 | 	// Check if file exists
 61 | 	if _, err := os.Stat(input); os.IsNotExist(err) {
 62 | 		e.logger.Printf("Error: input file does not exist: %s", input)
 63 | 		return fmt.Errorf("input file does not exist: %s", input)
 64 | 	}
 65 | 	e.logger.Printf("Input file exists: %s", input)
 66 | 
 67 | 	// Create output directory (if it doesn't exist)
 68 | 	outputDir := filepath.Dir(output)
 69 | 	if err := os.MkdirAll(outputDir, 0755); err != nil {
 70 | 		e.logger.Printf("Error: failed to create output directory: %s", err)
 71 | 		return fmt.Errorf("failed to create output directory: %s", err)
 72 | 	}
 73 | 	e.logger.Printf("Output directory created/verified: %s", outputDir)
 74 | 
 75 | 	// Add source directory to SourceDirs
 76 | 	sourceDir := filepath.Dir(input)
 77 | 	if options.SourceDirs == nil {
 78 | 		options.SourceDirs = []string{sourceDir}
 79 | 	} else {
 80 | 		// Check if already exists
 81 | 		found := false
 82 | 		for _, dir := range options.SourceDirs {
 83 | 			if dir == sourceDir {
 84 | 				found = true
 85 | 				break
 86 | 			}
 87 | 		}
 88 | 		if !found {
 89 | 			options.SourceDirs = append(options.SourceDirs, sourceDir)
 90 | 		}
 91 | 	}
 92 | 	e.logger.Printf("Added source directory to resource paths: %s", sourceDir)
 93 | 
 94 | 	// Use Pandoc to export
 95 | 	e.logger.Println("Starting Pandoc export process...")
 96 | 	pandocExporter := &PandocExporter{
 97 | 		PandocPath: e.pandocPath,
 98 | 		Logger:     e.logger,
 99 | 	}
100 | 	err := pandocExporter.Export(input, output, options)
101 | 	if err != nil {
102 | 		e.logger.Printf("Pandoc export failed: %s", err)
103 | 		return err
104 | 	}
105 | 
106 | 	e.logger.Printf("File export completed successfully: %s", output)
107 | 	return nil
108 | }
109 | 
110 | // ExportDirectory exports Markdown files in a directory
111 | func (e *DefaultExporter) ExportDirectory(inputDir, output string, options ExportOptions) error {
112 | 	// Set logger
113 | 	if options.Logger != nil {
114 | 		e.logger = options.Logger
115 | 	} else if !options.Verbose {
116 | 		e.logger = log.New(io.Discard, "", 0)
117 | 	}
118 | 
119 | 	e.logger.Printf("Exporting directory: %s -> %s", inputDir, output)
120 | 
121 | 	// Check if directory exists
122 | 	if _, err := os.Stat(inputDir); os.IsNotExist(err) {
123 | 		e.logger.Printf("Error: input directory does not exist: %s", inputDir)
124 | 		return fmt.Errorf("input directory does not exist: %s", inputDir)
125 | 	}
126 | 	e.logger.Printf("Input directory exists: %s", inputDir)
127 | 
128 | 	// Create output directory (if it doesn't exist)
129 | 	outputDir := filepath.Dir(output)
130 | 	if err := os.MkdirAll(outputDir, 0755); err != nil {
131 | 		e.logger.Printf("Error: failed to create output directory: %s", err)
132 | 		return fmt.Errorf("failed to create output directory: %s", err)
133 | 	}
134 | 	e.logger.Printf("Output directory created/verified: %s", outputDir)
135 | 
136 | 	// Initialize SourceDirs (if nil)
137 | 	if options.SourceDirs == nil {
138 | 		options.SourceDirs = []string{inputDir}
139 | 	} else {
140 | 		// Check if already exists
141 | 		found := false
142 | 		for _, dir := range options.SourceDirs {
143 | 			if dir == inputDir {
144 | 				found = true
145 | 				break
146 | 			}
147 | 		}
148 | 		if !found {
149 | 			options.SourceDirs = append(options.SourceDirs, inputDir)
150 | 		}
151 | 	}
152 | 	e.logger.Printf("Added input directory to resource paths: %s", inputDir)
153 | 
154 | 	// Depending on site type, choose different processing
155 | 	var files []string
156 | 	var err error
157 | 
158 | 	if options.SiteType != "" && options.SiteType != "basic" {
159 | 		// Use site reader to get file list
160 | 		e.logger.Printf("Using site reader for site type: %s", options.SiteType)
161 | 		reader, err := sitereader.GetSiteReader(options.SiteType, options.Verbose, e.logger)
162 | 		if err != nil {
163 | 			e.logger.Printf("Error getting site reader: %s", err)
164 | 			return err
165 | 		}
166 | 
167 | 		// Detect if it's the specified type of site
168 | 		e.logger.Printf("Detecting if directory is a %s site...", options.SiteType)
169 | 		if !reader.Detect(inputDir) {
170 | 			e.logger.Printf("Error: directory %s does not appear to be a %s site", inputDir, options.SiteType)
171 | 			return fmt.Errorf("directory %s does not appear to be a %s site", inputDir, options.SiteType)
172 | 		}
173 | 		e.logger.Printf("Directory confirmed as %s site", options.SiteType)
174 | 
175 | 		e.logger.Println("Reading site structure...")
176 | 		files, err = reader.ReadStructure(inputDir, "", options.NavPath)
177 | 		if err != nil {
178 | 			e.logger.Printf("Error reading site structure: %s", err)
179 | 			return err
180 | 		}
181 | 		e.logger.Printf("Found %d files in site structure", len(files))
182 | 	} else {
183 | 		// Basic directory mode: sort files by name
184 | 		e.logger.Println("Using basic directory mode, sorting files by name")
185 | 		files, err = GetMarkdownFilesInDir(inputDir)
186 | 		if err != nil {
187 | 			e.logger.Printf("Error getting markdown files: %s", err)
188 | 			return err
189 | 		}
190 | 		e.logger.Printf("Found %d markdown files in directory", len(files))
191 | 	}
192 | 
193 | 	if len(files) == 0 {
194 | 		e.logger.Printf("Error: no markdown files found in directory: %s", inputDir)
195 | 		return fmt.Errorf("no markdown files found in directory: %s", inputDir)
196 | 	}
197 | 
198 | 	// If there's only one file, export directly
199 | 	if len(files) == 1 {
200 | 		e.logger.Printf("Only one file found, exporting directly: %s", files[0])
201 | 		return e.ExportFile(files[0], output, options)
202 | 	}
203 | 
204 | 	// Merge multiple files
205 | 	e.logger.Printf("Merging %d files...", len(files))
206 | 	merger := &Merger{
207 | 		ShiftHeadingLevelBy: options.ShiftHeadingLevelBy,
208 | 		FileAsTitle:         options.FileAsTitle,
209 | 		Logger:              e.logger,
210 | 		SourceDirs:          make([]string, 0),
211 | 		Verbose:             options.Verbose,
212 | 	}
213 | 
214 | 	// Create temporary file
215 | 	e.logger.Println("Creating temporary file for merged content...")
216 | 	tempFile, err := os.CreateTemp("", "mdctl-merged-*.md")
217 | 	if err != nil {
218 | 		e.logger.Printf("Error creating temporary file: %s", err)
219 | 		return fmt.Errorf("failed to create temporary file: %s", err)
220 | 	}
221 | 	tempFilePath := tempFile.Name()
222 | 	tempFile.Close()
223 | 	defer os.Remove(tempFilePath)
224 | 	e.logger.Printf("Temporary file created: %s", tempFilePath)
225 | 
226 | 	// Merge files
227 | 	e.logger.Println("Merging files...")
228 | 	if err := merger.Merge(files, tempFilePath); err != nil {
229 | 		e.logger.Printf("Error merging files: %s", err)
230 | 		return fmt.Errorf("failed to merge files: %s", err)
231 | 	}
232 | 	e.logger.Println("Files merged successfully")
233 | 
234 | 	// Add merger collected source directories to options
235 | 	if merger.SourceDirs != nil && len(merger.SourceDirs) > 0 {
236 | 		e.logger.Printf("Adding %d source directories from merger", len(merger.SourceDirs))
237 | 		for _, dir := range merger.SourceDirs {
238 | 			// Check if already exists
239 | 			found := false
240 | 			for _, existingDir := range options.SourceDirs {
241 | 				if existingDir == dir {
242 | 					found = true
243 | 					break
244 | 				}
245 | 			}
246 | 			if !found {
247 | 				options.SourceDirs = append(options.SourceDirs, dir)
248 | 				e.logger.Printf("Added source directory: %s", dir)
249 | 			}
250 | 		}
251 | 	}
252 | 
253 | 	// Export merged file
254 | 	e.logger.Println("Starting Pandoc export process...")
255 | 	pandocExporter := &PandocExporter{
256 | 		PandocPath: e.pandocPath,
257 | 		Logger:     e.logger,
258 | 	}
259 | 	err = pandocExporter.Export(tempFilePath, output, options)
260 | 	if err != nil {
261 | 		e.logger.Printf("Pandoc export failed: %s", err)
262 | 		return err
263 | 	}
264 | 
265 | 	e.logger.Printf("Directory export completed successfully: %s", output)
266 | 	return nil
267 | }
268 | 
269 | // SiteReader defines site reader interface
270 | type SiteReader interface {
271 | 	// Detect if given directory is this type of site
272 | 	Detect(dir string) bool
273 | 	// Read site structure, return sorted list of files
274 | 	ReadStructure(dir string, configPath string) ([]string, error)
275 | }
276 | 
277 | // GetMarkdownFilesInDir gets all Markdown files in a directory and sorts them by filename
278 | func GetMarkdownFilesInDir(dir string) ([]string, error) {
279 | 	// Check if directory exists
280 | 	info, err := os.Stat(dir)
281 | 	if err != nil {
282 | 		return nil, err
283 | 	}
284 | 	if !info.IsDir() {
285 | 		return nil, fmt.Errorf("%s is not a directory", dir)
286 | 	}
287 | 
288 | 	// Recursively find all Markdown files
289 | 	var files []string
290 | 	err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
291 | 		if err != nil {
292 | 			return err
293 | 		}
294 | 		if !info.IsDir() {
295 | 			ext := strings.ToLower(filepath.Ext(path))
296 | 			if ext == ".md" || ext == ".markdown" {
297 | 				files = append(files, path)
298 | 			}
299 | 		}
300 | 		return nil
301 | 	})
302 | 
303 | 	if err != nil {
304 | 		return nil, fmt.Errorf("failed to walk directory %s: %s", dir, err)
305 | 	}
306 | 
307 | 	// Sort by filename
308 | 	sort.Strings(files)
309 | 
310 | 	return files, nil
311 | }
312 | 


--------------------------------------------------------------------------------
/internal/exporter/merger.go:
--------------------------------------------------------------------------------
  1 | package exporter
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"log"
  8 | 	"os"
  9 | 	"path/filepath"
 10 | 	"regexp"
 11 | 	"strings"
 12 | 	"unicode/utf8"
 13 | 
 14 | 	"golang.org/x/text/encoding/simplifiedchinese"
 15 | 	"golang.org/x/text/transform"
 16 | )
 17 | 
 18 | // Merger Merge multiple Markdown files
 19 | type Merger struct {
 20 | 	ShiftHeadingLevelBy int
 21 | 	FileAsTitle         bool
 22 | 	Logger              *log.Logger
 23 | 	// Store all source directories, used to set Pandoc's resource paths
 24 | 	SourceDirs []string
 25 | 	// Whether to enable verbose logging
 26 | 	Verbose bool
 27 | }
 28 | 
 29 | // Merge Merge multiple Markdown files into a single target file
 30 | func (m *Merger) Merge(sources []string, target string) error {
 31 | 	// If no logger is provided, create a default one
 32 | 	if m.Logger == nil {
 33 | 		if m.Verbose {
 34 | 			m.Logger = log.New(os.Stdout, "[MERGER] ", log.LstdFlags)
 35 | 		} else {
 36 | 			m.Logger = log.New(io.Discard, "", 0)
 37 | 		}
 38 | 	}
 39 | 
 40 | 	if len(sources) == 0 {
 41 | 		m.Logger.Println("Error: no source files provided")
 42 | 		return fmt.Errorf("no source files provided")
 43 | 	}
 44 | 
 45 | 	m.Logger.Printf("Merging %d files into: %s", len(sources), target)
 46 | 	var mergedContent strings.Builder
 47 | 
 48 | 	// Initialize source directory list
 49 | 	m.SourceDirs = make([]string, 0, len(sources))
 50 | 	sourceDirsMap := make(map[string]bool) // Used for deduplication
 51 | 
 52 | 	// Process each source file
 53 | 	for i, source := range sources {
 54 | 		m.Logger.Printf("Processing file %d/%d: %s", i+1, len(sources), source)
 55 | 
 56 | 		// Get source file's directory and add to list (deduplication)
 57 | 		sourceDir := filepath.Dir(source)
 58 | 		if !sourceDirsMap[sourceDir] {
 59 | 			sourceDirsMap[sourceDir] = true
 60 | 			m.SourceDirs = append(m.SourceDirs, sourceDir)
 61 | 		}
 62 | 
 63 | 		// Read file content
 64 | 		content, err := os.ReadFile(source)
 65 | 		if err != nil {
 66 | 			m.Logger.Printf("Error reading file %s: %s", source, err)
 67 | 			return fmt.Errorf("failed to read file %s: %s", source, err)
 68 | 		}
 69 | 
 70 | 		// Process content
 71 | 		processedContent := string(content)
 72 | 
 73 | 		// Ensure content is valid UTF-8
 74 | 		if !utf8.ValidString(processedContent) {
 75 | 			m.Logger.Printf("File %s contains invalid UTF-8, attempting to convert from GBK", source)
 76 | 			// Attempt to convert content from GBK to UTF-8
 77 | 			reader := transform.NewReader(bytes.NewReader(content), simplifiedchinese.GBK.NewDecoder())
 78 | 			decodedContent, err := io.ReadAll(reader)
 79 | 			if err != nil {
 80 | 				m.Logger.Printf("Failed to decode content from file %s: %s", source, err)
 81 | 				return fmt.Errorf("failed to decode content from file %s: %s", source, err)
 82 | 			}
 83 | 			processedContent = string(decodedContent)
 84 | 			m.Logger.Printf("Successfully converted content from GBK to UTF-8")
 85 | 		}
 86 | 
 87 | 		// Remove YAML front matter
 88 | 		m.Logger.Println("Removing YAML front matter...")
 89 | 		processedContent = removeYAMLFrontMatter(processedContent)
 90 | 
 91 | 		// Process image paths
 92 | 		m.Logger.Println("Processing image paths...")
 93 | 		processedContent, err = processImagePaths(processedContent, source, m.Logger, m.Verbose)
 94 | 		if err != nil {
 95 | 			m.Logger.Printf("Error processing image paths: %s", err)
 96 | 			return fmt.Errorf("failed to process image paths: %s", err)
 97 | 		}
 98 | 
 99 | 		// Adjust heading levels
100 | 		if m.ShiftHeadingLevelBy != 0 {
101 | 			m.Logger.Printf("Shifting heading levels by %d", m.ShiftHeadingLevelBy)
102 | 			processedContent = ShiftHeadings(processedContent, m.ShiftHeadingLevelBy)
103 | 		}
104 | 
105 | 		// Add filename as title
106 | 		if m.FileAsTitle {
107 | 			filename := filepath.Base(source)
108 | 			m.Logger.Printf("Adding filename as title: %s", filename)
109 | 			processedContent = AddTitleFromFilename(processedContent, filename, 1+m.ShiftHeadingLevelBy)
110 | 		}
111 | 
112 | 		// Add to merged content
113 | 		m.Logger.Printf("Adding processed content to merged result (length: %d bytes)", len(processedContent))
114 | 		mergedContent.WriteString(processedContent)
115 | 
116 | 		// If not the last file, add separator
117 | 		if i < len(sources)-1 {
118 | 			mergedContent.WriteString("\n\n")
119 | 		}
120 | 	}
121 | 
122 | 	// Final content
123 | 	finalContent := mergedContent.String()
124 | 
125 | 	// Check again for any YAML-related issues
126 | 	m.Logger.Println("Sanitizing final content...")
127 | 	finalContent = sanitizeContent(finalContent)
128 | 
129 | 	// Write target file, ensuring UTF-8 encoding
130 | 	m.Logger.Printf("Writing merged content to target file: %s (size: %d bytes)", target, len(finalContent))
131 | 	err := os.WriteFile(target, []byte(finalContent), 0644)
132 | 	if err != nil {
133 | 		m.Logger.Printf("Error writing merged content: %s", err)
134 | 		return fmt.Errorf("failed to write merged content to %s: %s", target, err)
135 | 	}
136 | 
137 | 	m.Logger.Printf("Successfully merged %d files into: %s", len(sources), target)
138 | 	return nil
139 | }
140 | 
141 | // processImagePaths Process image paths in Markdown, converting relative paths to paths relative to the command execution location
142 | func processImagePaths(content, sourcePath string, logger *log.Logger, verbose bool) (string, error) {
143 | 	// If no logger is provided, create a default one
144 | 	if logger == nil {
145 | 		if verbose {
146 | 			logger = log.New(os.Stdout, "[IMAGE] ", log.LstdFlags)
147 | 		} else {
148 | 			logger = log.New(io.Discard, "", 0)
149 | 		}
150 | 	}
151 | 
152 | 	// Get source file's directory
153 | 	sourceDir := filepath.Dir(sourcePath)
154 | 	if verbose {
155 | 		logger.Printf("Processing image paths: source file directory = %s", sourceDir)
156 | 	}
157 | 
158 | 	// Get current working directory (location of command execution)
159 | 	workingDir, err := os.Getwd()
160 | 	if err != nil {
161 | 		return "", fmt.Errorf("unable to get current working directory: %v", err)
162 | 	}
163 | 	if verbose {
164 | 		logger.Printf("Current working directory = %s", workingDir)
165 | 	}
166 | 
167 | 	// Get absolute path of source file's directory
168 | 	absSourceDir, err := filepath.Abs(sourceDir)
169 | 	if err != nil {
170 | 		return "", fmt.Errorf("unable to get absolute path of source file's directory: %v", err)
171 | 	}
172 | 	if verbose {
173 | 		logger.Printf("Source file's directory absolute path = %s", absSourceDir)
174 | 	}
175 | 
176 | 	// Match Markdown image syntax: ![alt](path)
177 | 	imageRegex := regexp.MustCompile(`!\[(.*?)\]\((.*?)\)`)
178 | 
179 | 	// Replace all image paths
180 | 	processedContent := imageRegex.ReplaceAllStringFunc(content, func(match string) string {
181 | 		// Extract image path
182 | 		submatches := imageRegex.FindStringSubmatch(match)
183 | 		if len(submatches) < 3 {
184 | 			return match // If match is incorrect, keep as-is
185 | 		}
186 | 
187 | 		altText := submatches[1]
188 | 		imagePath := submatches[2]
189 | 		if verbose {
190 | 			logger.Printf("Found image: alt = %s, path = %s", altText, imagePath)
191 | 		}
192 | 
193 | 		// If image is a web image (starts with http:// or https://), keep as-is
194 | 		if strings.HasPrefix(imagePath, "http://") || strings.HasPrefix(imagePath, "https://") {
195 | 			if verbose {
196 | 				logger.Printf("Keeping web image path: %s", imagePath)
197 | 			}
198 | 			return match
199 | 		}
200 | 
201 | 		// Parse image's absolute path
202 | 		var absoluteImagePath string
203 | 		if filepath.IsAbs(imagePath) {
204 | 			absoluteImagePath = imagePath
205 | 		} else {
206 | 			// For relative paths, convert to absolute path first
207 | 			absoluteImagePath = filepath.Join(absSourceDir, imagePath)
208 | 		}
209 | 		if verbose {
210 | 			logger.Printf("Image path: relative path = %s, absolute path = %s", imagePath, absoluteImagePath)
211 | 		}
212 | 
213 | 		// Check if image file exists
214 | 		if _, err := os.Stat(absoluteImagePath); os.IsNotExist(err) {
215 | 			if verbose {
216 | 				logger.Printf("Image does not exist: %s", absoluteImagePath)
217 | 			}
218 | 			// Image does not exist, try to find it in adjacent directories
219 | 			// For example, if path is ../images/image.png, try to find it in the images subdirectory of the parent directory of the source file's directory
220 | 			if strings.HasPrefix(imagePath, "../") {
221 | 				parentDir := filepath.Dir(absSourceDir)
222 | 				relPath := strings.TrimPrefix(imagePath, "../")
223 | 				alternativePath := filepath.Join(parentDir, relPath)
224 | 				if verbose {
225 | 					logger.Printf("Trying alternative path: %s", alternativePath)
226 | 				}
227 | 				if _, err := os.Stat(alternativePath); err == nil {
228 | 					absoluteImagePath = alternativePath
229 | 					if verbose {
230 | 						logger.Printf("Found image in alternative path: %s", absoluteImagePath)
231 | 					}
232 | 				} else {
233 | 					// Still not found, keep as-is
234 | 					if verbose {
235 | 						logger.Printf("Image does not exist in alternative path: %s", alternativePath)
236 | 					}
237 | 					return match
238 | 				}
239 | 			} else {
240 | 				// Image not found, keep as-is
241 | 				return match
242 | 			}
243 | 		}
244 | 
245 | 		// Calculate image's path relative to current working directory
246 | 		relPath, err := filepath.Rel(workingDir, absoluteImagePath)
247 | 		if err != nil {
248 | 			if verbose {
249 | 				logger.Printf("Unable to calculate relative path, keeping original path: %s, error: %v", imagePath, err)
250 | 			}
251 | 			return match
252 | 		}
253 | 
254 | 		// Update image reference with path relative to current working directory
255 | 		newRef := fmt.Sprintf("![%s](%s)", altText, relPath)
256 | 		if verbose {
257 | 			logger.Printf("Updating image reference: %s -> %s", match, newRef)
258 | 		}
259 | 		return newRef
260 | 	})
261 | 
262 | 	return processedContent, nil
263 | }
264 | 
265 | // removeYAMLFrontMatter Remove YAML front matter
266 | func removeYAMLFrontMatter(content string) string {
267 | 	// Match YAML front matter
268 | 	yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)
269 | 	return yamlFrontMatterRegex.ReplaceAllString(content, "")
270 | }
271 | 
272 | // sanitizeContent Clean content, removing content that may cause Pandoc parsing errors
273 | func sanitizeContent(content string) string {
274 | 	// Remove lines that may cause YAML parsing errors
275 | 	lines := strings.Split(content, "\n")
276 | 	var cleanedLines []string
277 | 
278 | 	for _, line := range lines {
279 | 		// Skip lines that may cause YAML parsing errors
280 | 		if strings.Contains(line, ":") && !strings.Contains(line, ": ") && !strings.HasPrefix(line, "    ") && !strings.HasPrefix(line, "\t") {
281 | 			// In this case, there should be a space after the colon, but there isn't, which may cause YAML parsing errors
282 | 			// Try to fix it
283 | 			fixedLine := strings.Replace(line, ":", ": ", 1)
284 | 			cleanedLines = append(cleanedLines, fixedLine)
285 | 		} else if strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "- ") && len(line) > 1 {
286 | 			// In this case, there should be a space after the dash, but there isn't, which may cause YAML parsing errors
287 | 			// Try to fix it
288 | 			fixedLine := strings.Replace(line, "-", "- ", 1)
289 | 			cleanedLines = append(cleanedLines, fixedLine)
290 | 		} else {
291 | 			cleanedLines = append(cleanedLines, line)
292 | 		}
293 | 	}
294 | 
295 | 	return strings.Join(cleanedLines, "\n")
296 | }
297 | 


--------------------------------------------------------------------------------
/internal/exporter/pandoc.go:
--------------------------------------------------------------------------------
  1 | package exporter
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"log"
  7 | 	"os"
  8 | 	"os/exec"
  9 | 	"path/filepath"
 10 | 	"regexp"
 11 | 	"strings"
 12 | )
 13 | 
 14 | // PandocExporter Use Pandoc to export Markdown files
 15 | type PandocExporter struct {
 16 | 	PandocPath string
 17 | 	Logger     *log.Logger
 18 | }
 19 | 
 20 | // Export Use Pandoc to export Markdown files
 21 | func (e *PandocExporter) Export(input, output string, options ExportOptions) error {
 22 | 	// If no logger is provided, create a default one
 23 | 	if e.Logger == nil {
 24 | 		if options.Verbose {
 25 | 			e.Logger = log.New(os.Stdout, "[PANDOC] ", log.LstdFlags)
 26 | 		} else {
 27 | 			e.Logger = log.New(io.Discard, "", 0)
 28 | 		}
 29 | 	}
 30 | 
 31 | 	e.Logger.Printf("Starting Pandoc export: %s -> %s", input, output)
 32 | 
 33 | 	// Ensure output path is absolute
 34 | 	absOutput, err := filepath.Abs(output)
 35 | 	if err != nil {
 36 | 		e.Logger.Printf("Failed to get absolute path for output: %s", err)
 37 | 		return fmt.Errorf("failed to get absolute path for output: %s", err)
 38 | 	}
 39 | 	e.Logger.Printf("Using absolute output path: %s", absOutput)
 40 | 
 41 | 	// Create a temporary file for sanitized content
 42 | 	e.Logger.Println("Creating sanitized copy of input file...")
 43 | 	tempFile, err := createSanitizedCopy(input, e.Logger)
 44 | 	if err != nil {
 45 | 		e.Logger.Printf("Failed to create sanitized copy: %s", err)
 46 | 		return fmt.Errorf("failed to create sanitized copy: %s", err)
 47 | 	}
 48 | 	defer os.Remove(tempFile)
 49 | 	e.Logger.Printf("Sanitized copy created: %s", tempFile)
 50 | 
 51 | 	// Build Pandoc command arguments
 52 | 	e.Logger.Println("Building Pandoc command arguments...")
 53 | 	args := []string{
 54 | 		tempFile,
 55 | 		"-o", absOutput,
 56 | 		"--standalone",
 57 | 		"--pdf-engine=xelatex",
 58 | 		"-V", "mainfont=SimSun", // Use SimSun as the main font
 59 | 		"--wrap=preserve",
 60 | 		"--embed-resources", // Embed resources into output file
 61 | 	}
 62 | 
 63 | 	// Add resource path parameters, helping Pandoc find images
 64 | 	// Collect all possible resource paths
 65 | 	resourcePaths := make(map[string]bool)
 66 | 
 67 | 	// Add input file directory
 68 | 	inputDir := filepath.Dir(input)
 69 | 	resourcePaths[inputDir] = true
 70 | 	e.Logger.Printf("Added input file directory to resource paths: %s", inputDir)
 71 | 
 72 | 	// Add current working directory
 73 | 	workingDir, err := os.Getwd()
 74 | 	if err == nil {
 75 | 		resourcePaths[workingDir] = true
 76 | 		e.Logger.Printf("Added current working directory to resource paths: %s", workingDir)
 77 | 	}
 78 | 
 79 | 	// Add output file directory
 80 | 	outputDir := filepath.Dir(absOutput)
 81 | 	resourcePaths[outputDir] = true
 82 | 	e.Logger.Printf("Added output file directory to resource paths: %s", outputDir)
 83 | 
 84 | 	// Add source file directories to resource paths
 85 | 	if len(options.SourceDirs) > 0 {
 86 | 		for _, dir := range options.SourceDirs {
 87 | 			resourcePaths[dir] = true
 88 | 			e.Logger.Printf("Added source file directory to resource paths: %s", dir)
 89 | 		}
 90 | 	}
 91 | 
 92 | 	// Add all resource paths to Pandoc arguments
 93 | 	for path := range resourcePaths {
 94 | 		args = append(args, "--resource-path", path)
 95 | 	}
 96 | 
 97 | 	// Add template parameter
 98 | 	if options.Template != "" {
 99 | 		e.Logger.Printf("Using template: %s", options.Template)
100 | 		args = append(args, "--reference-doc", options.Template)
101 | 	}
102 | 
103 | 	// Add directory parameter
104 | 	if options.GenerateToc {
105 | 		e.Logger.Println("Generating table of contents")
106 | 		args = append(args, "--toc")
107 | 
108 | 		// Add directory depth parameter
109 | 		if options.TocDepth > 0 {
110 | 			e.Logger.Printf("Setting table of contents depth to: %d", options.TocDepth)
111 | 			args = append(args, "--toc-depth", fmt.Sprintf("%d", options.TocDepth))
112 | 		}
113 | 	}
114 | 
115 | 	// Add heading level offset parameter
116 | 	if options.ShiftHeadingLevelBy != 0 {
117 | 		e.Logger.Printf("Shifting heading levels by: %d", options.ShiftHeadingLevelBy)
118 | 		args = append(args, "--shift-heading-level-by", fmt.Sprintf("%d", options.ShiftHeadingLevelBy))
119 | 	}
120 | 
121 | 	// Add specific parameters based on output format
122 | 	e.Logger.Printf("Using output format: %s", options.Format)
123 | 	switch options.Format {
124 | 	case "pdf":
125 | 		// PDF format needs special handling for Chinese
126 | 		e.Logger.Println("Adding PDF-specific parameters for CJK support")
127 | 		args = append(args,
128 | 			"-V", "CJKmainfont=SimSun", // CJK font settings
129 | 			"-V", "documentclass=article",
130 | 			"-V", "geometry=margin=1in")
131 | 	case "epub":
132 | 		// EPUB format specific parameters
133 | 		e.Logger.Println("Adding EPUB-specific parameters")
134 | 		args = append(args, "--epub-chapter-level=1")
135 | 	}
136 | 
137 | 	// Execute Pandoc command
138 | 	e.Logger.Printf("Executing Pandoc command: %s %s", e.PandocPath, strings.Join(args, " "))
139 | 	cmd := exec.Command(e.PandocPath, args...)
140 | 
141 | 	// Set working directory to input file directory, which helps Pandoc find relative paths for images
142 | 	cmd.Dir = inputDir
143 | 
144 | 	outputBytes, err := cmd.CombinedOutput()
145 | 	if err != nil {
146 | 		// If execution fails, try to look at input file content for debugging
147 | 		e.Logger.Printf("Pandoc execution failed: %s", err)
148 | 		e.Logger.Printf("Pandoc output: %s", string(outputBytes))
149 | 
150 | 		inputContent, readErr := os.ReadFile(tempFile)
151 | 		if readErr == nil {
152 | 			// Only show the first 500 characters to avoid too much output
153 | 			contentPreview := string(inputContent)
154 | 			if len(contentPreview) > 500 {
155 | 				contentPreview = contentPreview[:500] + "..."
156 | 			}
157 | 			e.Logger.Printf("Input file preview:\n%s", contentPreview)
158 | 			return fmt.Errorf("pandoc execution failed: %s\nOutput: %s\nCommand: %s\nInput file preview:\n%s",
159 | 				err, string(outputBytes), strings.Join(cmd.Args, " "), contentPreview)
160 | 		}
161 | 
162 | 		return fmt.Errorf("pandoc execution failed: %s\nOutput: %s\nCommand: %s",
163 | 			err, string(outputBytes), strings.Join(cmd.Args, " "))
164 | 	}
165 | 
166 | 	e.Logger.Printf("Pandoc export completed successfully: %s", output)
167 | 	return nil
168 | }
169 | 
170 | // createSanitizedCopy Create a sanitized temporary file copy
171 | func createSanitizedCopy(inputFile string, logger *log.Logger) (string, error) {
172 | 	if logger == nil {
173 | 		logger = log.New(io.Discard, "", 0)
174 | 	}
175 | 
176 | 	// Read input file content
177 | 	logger.Printf("Reading input file: %s", inputFile)
178 | 	content, err := os.ReadFile(inputFile)
179 | 	if err != nil {
180 | 		return "", fmt.Errorf("failed to read input file: %s", err)
181 | 	}
182 | 
183 | 	// Convert content to string
184 | 	contentStr := string(content)
185 | 
186 | 	// Remove YAML front matter
187 | 	logger.Println("Removing YAML front matter...")
188 | 	yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)
189 | 	if yamlFrontMatterRegex.MatchString(contentStr) {
190 | 		logger.Println("YAML front matter found, removing it")
191 | 		contentStr = yamlFrontMatterRegex.ReplaceAllString(contentStr, "")
192 | 	}
193 | 
194 | 	// Fix lines that may cause YAML parsing errors
195 | 	logger.Println("Fixing potential YAML parsing issues...")
196 | 	lines := strings.Split(contentStr, "\n")
197 | 	var cleanedLines []string
198 | 	fixedLines := 0
199 | 
200 | 	for _, line := range lines {
201 | 		// Skip lines that may cause YAML parsing errors
202 | 		if strings.Contains(line, ":") && !strings.Contains(line, ": ") && !strings.HasPrefix(line, "    ") && !strings.HasPrefix(line, "\t") {
203 | 			// In this case, there should be a space after the colon, but there isn't, which may cause YAML parsing errors
204 | 			// Try to fix it
205 | 			fixedLine := strings.Replace(line, ":", ": ", 1)
206 | 			cleanedLines = append(cleanedLines, fixedLine)
207 | 			fixedLines++
208 | 			logger.Printf("Fixed line with missing space after colon: %s -> %s", line, fixedLine)
209 | 		} else if strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "- ") && len(line) > 1 {
210 | 			// In this case, there should be a space after the dash, but there isn't, which may cause YAML parsing errors
211 | 			// Try to fix it
212 | 			fixedLine := strings.Replace(line, "-", "- ", 1)
213 | 			cleanedLines = append(cleanedLines, fixedLine)
214 | 			fixedLines++
215 | 			logger.Printf("Fixed line with missing space after dash: %s -> %s", line, fixedLine)
216 | 		} else {
217 | 			cleanedLines = append(cleanedLines, line)
218 | 		}
219 | 	}
220 | 
221 | 	logger.Printf("Fixed %d lines with potential YAML issues", fixedLines)
222 | 
223 | 	// Create a temporary file
224 | 	tempDir := os.TempDir()
225 | 	tempFilePath := filepath.Join(tempDir, "mdctl-sanitized-"+filepath.Base(inputFile))
226 | 
227 | 	// Write sanitized content to temporary file
228 | 	logger.Printf("Writing sanitized content to temporary file: %s", tempFilePath)
229 | 	err = os.WriteFile(tempFilePath, []byte(strings.Join(cleanedLines, "\n")), 0644)
230 | 	if err != nil {
231 | 		return "", err
232 | 	}
233 | 
234 | 	return tempFilePath, nil
235 | }
236 | 
237 | // preprocessInputFile Preprocess input file, removing content that may cause Pandoc parsing errors
238 | func preprocessInputFile(filePath string) error {
239 | 	// Read file content
240 | 	content, err := os.ReadFile(filePath)
241 | 	if err != nil {
242 | 		return err
243 | 	}
244 | 
245 | 	contentStr := string(content)
246 | 
247 | 	// Check for unconventional YAML front matter
248 | 	yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)
249 | 	if yamlFrontMatterRegex.MatchString(contentStr) {
250 | 		// Extract YAML front matter content
251 | 		matches := yamlFrontMatterRegex.FindStringSubmatch(contentStr)
252 | 		if len(matches) > 1 {
253 | 			yamlContent := matches[1]
254 | 
255 | 			// Check if YAML content has formatting issues
256 | 			if strings.Contains(yamlContent, "\n-") && !strings.Contains(yamlContent, "\n- ") {
257 | 				// Fix formatting issue: ensure there's a space after the dash
258 | 				fixedYaml := strings.ReplaceAll(yamlContent, "\n-", "\n- ")
259 | 				fixedContent := strings.Replace(contentStr, yamlContent, fixedYaml, 1)
260 | 
261 | 				// Write back to file
262 | 				return os.WriteFile(filePath, []byte(fixedContent), 0644)
263 | 			}
264 | 		}
265 | 
266 | 		// If YAML format has other issues, remove entire front matter
267 | 		processedContent := yamlFrontMatterRegex.ReplaceAllString(contentStr, "")
268 | 		return os.WriteFile(filePath, []byte(processedContent), 0644)
269 | 	}
270 | 
271 | 	return nil
272 | }
273 | 
274 | // CheckPandocAvailability Check if Pandoc is available
275 | func CheckPandocAvailability() error {
276 | 	cmd := exec.Command("pandoc", "--version")
277 | 	outputBytes, err := cmd.CombinedOutput()
278 | 	if err != nil {
279 | 		return fmt.Errorf("pandoc is not available: %s\n\nPlease install Pandoc to use the export feature:\n\n"+
280 | 			"macOS: brew install pandoc\n"+
281 | 			"Ubuntu/Debian: sudo apt-get install pandoc\n"+
282 | 			"Windows: choco install pandoc\n\n"+
283 | 			"For more information, visit: https://pandoc.org/installing.html", err)
284 | 	}
285 | 
286 | 	// Check version
287 | 	versionStr := string(outputBytes)
288 | 	if !strings.Contains(versionStr, "pandoc") {
289 | 		return fmt.Errorf("unexpected pandoc version output: %s", versionStr)
290 | 	}
291 | 
292 | 	return nil
293 | }
294 | 


--------------------------------------------------------------------------------
/internal/translator/translator.go:
--------------------------------------------------------------------------------
  1 | package translator
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/json"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"net/http"
  9 | 	"os"
 10 | 	"path/filepath"
 11 | 	"regexp"
 12 | 	"sort"
 13 | 	"strings"
 14 | 
 15 | 	"github.com/samzong/mdctl/internal/config"
 16 | 	"github.com/samzong/mdctl/internal/markdownfmt"
 17 | 	"gopkg.in/yaml.v3"
 18 | )
 19 | 
 20 | // SupportedLanguages defines the mapping of supported languages
 21 | var SupportedLanguages = map[string]string{
 22 | 	"zh": "中文",
 23 | 	"en": "English",
 24 | 	"ja": "日本語",
 25 | 	"ko": "한국어",
 26 | 	"fr": "Français",
 27 | 	"de": "Deutsch",
 28 | 	"es": "Español",
 29 | 	"it": "Italiano",
 30 | 	"ru": "Русский",
 31 | 	"pt": "Português",
 32 | 	"vi": "Tiếng Việt",
 33 | 	"th": "ไทย",
 34 | 	"ar": "العربية",
 35 | 	"hi": "हिन्दी",
 36 | }
 37 | 
 38 | // IsLanguageSupported checks if the language is supported
 39 | func IsLanguageSupported(lang string) bool {
 40 | 	_, ok := SupportedLanguages[lang]
 41 | 	return ok
 42 | }
 43 | 
 44 | // GetSupportedLanguages returns a list of supported languages
 45 | func GetSupportedLanguages() string {
 46 | 	var langs []string
 47 | 	for code, name := range SupportedLanguages {
 48 | 		langs = append(langs, fmt.Sprintf("%s (%s)", code, name))
 49 | 	}
 50 | 	sort.Strings(langs)
 51 | 	return strings.Join(langs, ", ")
 52 | }
 53 | 
 54 | type OpenAIMessage struct {
 55 | 	Role    string `json:"role"`
 56 | 	Content string `json:"content"`
 57 | }
 58 | 
 59 | type OpenAIRequest struct {
 60 | 	Model       string          `json:"model"`
 61 | 	Messages    []OpenAIMessage `json:"messages"`
 62 | 	Temperature float64         `json:"temperature"`
 63 | 	TopP        float64         `json:"top_p"`
 64 | }
 65 | 
 66 | type OpenAIResponse struct {
 67 | 	Choices []struct {
 68 | 		Message struct {
 69 | 			Content string `json:"content"`
 70 | 		} `json:"message"`
 71 | 	} `json:"choices"`
 72 | }
 73 | 
 74 | // Progress is used to track translation progress
 75 | type Progress struct {
 76 | 	Total      int
 77 | 	Current    int
 78 | 	SourceFile string
 79 | 	TargetFile string
 80 | }
 81 | 
 82 | // ProgressCallback defines the progress callback function type
 83 | type ProgressCallback func(progress Progress)
 84 | 
 85 | // Translator struct for the translator
 86 | type Translator struct {
 87 | 	config   *config.Config
 88 | 	format   bool
 89 | 	progress ProgressCallback
 90 | }
 91 | 
 92 | // New creates a new translator instance
 93 | func New(cfg *config.Config, format bool) *Translator {
 94 | 	return &Translator{
 95 | 		config: cfg,
 96 | 		format: format,
 97 | 		progress: func(p Progress) {
 98 | 			if p.Total > 1 {
 99 | 				fmt.Printf("Translating file [%d/%d]: %s\n", p.Current, p.Total, p.SourceFile)
100 | 			}
101 | 		},
102 | 	}
103 | }
104 | 
105 | var (
106 | 	// RegexPatterns defines patterns for removing special content blocks
107 | 	RegexPatterns = []struct {
108 | 		Pattern string
109 | 		Replace string
110 | 	}{
111 | 		{`(?s)<think>.*?</think>\n?`, ""}, // Remove ollama deepthink thinking process
112 | 	}
113 | )
114 | 
115 | // TranslateContent translates the content
116 | func (t *Translator) TranslateContent(content string, lang string) (string, error) {
117 | 	// Remove potential front matter
118 | 	content = removeFrontMatter(content)
119 | 
120 | 	prompt := strings.Replace(t.config.TranslatePrompt, "{TARGET_LANG}", lang, 1)
121 | 
122 | 	messages := []OpenAIMessage{
123 | 		{Role: "system", Content: prompt},
124 | 		{Role: "user", Content: content},
125 | 	}
126 | 
127 | 	reqBody := OpenAIRequest{
128 | 		Model:       t.config.ModelName,
129 | 		Messages:    messages,
130 | 		Temperature: t.config.Temperature,
131 | 		TopP:        t.config.TopP,
132 | 	}
133 | 
134 | 	jsonData, err := json.Marshal(reqBody)
135 | 	if err != nil {
136 | 		return "", fmt.Errorf("failed to marshal request: %v", err)
137 | 	}
138 | 
139 | 	req, err := http.NewRequest("POST", t.config.OpenAIEndpointURL+"/chat/completions", bytes.NewBuffer(jsonData))
140 | 	if err != nil {
141 | 		return "", fmt.Errorf("failed to create request: %v", err)
142 | 	}
143 | 
144 | 	req.Header.Set("Content-Type", "application/json")
145 | 	req.Header.Set("Authorization", "Bearer "+t.config.OpenAIAPIKey)
146 | 
147 | 	client := &http.Client{}
148 | 	resp, err := client.Do(req)
149 | 	if err != nil {
150 | 		return "", fmt.Errorf("failed to send request: %v", err)
151 | 	}
152 | 	defer resp.Body.Close()
153 | 
154 | 	body, err := io.ReadAll(resp.Body)
155 | 	if err != nil {
156 | 		return "", fmt.Errorf("failed to read response: %v", err)
157 | 	}
158 | 
159 | 	var response OpenAIResponse
160 | 	if err := json.Unmarshal(body, &response); err != nil {
161 | 		return "", fmt.Errorf("failed to parse response: %v\nResponse body: %s", err, string(body))
162 | 	}
163 | 
164 | 	if len(response.Choices) == 0 {
165 | 		return "", fmt.Errorf("no translation result\nResponse body: %s", string(body))
166 | 	}
167 | 
168 | 	// Get translated content
169 | 	translatedContent := response.Choices[0].Message.Content
170 | 
171 | 	// Remove special content blocks
172 | 	for _, pattern := range RegexPatterns {
173 | 		translatedContent = regexp.MustCompile(pattern.Pattern).ReplaceAllString(translatedContent, pattern.Replace)
174 | 	}
175 | 
176 | 	// Remove potential markdown code block markers
177 | 	translatedContent = strings.TrimPrefix(translatedContent, "\n")
178 | 
179 | 	// If formatting is enabled, format the translated content
180 | 	if t.format {
181 | 		formatter := markdownfmt.New(true)
182 | 		translatedContent = formatter.Format(translatedContent)
183 | 	}
184 | 
185 | 	return translatedContent, nil
186 | }
187 | 
188 | // removeFrontMatter removes front matter from content
189 | func removeFrontMatter(content string) string {
190 | 	// If content starts with ---, it may contain front matter
191 | 	trimmedContent := strings.TrimSpace(content)
192 | 	if strings.HasPrefix(trimmedContent, "---") {
193 | 		parts := strings.SplitN(trimmedContent, "---", 3)
194 | 		if len(parts) >= 3 {
195 | 			return strings.TrimSpace(parts[2])
196 | 		}
197 | 	}
198 | 	return content
199 | }
200 | 
201 | // ProcessFile handles translation of a single file
202 | func ProcessFile(srcPath, dstPath, targetLang string, cfg *config.Config, format bool, force bool) error {
203 | 	t := New(cfg, format)
204 | 
205 | 	// Check if target path is a directory
206 | 	dstInfo, err := os.Stat(dstPath)
207 | 	if err == nil && dstInfo.IsDir() {
208 | 		dstPath = filepath.Join(dstPath, filepath.Base(srcPath))
209 | 	}
210 | 
211 | 	// Check if target file already exists
212 | 	if _, err := os.Stat(dstPath); err == nil {
213 | 		dstContent, err := os.ReadFile(dstPath)
214 | 		if err != nil {
215 | 			return fmt.Errorf("failed to read target file: %v", err)
216 | 		}
217 | 
218 | 		// Check if already translated
219 | 		var dstFrontMatter map[string]interface{}
220 | 		if strings.HasPrefix(string(dstContent), "---\n") {
221 | 			parts := strings.SplitN(string(dstContent)[4:], "\n---\n", 2)
222 | 			if len(parts) == 2 {
223 | 				if err := yaml.Unmarshal([]byte(parts[0]), &dstFrontMatter); err != nil {
224 | 					return fmt.Errorf("failed to parse target file front matter: %v", err)
225 | 				}
226 | 				if translated, ok := dstFrontMatter["translated"].(bool); ok && translated {
227 | 					if !force {
228 | 						fmt.Printf("Skipping %s (already translated, use -F to force translate)\n", srcPath)
229 | 						return nil
230 | 					}
231 | 					fmt.Printf("Force translating %s\n", srcPath)
232 | 				}
233 | 			}
234 | 		}
235 | 	}
236 | 
237 | 	// Read source file content
238 | 	content, err := os.ReadFile(srcPath)
239 | 	if err != nil {
240 | 		return fmt.Errorf("failed to read source file: %v", err)
241 | 	}
242 | 
243 | 	// Parse front matter
244 | 	var frontMatter map[string]interface{}
245 | 	contentToTranslate := string(content)
246 | 
247 | 	// Check and parse front matter
248 | 	if strings.HasPrefix(contentToTranslate, "---\n") {
249 | 		parts := strings.SplitN(contentToTranslate[4:], "\n---\n", 2)
250 | 		if len(parts) == 2 {
251 | 			if err := yaml.Unmarshal([]byte(parts[0]), &frontMatter); err != nil {
252 | 				return fmt.Errorf("failed to parse front matter: %v", err)
253 | 			}
254 | 			contentToTranslate = parts[1]
255 | 		}
256 | 	}
257 | 
258 | 	// Translate content
259 | 	translatedContent, err := t.TranslateContent(contentToTranslate, targetLang)
260 | 	if err != nil {
261 | 		return fmt.Errorf("failed to translate content: %v", err)
262 | 	}
263 | 
264 | 	// Update front matter
265 | 	if frontMatter == nil {
266 | 		frontMatter = make(map[string]interface{})
267 | 	}
268 | 	frontMatter["translated"] = true
269 | 
270 | 	// Generate new file content
271 | 	frontMatterBytes, err := yaml.Marshal(frontMatter)
272 | 	if err != nil {
273 | 		return fmt.Errorf("failed to marshal front matter: %v", err)
274 | 	}
275 | 
276 | 	newContent := fmt.Sprintf("---\n%s---\n\n%s", string(frontMatterBytes), translatedContent)
277 | 
278 | 	// Create target directory if it doesn't exist
279 | 	if err := os.MkdirAll(filepath.Dir(dstPath), 0755); err != nil {
280 | 		return fmt.Errorf("failed to create target directory: %v", err)
281 | 	}
282 | 
283 | 	// Write translated content to target file
284 | 	if err := os.WriteFile(dstPath, []byte(newContent), 0644); err != nil {
285 | 		return fmt.Errorf("failed to write target file: %v", err)
286 | 	}
287 | 
288 | 	return nil
289 | }
290 | 
291 | // ProcessDirectory processes all markdown files in the directory
292 | func ProcessDirectory(srcDir, dstDir string, targetLang string, cfg *config.Config, force bool, format bool) error {
293 | 	// First calculate the total number of files to process
294 | 	var total int
295 | 	err := filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
296 | 		if err != nil {
297 | 			return err
298 | 		}
299 | 		if !info.IsDir() && filepath.Ext(path) == ".md" {
300 | 			total++
301 | 		}
302 | 		return nil
303 | 	})
304 | 	if err != nil {
305 | 		return fmt.Errorf("failed to count files: %v", err)
306 | 	}
307 | 
308 | 	fmt.Printf("Found %d markdown files to translate\n", total)
309 | 
310 | 	// Create translator instance
311 | 	t := New(cfg, format)
312 | 	current := 0
313 | 
314 | 	// Walk through source directory
315 | 	return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
316 | 		if err != nil {
317 | 			return err
318 | 		}
319 | 
320 | 		// Skip directories
321 | 		if info.IsDir() {
322 | 			return nil
323 | 		}
324 | 
325 | 		// Only process markdown files
326 | 		ext := filepath.Ext(path)
327 | 		if ext != ".md" {
328 | 			return nil
329 | 		}
330 | 
331 | 		current++
332 | 
333 | 		// Get relative path
334 | 		relPath, err := filepath.Rel(srcDir, path)
335 | 		if err != nil {
336 | 			return fmt.Errorf("failed to get relative path: %v", err)
337 | 		}
338 | 
339 | 		var dstPath string
340 | 		if dstDir == "" {
341 | 			// If target directory is empty, create translation file in source directory
342 | 			dir := filepath.Dir(path)
343 | 			base := filepath.Base(path)
344 | 			nameWithoutExt := strings.TrimSuffix(base, ext)
345 | 			dstPath = filepath.Join(dir, nameWithoutExt+"_"+targetLang+ext)
346 | 		} else {
347 | 			// If a different target directory is specified, use the specified directory structure
348 | 			dstPath = filepath.Join(dstDir, relPath)
349 | 		}
350 | 
351 | 		t.progress(Progress{
352 | 			Total:      total,
353 | 			Current:    current,
354 | 			SourceFile: path,
355 | 			TargetFile: dstPath,
356 | 		})
357 | 
358 | 		// Process file
359 | 		if err := ProcessFile(path, dstPath, targetLang, cfg, format, force); err != nil {
360 | 			return fmt.Errorf("failed to process file %s: %v", path, err)
361 | 		}
362 | 
363 | 		return nil
364 | 	})
365 | }
366 | 


--------------------------------------------------------------------------------