├── mdctl.png
├── main.go
├── action.yml
├── go.mod
├── .gitignore
├── .github
└── workflows
│ ├── update-homebrew.yml
│ ├── release.yml
│ ├── pr-review.yml
│ ├── idoc.yml
│ └── docker-build.yml
├── .goreleaser.yaml
├── LICENSE
├── Dockerfile
├── cmd
├── download.go
├── root.go
├── llmstxt.go
├── translate.go
├── export.go
├── lint.go
└── upload.go
├── internal
├── storage
│ └── provider.go
├── llmstxt
│ ├── formatter.go
│ ├── fetcher.go
│ ├── generator.go
│ ├── extractor.go
│ └── sitemap.go
├── exporter
│ ├── sitereader
│ │ ├── reader.go
│ │ └── mkdocs.go
│ ├── heading.go
│ ├── exporter.go
│ ├── merger.go
│ └── pandoc.go
├── linter
│ ├── config.go
│ ├── fixer.go
│ ├── linter.go
│ ├── linter_test.go
│ └── rules_test.go
├── cache
│ └── cache.go
├── markdownfmt
│ └── formatter.go
├── processor
│ └── processor.go
├── config
│ └── config.go
└── translator
│ └── translator.go
├── docs
├── DEVELOPMENT.md
└── features
│ ├── export.md
│ └── upload.md
├── README.md
├── go.sum
└── Makefile
/mdctl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samzong/mdctl/HEAD/mdctl.png
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/samzong/mdctl/cmd"
5 | )
6 |
7 | func main() {
8 | 	cmd.Execute() // Delegate to the cobra root command (cmd/root.go); it parses flags and dispatches subcommands.
9 | }
10 |
--------------------------------------------------------------------------------
/action.yml:
--------------------------------------------------------------------------------
1 | name: "mdctl CLI"
2 | description: "Run mdctl (Markdown docs toolkit) in GitHub Actions via Docker."
3 | author: "samzong"
4 | branding:
5 | icon: "book"
6 | color: "blue"
7 |
8 | inputs:
9 | args:
10 | description: "Arguments to pass to mdctl (e.g., \"export -f README.md -o out.docx\")."
11 | required: false
12 | default: "--help"
13 |
14 | runs:
15 | using: "docker"
16 | image: "Dockerfile"
17 | entrypoint: "/bin/sh"
18 | args:
19 | - -c
20 | - mdctl ${{ inputs.args }}
21 |
22 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/samzong/mdctl
2 |
3 | go 1.23.4
4 |
5 | require (
6 | github.com/PuerkitoBio/goquery v1.9.1
7 | github.com/aws/aws-sdk-go v1.55.6
8 | github.com/gobwas/glob v0.2.3
9 | github.com/spf13/cobra v1.8.1
10 | golang.org/x/text v0.23.0
11 | gopkg.in/yaml.v3 v3.0.1
12 | )
13 |
14 | require (
15 | github.com/andybalholm/cascadia v1.3.2 // indirect
16 | github.com/inconshreveable/mousetrap v1.1.0 // indirect
17 | github.com/jmespath/go-jmespath v0.4.0 // indirect
18 | github.com/spf13/pflag v1.0.5 // indirect
19 | golang.org/x/net v0.33.0 // indirect
20 | )
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 | mdctl
8 | bin/
9 |
10 | # Test binary, built with `go test -c`
11 | *.test
12 |
13 | # Output of the go coverage tool, specifically when used with LiteIDE
14 | *.out
15 |
16 | # Dependency directories (remove the comment below to include it)
17 | vendor/
18 |
19 | # Go workspace file
20 | go.work
21 |
22 | # IDE specific files
23 | .idea/
24 | .vscode/
25 | *.swp
26 | *.swo
27 |
28 | # OS generated files
29 | .DS_Store
30 | .DS_Store?
31 | ._*
32 | .Spotlight-V100
33 | .Trashes
34 | ehthumbs.db
35 | Thumbs.db
36 |
37 | # Project specific
38 | images/
39 | dist/
40 | *.docx
41 | *.pdf
--------------------------------------------------------------------------------
/.github/workflows/update-homebrew.yml:
--------------------------------------------------------------------------------
1 | name: Update Homebrew Tap
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | tag:
7 | description: 'Select the tag to update Homebrew'
8 | required: true
9 | type: string
10 | repository_dispatch:
11 | types: [trigger-homebrew-update]
12 |
13 | jobs:
14 | update-homebrew:
15 | runs-on: macos-latest
16 | steps:
17 | - name: Set version
18 | run: |
19 | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
20 | echo "VERSION=${{ inputs.tag }}" >> $GITHUB_ENV
21 | else
22 | echo "VERSION=${{ github.event.client_payload.version }}" >> $GITHUB_ENV
23 | fi
24 |
25 | - name: Checkout repository
26 | uses: actions/checkout@v4
27 | with:
28 | fetch-depth: 0
29 |
30 | - name: Update Homebrew Formula
31 | env:
32 | GH_PAT: ${{ secrets.GH_PAT }}
33 | run: make update-homebrew
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*'
7 |
8 | permissions:
9 | contents: write
10 | packages: write
11 |
12 | jobs:
13 | goreleaser:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - name: Checkout
17 | uses: actions/checkout@v4
18 | with:
19 | fetch-depth: 0
20 |
21 | - name: Set up Go
22 | uses: actions/setup-go@v4
23 | with:
24 | go-version: '>=1.21.0'
25 | cache: true
26 |
27 | - name: Run GoReleaser
28 | uses: goreleaser/goreleaser-action@v5
29 | with:
30 | distribution: goreleaser
31 | version: latest
32 | args: release --clean
33 | env:
34 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
35 |
36 | - name: Trigger Homebrew Update
37 | if: success()
38 | uses: peter-evans/repository-dispatch@v2
39 | with:
40 | token: ${{ secrets.GH_PAT }}
41 | event-type: trigger-homebrew-update
42 |           client-payload: '{"version": "${{ github.ref_name }}"}'
--------------------------------------------------------------------------------
/.github/workflows/pr-review.yml:
--------------------------------------------------------------------------------
1 | name: PR Review
2 |
3 | on:
4 | pull_request:
5 | types: [opened, synchronize, reopened]
6 | paths-ignore:
7 | - '**.md'
8 | - 'docs/**'
9 | - '.gitignore'
10 |
11 | jobs:
12 | review:
13 | name: Build & Test
14 | runs-on: ubuntu-latest
15 | steps:
16 | - name: Checkout code
17 | uses: actions/checkout@v4
18 | with:
19 | fetch-depth: 0
20 |
21 | - name: Set up Go
22 | uses: actions/setup-go@v5
23 | with:
24 |           go-version: '1.23'
25 | cache: true
26 |
27 | - name: Install dependencies
28 | run: make deps
29 |
30 | - name: Format check
31 | run: |
32 | make fmt
33 | git diff --exit-code || (echo "Code is not formatted. Please run 'make fmt'" && exit 1)
34 |
35 | - name: Run tests
36 | run: make test
37 |
38 | - name: Build
39 | run: make build
40 |
41 | - name: Upload artifact
42 | uses: actions/upload-artifact@v4
43 | with:
44 | name: mdctl
45 | path: bin/mdctl
--------------------------------------------------------------------------------
/.goreleaser.yaml:
--------------------------------------------------------------------------------
1 | before:
2 | hooks:
3 | - go mod tidy
4 |
5 | builds:
6 | - env:
7 | - CGO_ENABLED=0
8 | goos:
9 | - linux
10 | - windows
11 | - darwin
12 | goarch:
13 | - amd64
14 | - arm64
15 | ignore:
16 | - goos: windows
17 | goarch: arm64
18 | ldflags:
19 | - -s -w -X github.com/samzong/mdctl/cmd.Version={{.Version}} -X github.com/samzong/mdctl/cmd.BuildTime={{.Date}}
20 | binary: mdctl
21 |
22 | archives:
23 | - format: tar.gz
24 | name_template: >-
25 | {{ .ProjectName }}_
26 | {{- title .Os }}_
27 | {{- if eq .Arch "amd64" }}x86_64
28 | {{- else if eq .Arch "386" }}i386
29 | {{- else }}{{ .Arch }}{{ end }}
30 | {{- if .Arm }}v{{ .Arm }}{{ end }}
31 | format_overrides:
32 | - goos: windows
33 | format: zip
34 |
35 | changelog:
36 | sort: asc
37 | filters:
38 | exclude:
39 | - '^docs:'
40 | - '^test:'
41 | - '^ci:'
42 | - '^chore:'
43 |
44 | checksum:
45 | name_template: 'checksums.txt'
46 |
47 | snapshot:
48 | name_template: "{{ incpatch .Version }}-next"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 samzong
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.23-alpine AS builder
2 |
3 | WORKDIR /app
4 |
5 | # Copy go mod and sum files
6 | COPY go.mod go.sum ./
7 |
8 | # Download dependencies
9 | RUN go mod download
10 |
11 | # Copy source code
12 | COPY . .
13 |
14 | # Install git for version information
15 | RUN apk add --no-cache git
16 |
17 | # Set build arguments with defaults
18 | ARG VERSION=dev
19 | ARG BUILD_TIME
20 |
21 | # Set default build time if not provided
22 | RUN if [ -z "$BUILD_TIME" ]; then BUILD_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ"); fi && \
23 | echo "Building version: $VERSION, build time: $BUILD_TIME" && \
24 | CGO_ENABLED=0 go build -trimpath -ldflags "-s -w -X github.com/samzong/mdctl/cmd.Version=${VERSION} -X github.com/samzong/mdctl/cmd.BuildTime=${BUILD_TIME}" -o /app/bin/mdctl
25 |
26 | # Use a minimal alpine image for the final stage
27 | FROM alpine:3.19
28 |
29 | # Install ca-certificates for HTTPS requests
30 | RUN apk --no-cache add ca-certificates
31 |
32 | WORKDIR /root/
33 |
34 | # Copy the binary from the builder stage
35 | COPY --from=builder /app/bin/mdctl /usr/local/bin/mdctl
36 |
37 | # Create config directory
38 | RUN mkdir -p /root/.config/mdctl
39 |
40 | # Set the entrypoint
41 | ENTRYPOINT ["mdctl"]
42 |
43 | # Default command
44 | CMD ["--help"]
45 |
--------------------------------------------------------------------------------
/cmd/download.go:
--------------------------------------------------------------------------------
1 | package cmd
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/samzong/mdctl/internal/processor"
7 |
8 | "github.com/spf13/cobra"
9 | )
10 |
11 | var (
12 | 	sourceFile     string // -f: single markdown file to process
13 | 	sourceDir      string // -d: directory of markdown files to process (mutually exclusive with -f)
14 | 	imageOutputDir string // -o: optional destination directory for downloaded images
15 |
16 | 	// downloadCmd downloads remote images referenced in markdown files and
17 | 	// rewrites the references to point at the local copies.
18 | 	downloadCmd = &cobra.Command{
19 | 		Use:   "download",
20 | 		Short: "Download remote images in markdown files",
21 | 		Long: `Download remote images in markdown files to local storage and update references.
22 | Examples:
23 |   mdctl download -f post.md
24 |   mdctl download -d content/posts
25 |   mdctl download -f post.md -o assets/images`,
26 | 		RunE: func(cmd *cobra.Command, args []string) error {
27 | 			// Exactly one of -f / -d must be supplied.
28 | 			if sourceFile == "" && sourceDir == "" {
29 | 				return fmt.Errorf("either source file (-f) or source directory (-d) must be specified")
30 | 			}
31 | 			if sourceFile != "" && sourceDir != "" {
32 | 				return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)")
33 | 			}
34 |
35 | 			p := processor.New(sourceFile, sourceDir, imageOutputDir)
36 | 			return p.Process()
37 | 		},
38 | 	}
39 | )
40 |
41 | // init wires up the download command's flags; the command itself is
42 | // registered on rootCmd in cmd/root.go.
43 | func init() {
44 | 	downloadCmd.Flags().StringVarP(&sourceFile, "file", "f", "", "Source markdown file to process")
45 | 	downloadCmd.Flags().StringVarP(&sourceDir, "dir", "d", "", "Source directory containing markdown files to process")
46 | 	downloadCmd.Flags().StringVarP(&imageOutputDir, "output", "o", "", "Output directory for downloaded images (optional)")
47 | }
43 |
--------------------------------------------------------------------------------
/cmd/root.go:
--------------------------------------------------------------------------------
1 | package cmd
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/spf13/cobra"
8 | )
9 |
10 | var (
11 | 	Version   = "dev"     // Overridden at link time via -ldflags -X (see .goreleaser.yaml / Dockerfile).
12 | 	BuildTime = "unknown" // Overridden at link time via -ldflags -X.
13 | 	verbose     bool // global --verbose / -v flag
14 | 	veryVerbose bool // global --vv flag
15 |
16 | 	rootCmd = &cobra.Command{
17 | 		Use:   "mdctl",
18 | 		Short: "A CLI tool for markdown file operations",
19 | 		Long: `mdctl is a CLI tool that helps you manage and process markdown files.
20 | Currently supports downloading remote images and more features to come.`,
21 | 		Version: fmt.Sprintf("%s (built at %s)", Version, BuildTime), // Evaluated at package init, after the linker has applied any -X overrides.
22 | 	}
23 | )
24 |
25 | func Execute() {
26 | if err := rootCmd.Execute(); err != nil {
27 | fmt.Println(err)
28 | os.Exit(1)
29 | }
30 | }
31 |
32 | func init() {
33 | 	// Register all subcommands on the root command.
34 | 	rootCmd.AddCommand(translateCmd)
35 | 	rootCmd.AddCommand(downloadCmd)
36 | 	rootCmd.AddCommand(configCmd)
37 | 	rootCmd.AddCommand(uploadCmd)
38 | 	rootCmd.AddCommand(exportCmd)
39 | 	rootCmd.AddCommand(llmstxtCmd)
40 | 	rootCmd.AddCommand(lintCmd)
41 |
42 | 	// Global flags inherited by every subcommand.
43 | 	rootCmd.PersistentFlags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output")
44 | 	rootCmd.PersistentFlags().BoolVar(&veryVerbose, "vv", false, "Enable very verbose output with detailed information") // long-form only: cobra shorthands must be a single letter
45 |
46 | 	// Declare the help-output groups that commands are sorted into.
47 | 	rootCmd.AddGroup(&cobra.Group{
48 | 		ID:    "core",
49 | 		Title: "Core Commands:",
50 | 	})
51 | 	rootCmd.AddGroup(&cobra.Group{
52 | 		ID:    "config",
53 | 		Title: "Configuration Commands:",
54 | 	})
55 |
56 | 	// Assign each command to its help group.
57 | 	translateCmd.GroupID = "core"
58 | 	downloadCmd.GroupID = "core"
59 | 	uploadCmd.GroupID = "core"
60 | 	exportCmd.GroupID = "core"
61 | 	llmstxtCmd.GroupID = "core"
62 | 	lintCmd.GroupID = "core"
63 | 	configCmd.GroupID = "config"
64 | }
65 |
--------------------------------------------------------------------------------
/internal/storage/provider.go:
--------------------------------------------------------------------------------
1 | package storage
2 |
3 | import (
4 | "github.com/samzong/mdctl/internal/config"
5 | )
6 |
7 | // Provider defines the interface for storage providers (e.g. S3-compatible backends).
8 | type Provider interface {
9 | 	// Upload uploads a local file to remotePath with the given metadata.
10 | 	Upload(localPath, remotePath string, metadata map[string]string) (string, error)
11 |
12 | 	// Configure sets up the provider with the given cloud configuration.
13 | 	Configure(config config.CloudConfig) error
14 |
15 | 	// GetPublicURL returns the public URL for a remote path.
16 | 	GetPublicURL(remotePath string) string
17 |
18 | 	// ObjectExists checks if an object exists in the storage.
19 | 	ObjectExists(remotePath string) (bool, error)
20 |
21 | 	// CompareHash reports whether localHash matches the remote object's hash.
22 | 	CompareHash(remotePath, localHash string) (bool, error)
23 |
24 | 	// SetObjectMetadata sets metadata for an object.
25 | 	SetObjectMetadata(remotePath string, metadata map[string]string) error
26 |
27 | 	// GetObjectMetadata retrieves metadata for an object.
28 | 	GetObjectMetadata(remotePath string) (map[string]string, error)
29 | }
30 |
31 | // ProviderFactory is a function that creates a new storage provider.
32 | type ProviderFactory func() Provider
33 |
34 | var providers = make(map[string]ProviderFactory) // Registry of factories. Access is not synchronized — presumably only written during package init; TODO confirm.
35 |
36 | // RegisterProvider registers a storage provider factory
37 | func RegisterProvider(name string, factory ProviderFactory) {
38 | providers[name] = factory
39 | }
40 |
41 | // GetProvider returns a storage provider by name
42 | func GetProvider(name string) (Provider, bool) {
43 | factory, exists := providers[name]
44 | if !exists {
45 | return nil, false
46 | }
47 | return factory(), true
48 | }
49 |
50 | // ListProviders returns a list of available provider names
51 | func ListProviders() []string {
52 | var names []string
53 | for name := range providers {
54 | names = append(names, name)
55 | }
56 | return names
57 | }
58 |
--------------------------------------------------------------------------------
/internal/llmstxt/formatter.go:
--------------------------------------------------------------------------------
1 | package llmstxt
2 |
3 | import (
4 | "strings"
5 | "unicode"
6 | )
7 |
8 | // Format to Markdown content
9 | func (g *Generator) formatContent(sections map[string][]PageInfo) string {
10 | var buf strings.Builder
11 |
12 | // Get sorted section list
13 | sectionNames := g.getSortedSections(sections)
14 |
15 | // Find root page info
16 | var rootPage PageInfo
17 | if rootPages, ok := sections["ROOT"]; ok && len(rootPages) > 0 {
18 | rootPage = rootPages[0]
19 | }
20 |
21 | // Add document title
22 | buf.WriteString("# ")
23 | buf.WriteString(rootPage.Title)
24 | buf.WriteString("\n\n")
25 |
26 | // Add document description
27 | buf.WriteString("> ")
28 | buf.WriteString(rootPage.Description)
29 | buf.WriteString("\n\n")
30 |
31 | // Handle each section
32 | for _, section := range sectionNames {
33 | // Skip ROOT section, because it's already used for title and description
34 | if section == "ROOT" {
35 | continue
36 | }
37 |
38 | // Add section title
39 | buf.WriteString("## ")
40 | buf.WriteString(capitalizeString(section))
41 | buf.WriteString("\n\n")
42 |
43 | // Add page info for each page in section
44 | for _, page := range sections[section] {
45 | buf.WriteString("- [")
46 | buf.WriteString(page.Title)
47 | buf.WriteString("](")
48 | buf.WriteString(page.URL)
49 | buf.WriteString("): ")
50 | buf.WriteString(page.Description)
51 | buf.WriteString("\n")
52 |
53 | // Add page content in full mode
54 | if g.config.FullMode && page.Content != "" {
55 | buf.WriteString("\n")
56 | buf.WriteString(page.Content)
57 | buf.WriteString("\n")
58 | }
59 |
60 | buf.WriteString("\n")
61 | }
62 | }
63 |
64 | return buf.String()
65 | }
66 |
// capitalizeString returns str with its first rune upper-cased and all
// remaining runes lower-cased. The empty string is returned unchanged.
func capitalizeString(str string) string {
	if len(str) == 0 {
		return ""
	}
	r := []rune(str)
	head := string(unicode.ToUpper(r[0]))
	tail := strings.ToLower(string(r[1:]))
	return head + tail
}
76 |
--------------------------------------------------------------------------------
/.github/workflows/idoc.yml:
--------------------------------------------------------------------------------
1 | # 📖 Simple document generation tool! Dependence Node.js run.
2 | # https://github.com/jaywcjlove/idoc
3 |
4 | name: idoc
5 | on:
6 | push:
7 | branches:
8 | - main
9 |
10 | jobs:
11 | build-deploy:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 | - uses: actions/setup-node@v4
16 | with:
17 | node-version: 20
18 | registry-url: "https://registry.npmjs.org"
19 |
20 | - name: Create idoc config.
21 | run: |
22 | cat > idoc.yml << EOF
23 | site: mdctl
24 | description: A command-line tool for processing Markdown files. Currently, it supports automatically downloading remote images to local storage and updating the image references in Markdown files, as well as translating markdown files using AI models.
25 | keywords: Markdown processor,CLI tool,Image downloader,Markdown translator,AI translation,Markdown automation,Remote image handling,Markdown utilities,AI-powered Markdown,Markdown enhancement,Markdown file management
26 | favicon: assets/favicon.ico
27 | logo: assets/icon.png
28 |
29 | openSource: https://github.com/samzong/mdctl
30 |
31 | tocs: false
32 |
33 | element:
34 | wrapper: style=max-width:720px;
35 |
36 | menus:
37 | Home: index.html
38 | About:
39 | url: https://github.com/samzong
40 | target: __blank
41 | sideEffectFiles:
42 | - README_zh.md
43 |
44 | cacheFileStat: true
45 |
46 | footer: |
47 | Copyright © {{idocYear}} samzong
48 | EOF
49 |
50 | - run: npm install idoc@1 -g
51 | - run: idoc
52 |
53 | - name: Deploy
54 | uses: peaceiris/actions-gh-pages@v4
55 | if: github.ref == 'refs/heads/main'
56 | with:
57 | github_token: ${{ secrets.GITHUB_TOKEN }}
58 | publish_dir: ./dist
59 |
--------------------------------------------------------------------------------
/internal/exporter/sitereader/reader.go:
--------------------------------------------------------------------------------
1 | package sitereader
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "os"
8 | "path/filepath"
9 | )
10 |
11 | // SiteReader is the interface a site-type reader (MkDocs, Hugo, ...) implements.
12 | type SiteReader interface {
13 | 	// Detect reports whether dir looks like this reader's site type.
14 | 	Detect(dir string) bool
15 |
16 | 	// ReadStructure reads the site layout from dir using configPath and returns the ordered list of content files.
17 | 	// navPath selects a single navigation subtree to export; empty exports all.
18 | 	ReadStructure(dir string, configPath string, navPath string) ([]string, error)
19 | }
20 |
21 | // GetSiteReader Return the appropriate reader based on site type
22 | func GetSiteReader(siteType string, verbose bool, logger *log.Logger) (SiteReader, error) {
23 | // If no logger is provided, create a default one
24 | if logger == nil {
25 | if verbose {
26 | logger = log.New(os.Stdout, "[SITE-READER] ", log.LstdFlags)
27 | } else {
28 | logger = log.New(io.Discard, "", 0)
29 | }
30 | }
31 |
32 | logger.Printf("Creating site reader for type: %s", siteType)
33 |
34 | switch siteType {
35 | case "mkdocs":
36 | logger.Println("Using MkDocs site reader")
37 | return &MkDocsReader{Logger: logger}, nil
38 | case "hugo":
39 | logger.Println("Hugo site type is not yet implemented")
40 | return nil, fmt.Errorf("hugo site type is not yet implemented")
41 | case "docusaurus":
42 | logger.Println("Docusaurus site type is not yet implemented")
43 | return nil, fmt.Errorf("docusaurus site type is not yet implemented")
44 | default:
45 | logger.Printf("Unsupported site type: %s", siteType)
46 | return nil, fmt.Errorf("unsupported site type: %s", siteType)
47 | }
48 | }
49 |
// FindConfigFile returns the path of the first file among configNames
// that exists in dir. When configNames is empty, the defaults
// "config.yml" and "config.yaml" are tried. An error is returned when
// none of the candidates exist.
func FindConfigFile(dir string, configNames []string) (string, error) {
	candidates := configNames
	if len(candidates) == 0 {
		candidates = []string{"config.yml", "config.yaml"}
	}

	for _, name := range candidates {
		candidate := filepath.Join(dir, name)
		if _, err := os.Stat(candidate); err == nil {
			return candidate, nil
		}
	}

	return "", fmt.Errorf("no config file found in %s", dir)
}
67 |
--------------------------------------------------------------------------------
/cmd/llmstxt.go:
--------------------------------------------------------------------------------
1 | package cmd
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/samzong/mdctl/internal/llmstxt"
8 | "github.com/spf13/cobra"
9 | )
10 |
11 | var (
12 | 	includePaths []string // -i: glob patterns for paths to include
13 | 	excludePaths []string // -e: glob patterns for paths to exclude
14 | 	outputPath   string   // -o: output file; empty writes to stdout
15 | 	fullMode     bool     // -f: also extract each page's content
16 | 	concurrency  int      // -c: number of concurrent requests
17 | 	timeout      int      // --timeout: per-request timeout in seconds
18 | 	maxPages     int      // --max-pages: cap on pages processed; 0 means unlimited
19 |
20 | 	// llmstxtCmd generates an llms.txt document from a site's sitemap.xml.
21 | 	llmstxtCmd = &cobra.Command{
22 | 		Use:   "llmstxt [url]",
23 | 		Short: "Generate llms.txt from sitemap.xml",
24 | 		Long: `Generate a llms.txt file from a website's sitemap.xml. This file is a curated
25 | list of the website's pages in markdown format, perfect for training or fine-tuning
26 | language models.
27 |
28 | In standard mode, only title and description are extracted. In full mode (-f flag),
29 | the content of each page is also extracted.
30 |
31 | Examples:
32 |   # Standard mode
33 |   mdctl llmstxt https://example.com/sitemap.xml > llms.txt
34 |
35 |   # Full-content mode
36 |   mdctl llmstxt -f https://example.com/sitemap.xml > llms-full.txt`,
37 | 		Args: cobra.ExactArgs(1),
38 | 		RunE: func(cmd *cobra.Command, args []string) error {
39 | 			sitemapURL := args[0]
40 |
41 | 			// Bundle the CLI flags into the generator configuration.
42 | 			config := llmstxt.GeneratorConfig{
43 | 				SitemapURL:   sitemapURL,
44 | 				IncludePaths: includePaths,
45 | 				ExcludePaths: excludePaths,
46 | 				FullMode:     fullMode,
47 | 				Concurrency:  concurrency,
48 | 				Timeout:      timeout,
49 | 				UserAgent:    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
50 | 				Verbose:      verbose,     // global -v flag (cmd/root.go)
51 | 				VeryVerbose:  veryVerbose, // global --vv flag (cmd/root.go)
52 | 				MaxPages:     maxPages,
53 | 			}
54 |
55 | 			generator := llmstxt.NewGenerator(config)
56 |
57 | 			// Parse the sitemap, fetch pages, and render the document.
58 | 			content, err := generator.Generate()
59 | 			if err != nil {
60 | 				return err
61 | 			}
62 |
63 | 			// Write the result to the chosen destination.
64 | 			if outputPath == "" {
65 | 				// Output to standard output
66 | 				fmt.Println(content)
67 | 			} else {
68 | 				// Output to file
69 | 				return os.WriteFile(outputPath, []byte(content), 0644)
70 | 			}
71 |
72 | 			return nil
73 | 		},
74 | 	}
75 | )
75 |
76 | func init() {
77 | llmstxtCmd.Flags().StringVarP(&outputPath, "output", "o", "", "Output file path (default: stdout)")
78 | llmstxtCmd.Flags().StringSliceVarP(&includePaths, "include-path", "i", []string{}, "Glob patterns for paths to include (can be specified multiple times)")
79 | llmstxtCmd.Flags().StringSliceVarP(&excludePaths, "exclude-path", "e", []string{}, "Glob patterns for paths to exclude (can be specified multiple times)")
80 | llmstxtCmd.Flags().BoolVarP(&fullMode, "full", "f", false, "Enable full-content mode (extract page content)")
81 | llmstxtCmd.Flags().IntVarP(&concurrency, "concurrency", "c", 5, "Number of concurrent requests")
82 | llmstxtCmd.Flags().IntVar(&timeout, "timeout", 30, "Request timeout in seconds")
83 | llmstxtCmd.Flags().IntVar(&maxPages, "max-pages", 0, "Maximum number of pages to process (0 for unlimited)")
84 |
85 | // Add command to core group
86 | llmstxtCmd.GroupID = "core"
87 |
88 | rootCmd.AddCommand(llmstxtCmd)
89 | }
90 |
--------------------------------------------------------------------------------
/internal/llmstxt/fetcher.go:
--------------------------------------------------------------------------------
1 | package llmstxt
2 |
3 | import (
4 | "fmt"
5 | "net/http"
6 | "sync"
7 | "time"
8 | )
9 |
10 | // Fetch pages concurrently using a worker pool
11 | func (g *Generator) fetchPages(urls []string) ([]PageInfo, error) {
12 | g.logger.Printf("Starting to fetch %d pages with concurrency %d", len(urls), g.config.Concurrency)
13 |
14 | // Create result and error channels
15 | resultChan := make(chan PageInfo, len(urls))
16 | errorChan := make(chan error, len(urls))
17 |
18 | // Create work channel, controlling concurrency
19 | workChan := make(chan string, len(urls))
20 |
21 | // Start worker pool
22 | var wg sync.WaitGroup
23 | for i := 0; i < g.config.Concurrency; i++ {
24 | wg.Add(1)
25 | go func() {
26 | defer wg.Done()
27 | for urlStr := range workChan {
28 | pageInfo, err := g.fetchPageContent(urlStr)
29 | if err != nil {
30 | g.logger.Printf("Warning: failed to fetch page %s: %v", urlStr, err)
31 | errorChan <- fmt.Errorf("failed to fetch page %s: %w", urlStr, err)
32 | continue
33 | }
34 | resultChan <- pageInfo
35 | }
36 | }()
37 | }
38 |
39 | // Send all URLs to work channel
40 | for _, urlStr := range urls {
41 | workChan <- urlStr
42 | }
43 | close(workChan)
44 |
45 | // Wait for all work to finish
46 | wg.Wait()
47 | close(resultChan)
48 | close(errorChan)
49 |
50 | // Collect results
51 | var results []PageInfo
52 | for result := range resultChan {
53 | results = append(results, result)
54 | g.logger.Printf("Fetched page: %s", result.URL)
55 | }
56 |
57 | // Check for errors (don't interrupt processing, just log warnings)
58 | for err := range errorChan {
59 | g.logger.Printf("Warning: %v", err)
60 | }
61 |
62 | g.logger.Printf("Successfully fetched %d/%d pages", len(results), len(urls))
63 |
64 | return results, nil
65 | }
66 |
67 | // fetchPageContent downloads a single page and extracts its PageInfo.
68 | // Non-200 responses and transport errors are returned as errors.
69 | func (g *Generator) fetchPageContent(urlStr string) (PageInfo, error) {
70 | 	// NOTE(review): a fresh http.Client is built per call, so TCP/TLS connections are never reused across pages; consider hoisting a shared client onto Generator.
71 | 	client := &http.Client{
72 | 		Timeout: time.Duration(g.config.Timeout) * time.Second,
73 | 	}
74 |
75 | 	// Build request
76 | 	req, err := http.NewRequest("GET", urlStr, nil)
77 | 	if err != nil {
78 | 		return PageInfo{}, fmt.Errorf("failed to create request: %w", err)
79 | 	}
80 |
81 | 	// Use the configured User-Agent (some sites block default Go clients).
82 | 	req.Header.Set("User-Agent", g.config.UserAgent)
83 |
84 | 	// Send request
85 | 	start := time.Now()
86 | 	resp, err := client.Do(req)
87 | 	if err != nil {
88 | 		return PageInfo{}, fmt.Errorf("failed to fetch page: %w", err)
89 | 	}
90 | 	defer resp.Body.Close()
91 |
92 | 	if resp.StatusCode != http.StatusOK {
93 | 		return PageInfo{}, fmt.Errorf("failed to fetch page, status code: %d", resp.StatusCode)
94 | 	}
95 |
96 | 	// Extract title/description (and content in full mode) from the response body.
97 | 	pageInfo, err := g.extractPageInfo(urlStr, resp)
98 | 	if err != nil {
99 | 		return PageInfo{}, fmt.Errorf("failed to extract page info: %w", err)
100 | 	}
101 |
102 | 	// Record timing information
103 | 	elapsed := time.Since(start).Round(time.Millisecond)
104 | 	g.logger.Printf("Fetched %s in %v", urlStr, elapsed)
105 |
106 | 	return pageInfo, nil
107 | }
107 |
--------------------------------------------------------------------------------
/docs/DEVELOPMENT.md:
--------------------------------------------------------------------------------
1 | # mdctl 开发者指南
2 |
3 | ## 项目介绍
4 |
5 | mdctl 是一个用于处理 Markdown 文件的命令行工具,主要功能包括:
6 |
7 | 1. **下载功能**:自动下载 Markdown 文件中的远程图片到本地,并更新引用路径
8 | 2. **翻译功能**:使用 AI 模型将 Markdown 文件翻译成多种语言
9 | 3. **上传功能**:将本地图片上传到云存储,并更新 Markdown 文件中的引用
10 | 4. **配置管理**:管理工具的配置信息
11 | 5. **其他功能**:如导出为其他格式、生成 llms.txt 文件等
12 |
13 | ## 项目结构
14 |
15 | ```bash
16 | ../mdctl
17 | ├── cmd
18 | │ ├── config.go
19 | │ ├── download.go
20 | │ ├── export.go
21 | │ ├── llmstxt.go
22 | │ ├── root.go
23 | │ ├── translate.go
24 | │ └── upload.go
25 | ├── internal
26 | │ ├── cache
27 | │ ├── config
28 | │ ├── exporter
29 | │ ├── llmstxt
30 | │ ├── markdownfmt
31 | │ ├── processor
32 | │ ├── storage
33 | │ ├── translator
34 | │ └── uploader
35 | ├── main.go
36 | ├── go.mod
37 | └── go.sum
38 | ```
39 |
40 | ## 核心模块说明
41 |
42 | ### 命令行模块 (cmd/)
43 |
44 | 使用 [Cobra](https://github.com/spf13/cobra) 库实现命令行界面,主要命令包括:
45 |
46 | - **root**: 根命令,定义基本信息和版本
47 | - **download**: 下载远程图片到本地
48 | - **translate**: 翻译 Markdown 文件
49 | - **upload**: 上传本地图片到云存储
50 | - **config**: 管理配置信息
51 |
52 | ### 处理器模块 (internal/processor/)
53 |
54 | 负责处理 Markdown 文件中的远程图片下载,主要功能:
55 |
56 | - 解析 Markdown 文件中的图片链接
57 | - 下载远程图片到本地
58 | - 更新 Markdown 文件中的图片引用路径
59 |
60 | ### 翻译模块 (internal/translator/)
61 |
62 | 负责翻译 Markdown 文件,主要功能:
63 |
64 | - 支持多种语言翻译
65 | - 保持 Markdown 格式和 front matter 不变
66 | - 使用 AI 模型进行翻译
67 | - 支持目录结构的翻译
68 |
69 | ### 上传模块 (internal/uploader/)
70 |
71 | 负责上传本地图片到云存储,主要功能:
72 |
73 | - 解析 Markdown 文件中的本地图片链接
74 | - 上传图片到云存储
75 | - 更新 Markdown 文件中的图片引用路径
76 | - 支持多种冲突处理策略
77 |
78 | ### 存储模块 (internal/storage/)
79 |
80 | 定义存储提供者接口和实现,主要功能:
81 |
82 | - 提供统一的存储接口
83 | - 支持 S3 兼容的存储服务
84 | - 处理文件上传和元数据管理
85 |
86 | ### llms.txt 生成模块 (internal/llmstxt/)
87 |
88 | 负责从网站的 sitemap.xml 生成 llms.txt 文件,主要功能:
89 |
90 | - 解析 sitemap.xml 文件
91 | - 访问每个 URL 并提取页面内容
92 | - 生成格式化的 llms.txt 文档
93 |
94 | ### 配置模块 (internal/config/)
95 |
96 | 负责管理配置信息,主要功能:
97 |
98 | - 加载和保存配置文件
99 | - 管理 AI 模型配置
100 | - 管理云存储配置
101 |
102 | ## 开发风格和约定
103 |
104 | ### 代码组织
105 |
106 | 1. **命令与实现分离**:命令行接口在 `cmd/` 目录,具体实现在 `internal/` 目录
107 | 2. **模块化设计**:每个功能都有独立的模块,如处理器、翻译器、上传器等
108 | 3. **接口定义**:使用接口定义模块间交互,如存储提供者接口
109 |
110 | ### 错误处理
111 |
112 | 错误处理采用 Go 语言的标准方式,通过返回错误值进行传递和处理。
113 |
114 | ### 配置管理
115 |
116 | 配置文件存储在 `~/.config/mdctl/config.json`,包含:
117 |
118 | - AI 模型配置(端点、API 密钥、模型名称等)
119 | - 云存储配置(提供者、区域、访问密钥等)
120 |
121 | ### 日志输出
122 |
123 | 使用标准输出进行日志记录,提供详细的处理信息和错误信息。
124 |
125 | ## 添加新功能的步骤
126 |
127 | 1. **定义命令**:在 `cmd/` 目录下创建新的命令文件,定义命令行接口
128 | 2. **实现功能**:在 `internal/` 目录下创建相应的实现模块
129 | 3. **注册命令**:在 `cmd/root.go` 的 `init()` 函数中注册新命令
130 | 4. **更新文档**:更新 README 文件,添加新功能的说明
131 |
132 | ## 构建和发布
133 |
134 | 项目使用 Makefile 和 GoReleaser 进行构建和发布:
135 |
136 | - **构建**:使用 `make build` 命令构建项目
137 | - **发布**:使用 `make release` 命令发布新版本
138 |
139 | ## 扩展点
140 |
141 | ### 添加新的存储提供者
142 |
143 | 1. 在 `internal/storage/` 目录下创建新的提供者实现
144 | 2. 实现 `Provider` 接口
145 | 3. 在初始化时注册提供者
146 |
147 | ### 添加新的 AI 模型支持
148 |
149 | 1. 在 `internal/translator/` 目录下扩展翻译器实现
150 | 2. 添加新模型的 API 调用
151 | 3. 更新配置模块以支持新模型的配置
152 |
153 | ### 添加新的 Markdown 处理功能
154 |
155 | 1. 创建新的处理器模块
156 | 2. 实现 Markdown 解析和处理逻辑
157 | 3. 添加新的命令行接口
158 |
--------------------------------------------------------------------------------
/internal/exporter/heading.go:
--------------------------------------------------------------------------------
1 | package exporter
2 |
import (
	"bufio"
	"fmt"
	"regexp"
	"strings"
	"unicode"
)
9 |
var (
	// atxHeadingRegex matches ATX-style headings ("# Title" … "###### Title").
	// Group 1 captures the '#' run, group 2 the heading text.
	atxHeadingRegex = regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
	// Setext-style heading underlines: a run of '=' marks a level-1
	// heading, a run of '-' a level-2 heading.
	setextHeading1Regex = regexp.MustCompile(`^=+\s*$`)
	setextHeading2Regex = regexp.MustCompile(`^-+\s*$`)
)

// ShiftHeadings adjusts all Markdown heading levels in content by shiftBy
// (positive shifts demote, negative shifts promote). Setext headings are
// rewritten as ATX headings. A shifted level above 6 is rendered as bold
// text instead; a shifted level below 1 is clamped to 1 (the previous
// implementation panicked here, because strings.Repeat rejects negative
// counts). The result never carries a trailing newline.
func ShiftHeadings(content string, shiftBy int) string {
	if shiftBy == 0 {
		return content
	}

	// renderHeading formats text at the shifted level, clamping to a
	// minimum of 1 and falling back to bold when the level exceeds
	// Markdown's maximum of 6.
	renderHeading := func(level int, text string) string {
		if level < 1 {
			level = 1 // strings.Repeat panics on negative counts
		}
		if level <= 6 {
			return fmt.Sprintf("%s %s", strings.Repeat("#", level), text)
		}
		return fmt.Sprintf("**%s**", text)
	}

	scanner := bufio.NewScanner(strings.NewReader(content))
	var result []string
	// prevLine holds the previous ordinary (non-heading) line so a
	// following Setext underline can convert it. It is cleared after any
	// heading so a stray underline cannot re-convert converted text or
	// attach to an ATX heading (both were possible before).
	var prevLine string

	for scanner.Scan() {
		line := scanner.Text()

		switch {
		case atxHeadingRegex.MatchString(line):
			m := atxHeadingRegex.FindStringSubmatch(line)
			result = append(result, renderHeading(len(m[1])+shiftBy, m[2]))
			prevLine = ""
		case setextHeading1Regex.MatchString(line) && prevLine != "":
			// '=' underline: the previous line was a level-1 heading.
			result[len(result)-1] = renderHeading(1+shiftBy, prevLine)
			prevLine = ""
		case setextHeading2Regex.MatchString(line) && prevLine != "":
			// '-' underline: the previous line was a level-2 heading.
			result[len(result)-1] = renderHeading(2+shiftBy, prevLine)
			prevLine = ""
		default:
			result = append(result, line)
			prevLine = line
		}
	}

	return strings.Join(result, "\n")
}
76 |
// AddTitleFromFilename prepends a heading derived from filename to content.
// The Markdown extension is stripped, underscores and hyphens become
// spaces, and the result is title-cased. level selects the heading depth;
// levels above 6 fall back to bold text and levels below 1 are clamped to
// 1 (strings.Repeat would otherwise panic), matching ShiftHeadings.
func AddTitleFromFilename(content, filename string, level int) string {
	// Extract heading text from the filename (remove the extension).
	title := strings.TrimSuffix(filename, ".md")
	title = strings.TrimSuffix(title, ".markdown")

	// Replace underscores and hyphens with spaces, making the heading more readable.
	title = strings.ReplaceAll(title, "_", " ")
	title = strings.ReplaceAll(title, "-", " ")

	// Capitalize the first letter of each word. titleCase replaces the
	// deprecated strings.Title while preserving its behavior.
	title = titleCase(title)

	if level < 1 {
		level = 1 // clamp: strings.Repeat panics on negative counts
	}

	// Create the heading line.
	var titleLine string
	if level <= 6 {
		titleLine = fmt.Sprintf("%s %s\n\n", strings.Repeat("#", level), title)
	} else {
		titleLine = fmt.Sprintf("**%s**\n\n", title)
	}

	return titleLine + content
}

// titleCase upper-cases every letter that follows a non-letter, mirroring
// the rule used by the deprecated strings.Title.
func titleCase(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	prevIsLetter := false
	for _, r := range s {
		if !prevIsLetter {
			r = unicode.ToTitle(r)
		}
		prevIsLetter = unicode.IsLetter(r)
		b.WriteRune(r)
	}
	return b.String()
}
100 |
--------------------------------------------------------------------------------
/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image Build
2 |
3 | on:
4 | release:
5 | types: [published, created]
6 | push:
7 | tags:
8 | - 'v*'
9 | paths-ignore:
10 | - '**.md'
11 | - '.github/**'
12 | - '!.github/workflows/docker-build.yml'
13 | workflow_dispatch:
14 |
15 | jobs:
16 | docker:
17 | name: Build and Push Multi-arch Image
18 | runs-on: ubuntu-latest
19 | permissions:
20 | contents: write
21 | packages: write
22 | steps:
23 | - name: Checkout Code
24 | uses: actions/checkout@v4
25 | with:
26 | fetch-depth: 0
27 |
28 | - name: Set up QEMU
29 | uses: docker/setup-qemu-action@v3
30 |
31 | - name: Setup Docker Buildx
32 | uses: docker/setup-buildx-action@v3
33 |
34 | - name: Login to GitHub Container Registry
35 | uses: docker/login-action@v3
36 | with:
37 | registry: ghcr.io
38 | username: ${{ github.actor }}
39 | password: ${{ secrets.GITHUB_TOKEN }}
40 |
41 | - name: Extract Metadata
42 | id: meta
43 | uses: docker/metadata-action@v5
44 | with:
45 | images: ghcr.io/${{ github.repository_owner }}/mdctl
46 | tags: |
47 | type=ref,event=branch
48 | type=ref,event=pr
49 | type=semver,pattern={{version}}
50 | type=semver,pattern={{major}}.{{minor}}
51 | type=semver,pattern={{major}}
52 | type=sha,format=short
53 | type=raw,value=latest,enable=${{ github.ref_type == 'tag' }}
54 |
55 | - name: Display tags
56 | run: |
57 | echo "Generated tags: ${{ steps.meta.outputs.tags }}"
58 | echo "Ref type: ${{ github.ref_type }}"
59 | echo "Ref: ${{ github.ref }}"
60 |
61 | # Set explicit latest tag for tag events
62 | - name: Set explicit latest tag
63 | if: startsWith(github.ref, 'refs/tags/')
64 | run: echo "EXTRA_TAGS=ghcr.io/${{ github.repository_owner }}/mdctl:latest" >> $GITHUB_ENV
65 |
66 | # Get version information
67 | - name: Get version info
68 | id: version_info
69 | run: |
70 | # Get version from tag or git describe
71 | if [[ "$GITHUB_REF_TYPE" == "tag" ]]; then
72 | VERSION="${GITHUB_REF_NAME}"
73 | else
74 | VERSION="$(git describe --tags --always || echo 'dev')"
75 | fi
76 |
77 | # Get build time
78 | BUILD_TIME="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
79 |
80 | # Set outputs
81 | echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT
82 | echo "BUILD_TIME=${BUILD_TIME}" >> $GITHUB_OUTPUT
83 |
84 | # Display for debugging
85 | echo "Version: ${VERSION}"
86 | echo "Build time: ${BUILD_TIME}"
87 |
88 | - name: Build and Push Multi-arch Image
89 | uses: docker/build-push-action@v5
90 | with:
91 | context: .
92 | platforms: linux/amd64,linux/arm64
93 | push: true
94 | tags: ${{ steps.meta.outputs.tags }}${{ env.EXTRA_TAGS != '' && format(',{0}', env.EXTRA_TAGS) || '' }}
95 | labels: ${{ steps.meta.outputs.labels }}
96 | build-args: |
97 | VERSION=${{ steps.version_info.outputs.VERSION }}
98 | BUILD_TIME=${{ steps.version_info.outputs.BUILD_TIME }}
99 | cache-from: type=gha
100 | cache-to: type=gha,mode=max
101 |
--------------------------------------------------------------------------------
/internal/linter/config.go:
--------------------------------------------------------------------------------
1 | package linter
2 |
3 | import (
4 | "encoding/json"
5 | "os"
6 | "path/filepath"
7 | )
8 |
// ConfigFile represents a markdownlint configuration file (for example
// .markdownlint.json). Only the rule IDs listed below are recognized;
// each may be toggled individually via its RuleConfig entry.
type ConfigFile struct {
	// Default configuration: the built-in defaults are used when no
	// per-rule override is present.
	Default bool `json:"default,omitempty"`

	// Extends names another configuration file to inherit from.
	// NOTE(review): the field is parsed but no inheritance handling is
	// visible in this file — confirm it is honoured elsewhere.
	Extends string `json:"extends,omitempty"`

	// Rule-specific configuration, keyed by markdownlint rule ID.
	MD001 *RuleConfig `json:"MD001,omitempty"`
	MD003 *RuleConfig `json:"MD003,omitempty"`
	MD009 *RuleConfig `json:"MD009,omitempty"`
	MD010 *RuleConfig `json:"MD010,omitempty"`
	MD012 *RuleConfig `json:"MD012,omitempty"`
	MD013 *RuleConfig `json:"MD013,omitempty"`
	MD018 *RuleConfig `json:"MD018,omitempty"`
	MD019 *RuleConfig `json:"MD019,omitempty"`
	MD023 *RuleConfig `json:"MD023,omitempty"`
	MD032 *RuleConfig `json:"MD032,omitempty"`
	MD047 *RuleConfig `json:"MD047,omitempty"`
}

// RuleConfig represents configuration for a specific rule.
type RuleConfig struct {
	// Enabled toggles the rule; nil means "leave the rule's current
	// state unchanged" (see ApplyToRuleSet, which skips nil entries).
	Enabled *bool `json:"enabled,omitempty"`

	// Options holds rule-specific options.
	// NOTE(review): Options is not consumed anywhere in this file —
	// confirm where (or whether) it takes effect.
	Options map[string]interface{} `json:"options,omitempty"`
}
39 |
40 | // LoadConfigFile loads configuration from a file
41 | func LoadConfigFile(filename string) (*ConfigFile, error) {
42 | // Try to find config file if not specified
43 | if filename == "" {
44 | filename = findConfigFile()
45 | }
46 |
47 | if filename == "" {
48 | return &ConfigFile{Default: true}, nil
49 | }
50 |
51 | data, err := os.ReadFile(filename)
52 | if err != nil {
53 | return nil, err
54 | }
55 |
56 | var config ConfigFile
57 | if err := json.Unmarshal(data, &config); err != nil {
58 | return nil, err
59 | }
60 |
61 | return &config, nil
62 | }
63 |
64 | // ApplyToRuleSet applies the configuration to a rule set
65 | func (c *ConfigFile) ApplyToRuleSet(rs *RuleSet) {
66 | ruleConfigs := map[string]*RuleConfig{
67 | "MD001": c.MD001,
68 | "MD003": c.MD003,
69 | "MD009": c.MD009,
70 | "MD010": c.MD010,
71 | "MD012": c.MD012,
72 | "MD013": c.MD013,
73 | "MD018": c.MD018,
74 | "MD019": c.MD019,
75 | "MD023": c.MD023,
76 | "MD032": c.MD032,
77 | "MD047": c.MD047,
78 | }
79 |
80 | for ruleID, ruleConfig := range ruleConfigs {
81 | if ruleConfig != nil && ruleConfig.Enabled != nil {
82 | if rule, exists := rs.rules[ruleID]; exists {
83 | rule.SetEnabled(*ruleConfig.Enabled)
84 | }
85 | }
86 | }
87 | }
88 |
// findConfigFile probes the working directory and then the user's home
// directory for a well-known markdownlint configuration file, returning
// the path of the first match or "" when none exists.
func findConfigFile() string {
	names := []string{
		".markdownlint.json",
		".markdownlint.jsonc",
		".markdownlintrc",
		".markdownlintrc.json",
		".markdownlintrc.jsonc",
	}

	// Current working directory takes precedence.
	for _, name := range names {
		if _, err := os.Stat(name); err == nil {
			return name
		}
	}

	// Fall back to the home directory.
	home, err := os.UserHomeDir()
	if err != nil {
		return ""
	}
	for _, name := range names {
		candidate := filepath.Join(home, name)
		if _, err := os.Stat(candidate); err == nil {
			return candidate
		}
	}

	return ""
}
117 |
118 | // CreateDefaultConfig creates a default configuration file
119 | func CreateDefaultConfig(filename string) error {
120 | config := ConfigFile{
121 | Default: true,
122 | MD001: &RuleConfig{Enabled: boolPtr(true)},
123 | MD003: &RuleConfig{Enabled: boolPtr(true)},
124 | MD009: &RuleConfig{Enabled: boolPtr(true)},
125 | MD010: &RuleConfig{Enabled: boolPtr(true)},
126 | MD012: &RuleConfig{Enabled: boolPtr(true)},
127 | MD013: &RuleConfig{Enabled: boolPtr(true)},
128 | MD018: &RuleConfig{Enabled: boolPtr(true)},
129 | MD019: &RuleConfig{Enabled: boolPtr(true)},
130 | MD023: &RuleConfig{Enabled: boolPtr(true)},
131 | MD032: &RuleConfig{Enabled: boolPtr(true)},
132 | MD047: &RuleConfig{Enabled: boolPtr(true)},
133 | }
134 |
135 | data, err := json.MarshalIndent(config, "", " ")
136 | if err != nil {
137 | return err
138 | }
139 |
140 | return os.WriteFile(filename, data, 0644)
141 | }
142 |
143 | // boolPtr returns a pointer to a bool value
144 | func boolPtr(b bool) *bool {
145 | return &b
146 | }
147 |
--------------------------------------------------------------------------------
/internal/cache/cache.go:
--------------------------------------------------------------------------------
1 | package cache
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | "path/filepath"
8 | "sync"
9 | "time"
10 | )
11 |
// CacheItem records one uploaded file: its local and remote paths, its
// public URL, a content hash, and the upload timestamp.
type CacheItem struct {
	LocalPath  string    `json:"local_path"`
	RemotePath string    `json:"remote_path"`
	URL        string    `json:"url"`
	Hash       string    `json:"hash"`
	UploadTime time.Time `json:"upload_time"`
}

// Cache manages information about uploaded files, keyed by local path.
// Access is guarded by an internal RWMutex; a Cache must not be copied
// once its methods have been used.
type Cache struct {
	Items    map[string]CacheItem `json:"items"`
	Version  string               `json:"version"`
	CacheDir string               `json:"cache_dir,omitempty"`
	mutex    sync.RWMutex
}

// New creates a cache rooted at cacheDir. An empty cacheDir selects
// ~/.cache/mdctl, falling back to a directory under os.TempDir() when
// the home directory cannot be determined.
func New(cacheDir string) *Cache {
	dir := cacheDir
	if dir == "" {
		if homeDir, err := os.UserHomeDir(); err == nil {
			dir = filepath.Join(homeDir, ".cache", "mdctl")
		} else {
			// Fallback to the temp directory.
			dir = filepath.Join(os.TempDir(), "mdctl-cache")
		}
	}

	return &Cache{
		Items:    make(map[string]CacheItem),
		Version:  "1.0",
		CacheDir: dir,
	}
}
47 |
// saveWithoutLock writes the cache as indented JSON to
// <CacheDir>/upload-cache.json without acquiring the mutex.
// It must only be called from methods that already hold the write lock
// (Load, Save); calling it unlocked would race with concurrent mutations.
func (c *Cache) saveWithoutLock() error {
	// Ensure cache directory exists
	if err := os.MkdirAll(c.CacheDir, 0755); err != nil {
		return fmt.Errorf("failed to create cache directory: %v", err)
	}

	cacheFile := filepath.Join(c.CacheDir, "upload-cache.json")
	data, err := json.MarshalIndent(c, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal cache: %v", err)
	}

	if err := os.WriteFile(cacheFile, data, 0644); err != nil {
		return fmt.Errorf("failed to write cache file: %v", err)
	}

	return nil
}
68 |
69 | // Load reads cache from disk
70 | func (c *Cache) Load() error {
71 | c.mutex.Lock()
72 | defer c.mutex.Unlock()
73 |
74 | // Ensure cache directory exists
75 | if err := os.MkdirAll(c.CacheDir, 0755); err != nil {
76 | return fmt.Errorf("failed to create cache directory: %v", err)
77 | }
78 |
79 | cacheFile := filepath.Join(c.CacheDir, "upload-cache.json")
80 | if _, err := os.Stat(cacheFile); os.IsNotExist(err) {
81 | // Cache file doesn't exist yet, create a new one
82 | c.Items = make(map[string]CacheItem)
83 | return c.saveWithoutLock()
84 | }
85 |
86 | data, err := os.ReadFile(cacheFile)
87 | if err != nil {
88 | return fmt.Errorf("failed to read cache file: %v", err)
89 | }
90 |
91 | if err := json.Unmarshal(data, c); err != nil {
92 | // If cache is corrupt, start with a fresh one
93 | c.Items = make(map[string]CacheItem)
94 | return nil
95 | }
96 |
97 | return nil
98 | }
99 |
// Save acquires the write lock and persists the cache to disk via
// saveWithoutLock.
func (c *Cache) Save() error {
	c.mutex.Lock()
	defer c.mutex.Unlock()

	return c.saveWithoutLock() // Use the lockless version to avoid deadlock
}
107 |
108 | // AddItem adds or updates a cache item
109 | func (c *Cache) AddItem(localPath, remotePath, url, hash string) {
110 | c.mutex.Lock()
111 | defer c.mutex.Unlock()
112 |
113 | c.Items[localPath] = CacheItem{
114 | LocalPath: localPath,
115 | RemotePath: remotePath,
116 | URL: url,
117 | Hash: hash,
118 | UploadTime: time.Now(),
119 | }
120 | }
121 |
122 | // GetItem retrieves a cache item by local path
123 | func (c *Cache) GetItem(localPath string) (CacheItem, bool) {
124 | c.mutex.RLock()
125 | defer c.mutex.RUnlock()
126 |
127 | item, exists := c.Items[localPath]
128 | return item, exists
129 | }
130 |
131 | // HasItemWithHash checks if an item with the same hash exists
132 | func (c *Cache) HasItemWithHash(hash string) (CacheItem, bool) {
133 | c.mutex.RLock()
134 | defer c.mutex.RUnlock()
135 |
136 | for _, item := range c.Items {
137 | if item.Hash == hash {
138 | return item, true
139 | }
140 | }
141 | return CacheItem{}, false
142 | }
143 |
// RemoveItem removes the entry for localPath from the cache, if present.
// The change is in memory only; call Save to persist it.
func (c *Cache) RemoveItem(localPath string) {
	c.mutex.Lock()
	defer c.mutex.Unlock()

	delete(c.Items, localPath)
}
151 |
--------------------------------------------------------------------------------
/internal/llmstxt/generator.go:
--------------------------------------------------------------------------------
1 | package llmstxt
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "os"
8 | "sort"
9 | "time"
10 | )
11 |
// GeneratorConfig contains the configuration required to generate llms.txt.
type GeneratorConfig struct {
	SitemapURL   string   // sitemap.xml to crawl
	IncludePaths []string // URL path patterns to include
	ExcludePaths []string // URL path patterns to exclude
	FullMode     bool     // also capture page content
	Concurrency  int      // worker pool size
	Timeout      int      // per-request timeout
	UserAgent    string   // User-Agent header for fetches
	Verbose      bool     // enable progress logging
	VeryVerbose  bool     // more detailed log output
	MaxPages     int      // maximum number of pages to process, 0 means no limit
}

// PageInfo stores the information extracted for a single page.
type PageInfo struct {
	Title       string
	URL         string
	Description string
	Content     string // page content, only filled in full mode
	Section     string // first segment of the URL path, used for grouping
}

// Generator is the llms.txt generator.
type Generator struct {
	config GeneratorConfig
	logger *log.Logger
}

// NewGenerator creates a generator for config. Logging goes to stdout
// with an "[LLMSTXT] " prefix when either verbosity flag is set, and is
// discarded otherwise.
func NewGenerator(config GeneratorConfig) *Generator {
	var sink io.Writer = io.Discard
	prefix, flags := "", 0
	if config.Verbose || config.VeryVerbose {
		sink, prefix, flags = os.Stdout, "[LLMSTXT] ", log.LstdFlags
	}

	return &Generator{
		config: config,
		logger: log.New(sink, prefix, flags),
	}
}
55 |
56 | // Generate performs the generation process and returns the generated content
57 | func (g *Generator) Generate() (string, error) {
58 | startTime := time.Now()
59 | g.logger.Printf("Starting generation for sitemap: %s", g.config.SitemapURL)
60 | if g.config.FullMode {
61 | g.logger.Println("Full-content mode enabled")
62 | }
63 |
64 | // 1. Parse sitemap.xml to get URL list
65 | urls, err := g.parseSitemap()
66 | if err != nil {
67 | return "", fmt.Errorf("failed to parse sitemap: %w", err)
68 | }
69 | g.logger.Printf("Found %d URLs in sitemap", len(urls))
70 |
71 | // 2. Filter URLs (based on include/exclude mode)
72 | urls = g.filterURLs(urls)
73 | g.logger.Printf("%d URLs after filtering", len(urls))
74 |
75 | // 2.1. Apply max page limit
76 | if g.config.MaxPages > 0 && len(urls) > g.config.MaxPages {
77 | g.logger.Printf("Limiting to %d pages as requested (--max-pages)", g.config.MaxPages)
78 | urls = urls[:g.config.MaxPages]
79 | }
80 |
81 | // 3. Create worker pool and get page info
82 | pages, err := g.fetchPages(urls)
83 | if err != nil {
84 | return "", fmt.Errorf("failed to fetch pages: %w", err)
85 | }
86 |
87 | // 4. Group pages by section
88 | sections := g.groupBySections(pages)
89 |
90 | // 5. Format to Markdown content
91 | content := g.formatContent(sections)
92 |
93 | elapsedTime := time.Since(startTime).Round(time.Millisecond)
94 | g.logger.Printf("Generation completed successfully in %v", elapsedTime)
95 | return content, nil
96 | }
97 |
98 | // Group pages by section
99 | func (g *Generator) groupBySections(pages []PageInfo) map[string][]PageInfo {
100 | sections := make(map[string][]PageInfo)
101 |
102 | for _, page := range pages {
103 | sections[page.Section] = append(sections[page.Section], page)
104 | }
105 |
106 | // Sort pages within each section by URL path length
107 | for section, sectionPages := range sections {
108 | sort.Slice(sectionPages, func(i, j int) bool {
109 | return len(sectionPages[i].URL) < len(sectionPages[j].URL)
110 | })
111 | sections[section] = sectionPages
112 | }
113 |
114 | return sections
115 | }
116 |
117 | // Get sorted section name list, ensuring ROOT section is always first
118 | func (g *Generator) getSortedSections(sections map[string][]PageInfo) []string {
119 | sectionNames := make([]string, 0, len(sections))
120 |
121 | // Add ROOT section first (if exists)
122 | if _, hasRoot := sections["ROOT"]; hasRoot {
123 | sectionNames = append(sectionNames, "ROOT")
124 | }
125 |
126 | // Add other sections and sort alphabetically
127 | for section := range sections {
128 | if section != "ROOT" {
129 | sectionNames = append(sectionNames, section)
130 | }
131 | }
132 |
133 | // Only sort if there are non-ROOT sections
134 | if len(sectionNames) > 1 {
135 | // Only sort non-ROOT sections
136 | nonRootSections := sectionNames[1:]
137 | sort.Strings(nonRootSections)
138 | }
139 |
140 | return sectionNames
141 | }
142 |
--------------------------------------------------------------------------------
/cmd/translate.go:
--------------------------------------------------------------------------------
1 | package cmd
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "path/filepath"
7 | "strings"
8 |
9 | "github.com/samzong/mdctl/internal/config"
10 | "github.com/samzong/mdctl/internal/translator"
11 | "github.com/spf13/cobra"
12 | )
13 |
// Flag storage for the translate command (bound in init below).
var (
	fromPath string // -f/--from: source file or directory
	toPath   string // -t/--to: optional target file or directory
	locale   string // -l/--locales: target language code (e.g. zh, ja)
	force    bool   // -F/--force: re-translate already translated files
	format   bool   // -m/--format: format markdown after translation
)
21 |
22 | // Generate target file path
23 | func generateTargetPath(sourcePath, lang string) string {
24 | dir := filepath.Dir(sourcePath)
25 | base := filepath.Base(sourcePath)
26 | ext := filepath.Ext(base)
27 | nameWithoutExt := strings.TrimSuffix(base, ext)
28 | return filepath.Join(dir, nameWithoutExt+"_"+lang+ext)
29 | }
30 |
// translateCmd implements "mdctl translate": translate a markdown file
// or directory tree into a target language using the configured AI model.
var translateCmd = &cobra.Command{
	Use:   "translate",
	Short: "Translate markdown files using AI models",
	Long: `Translate markdown files or directories to specified language using AI models.

Supported AI Models:
  - OpenAI (Current)
  - DeepSeek R1 (Current)
  - Llama (Current)

Supported Languages:
  ar (العربية), de (Deutsch), en (English), es (Español), fr (Français),
  hi (हिन्दी), it (Italiano), ja (日本語), ko (한국어), pt (Português),
  ru (Русский), th (ไทย), vi (Tiếng Việt), zh (中文)

Examples:
  # Translate a single file to Chinese
  mdctl translate -f README.md -l zh

  # Translate a directory to Japanese
  mdctl translate -f docs -l ja

  # Force translate an already translated file
  mdctl translate -f README.md -l ko -F

  # Format markdown content after translation
  mdctl translate -f README.md -l zh -m

  # Translate to a specific output path
  mdctl translate -f docs -l fr -t translated_docs`,
	RunE: func(cmd *cobra.Command, args []string) error {
		cfg, err := config.LoadConfig()
		if err != nil {
			return fmt.Errorf("failed to load config: %v", err)
		}

		// Validate language option
		if !translator.IsLanguageSupported(locale) {
			return fmt.Errorf("unsupported locale: %s\nSupported languages: %s",
				locale,
				translator.GetSupportedLanguages())
		}

		// Check if source path exists
		if _, err := os.Stat(fromPath); os.IsNotExist(err) {
			return fmt.Errorf("source path does not exist: %s", fromPath)
		}

		// Get absolute path of source path
		srcAbs, err := filepath.Abs(fromPath)
		if err != nil {
			return fmt.Errorf("failed to get absolute path: %v", err)
		}

		// Check if it's a file or directory
		fi, err := os.Stat(srcAbs)
		if err != nil {
			return fmt.Errorf("failed to get file info: %v", err)
		}

		if fi.IsDir() {
			// Directory with no target path: translate in place,
			// mirroring the source directory structure.
			if toPath == "" {
				return translator.ProcessDirectory(srcAbs, srcAbs, locale, cfg, force, format)
			}
			// If target path is specified, use the specified path
			dstAbs, err := filepath.Abs(toPath)
			if err != nil {
				return fmt.Errorf("failed to get absolute path: %v", err)
			}
			return translator.ProcessDirectory(srcAbs, dstAbs, locale, cfg, force, format)
		}

		// Process single file
		var dstAbs string
		if toPath == "" {
			// If no target path specified, generate name_lang.md in the same directory as source
			dstAbs = generateTargetPath(srcAbs, locale)
		} else {
			// If target path specified, use the specified path
			dstAbs, err = filepath.Abs(toPath)
			if err != nil {
				return fmt.Errorf("failed to get absolute path: %v", err)
			}
		}

		// NOTE(review): ProcessDirectory is called with (…, force, format)
		// while ProcessFile is called with (…, format, force); the
		// trailing argument order looks inconsistent — confirm against
		// the translator package's signatures.
		return translator.ProcessFile(srcAbs, dstAbs, locale, cfg, format, force)
	},
}
120 |
// init binds the translate command's flags. The errors returned by
// MarkFlagRequired are deliberately ignored: they can only occur if the
// named flag was not registered above.
func init() {
	translateCmd.Flags().StringVarP(&fromPath, "from", "f", "", "Source file or directory path")
	translateCmd.Flags().StringVarP(&toPath, "to", "t", "", "Target file or directory path (optional, default: generate in same directory as source)")
	translateCmd.Flags().StringVarP(&locale, "locales", "l", "", "Target language code (e.g., zh, en, ja, ko, fr, de, es, etc.)")
	translateCmd.Flags().BoolVarP(&force, "force", "F", false, "Force translate even if already translated")
	translateCmd.Flags().BoolVarP(&format, "format", "m", false, "Format markdown content after translation")

	translateCmd.MarkFlagRequired("from")
	translateCmd.MarkFlagRequired("locales")
}
131 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mdctl - A CLI Tool for Markdown File Operations
2 |
3 |
4 |

5 |
6 |
An AI-powered CLI tool to enhance your Markdown workflow, with auto-image downloading, translation, and more features coming soon!
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | ## Key Features
17 |
18 | - Automatically downloads remote images to a specified local directory.
19 | - Translates markdown files using AI models with support for multiple languages.
20 | - Uploads local images in markdown files to cloud storage services and updates references.
21 | - Exports markdown files to various document formats (DOCX, PDF, EPUB) with customization options.
22 | - Generates llms.txt files from website sitemaps for training language models.
23 |
24 | ## Installation
25 |
26 | Use Homebrew to install mdctl. Follow the [Homebrew Installation Guide](https://brew.sh/) to install Homebrew.
27 |
28 | ```bash
29 | brew tap samzong/tap
30 | brew install samzong/tap/mdctl
31 | ```
32 |
33 | Or use go to install mdctl.
34 |
35 | ```bash
36 | go install github.com/samzong/mdctl@latest
37 | ```
38 |
39 | ## Usage
40 |
41 | Quick examples for common tasks:
42 |
43 | ### Downloading Images
44 |
45 | ```bash
46 | # Process a single file
47 | mdctl download -f path/to/your/file.md
48 |
49 | # Process a directory
50 | mdctl download -d path/to/your/directory
51 | ```
52 |
53 | ### Translating I18n
54 |
55 | ```bash
56 | # Translate to Chinese
57 | mdctl translate -f README.md -l zh
58 |
59 | # Translate a directory to Japanese
60 | mdctl translate -f docs -l ja
61 | ```
62 |
63 | ### Uploading Images to Cloud Storage
64 |
65 | ```bash
66 | # Upload images from a file
67 | mdctl upload -f post.md
68 |
69 | # Upload images from a directory
70 | mdctl upload -d docs/
71 | ```
72 |
73 | ### Exporting Documents to `.docx`
74 |
75 | ```bash
76 | # Export to DOCX
77 | mdctl export -f README.md -o output.docx
78 |
79 | # Export to PDF with table of contents
80 | mdctl export -d docs/ -o documentation.pdf -F pdf --toc
81 | ```
82 |
83 | ### Generating `llms.txt` from `sitemap.xml`
84 |
85 | ```bash
86 | # Standard mode (titles and descriptions)
87 | mdctl llmstxt https://example.com/sitemap.xml > llms.txt
88 |
89 | # Full-content mode
90 | mdctl llmstxt -f https://example.com/sitemap.xml > llms-full.txt
91 | ```
92 |
93 | ### GitHub Action
94 |
95 | Use mdctl in your CI with the Docker-based Action in this repo. Example workflow step:
96 |
97 | ```yaml
98 | jobs:
99 | docs:
100 | runs-on: ubuntu-latest
101 | steps:
102 | - uses: actions/checkout@v4
103 | - name: Export docs to DOCX
104 | uses: samzong/mdctl@v1
105 | with:
106 | args: "export -f README.md -o output.docx"
107 | ```
108 |
109 | Notes:
110 | - Set `with.args` to any mdctl command and flags (e.g., `download`, `translate`, `upload`, `export`, `llmstxt`).
111 | - Provide necessary credentials via `env` when using cloud features (e.g., S3 for `upload`).
112 | - You can set `working-directory` on the step if needed.
113 |
114 | ## Developer's Guide
115 |
116 | If you are interested in contributing, please refer to the [DEVELOPMENT.md](docs/DEVELOPMENT.md) file for a complete technical architecture, component design, and development guide.
117 |
118 | ## Contributing
119 |
120 | Welcome to contribute code, report issues, or suggest features! Please follow these steps:
121 |
122 | 1. Fork this repository
123 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
124 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
125 | 4. Push to the branch (`git push origin feature/amazing-feature`)
126 | 5. Open a Pull Request
127 |
128 | ## License
129 |
130 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
131 |
--------------------------------------------------------------------------------
/cmd/export.go:
--------------------------------------------------------------------------------
1 | package cmd
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "os"
8 |
9 | "github.com/samzong/mdctl/internal/exporter"
10 | "github.com/spf13/cobra"
11 | )
12 |
// Flag storage and command definition for "mdctl export".
var (
	exportFile          string      // -f: single markdown file to export
	exportDir           string      // -d: directory of markdown files to export
	siteType            string      // -s: site layout (basic, mkdocs, hugo, docusaurus)
	exportOutput        string      // -o: output file path
	exportTemplate      string      // -t: Word template file path
	exportFormat        string      // -F: output format (docx, pdf, epub)
	generateToc         bool        // --toc: emit a table of contents
	shiftHeadingLevelBy int         // --shift-heading-level-by: demote/promote headings
	fileAsTitle         bool        // --file-as-title: use filenames as section titles
	tocDepth            int         // --toc-depth: table-of-contents depth
	navPath             string      // -n: restrict export to one navigation path
	logger              *log.Logger // command-scoped logger, initialized in RunE

	// exportCmd implements "mdctl export": convert markdown sources to
	// DOCX/PDF/EPUB through Pandoc.
	// NOTE(review): `verbose` used in RunE is assumed to be declared on
	// the root command — confirm in cmd/root.go.
	exportCmd = &cobra.Command{
		Use:   "export",
		Short: "Export markdown files to other formats",
		Long: `Export markdown files to other formats like DOCX, PDF, EPUB.
Uses Pandoc as the underlying conversion tool.

Examples:
  mdctl export -f README.md -o output.docx
  mdctl export -d docs/ -o documentation.docx
  mdctl export -d docs/ -s mkdocs -o site_docs.docx
  mdctl export -d docs/ -o report.docx -t templates/corporate.docx
  mdctl export -d docs/ -o documentation.docx --shift-heading-level-by 2
  mdctl export -d docs/ -o documentation.docx --toc --toc-depth 4
  mdctl export -d docs/ -o documentation.pdf -F pdf`,
		RunE: func(cmd *cobra.Command, args []string) error {
			// Initialize logger: verbose prints to stdout, otherwise discard.
			if verbose {
				logger = log.New(os.Stdout, "[EXPORT] ", log.LstdFlags)
			} else {
				logger = log.New(io.Discard, "", 0)
			}

			logger.Println("Starting export process...")

			// Parameter validation: exactly one of -f/-d, plus -o, is required.
			if exportFile == "" && exportDir == "" {
				return fmt.Errorf("either source file (-f) or source directory (-d) must be specified")
			}
			if exportFile != "" && exportDir != "" {
				return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)")
			}
			if exportOutput == "" {
				return fmt.Errorf("output file (-o) must be specified")
			}

			logger.Printf("Validating parameters: file=%s, dir=%s, output=%s, format=%s, site-type=%s",
				exportFile, exportDir, exportOutput, exportFormat, siteType)

			// Check if Pandoc is available before doing any work.
			logger.Println("Checking Pandoc availability...")
			if err := exporter.CheckPandocAvailability(); err != nil {
				return err
			}
			logger.Println("Pandoc is available.")

			// Create export options from the parsed flags.
			options := exporter.ExportOptions{
				Template:            exportTemplate,
				GenerateToc:         generateToc,
				ShiftHeadingLevelBy: shiftHeadingLevelBy,
				FileAsTitle:         fileAsTitle,
				Format:              exportFormat,
				SiteType:            siteType,
				Verbose:             verbose,
				Logger:              logger,
				TocDepth:            tocDepth,
				NavPath:             navPath,
			}

			logger.Printf("Export options: template=%s, toc=%v, toc-depth=%d, shift-heading=%d, file-as-title=%v",
				exportTemplate, generateToc, tocDepth, shiftHeadingLevelBy, fileAsTitle)

			// Execute export: single file and directory take different paths.
			exp := exporter.NewExporter()
			var err error

			if exportFile != "" {
				logger.Printf("Exporting single file: %s -> %s", exportFile, exportOutput)
				err = exp.ExportFile(exportFile, exportOutput, options)
			} else {
				logger.Printf("Exporting directory: %s -> %s", exportDir, exportOutput)
				err = exp.ExportDirectory(exportDir, exportOutput, options)
			}

			if err != nil {
				logger.Printf("Export failed: %s", err)
				return err
			}

			logger.Println("Export completed successfully.")
			return nil
		},
	}
)
111 |
// init registers all command-line flags for the export command.
// Source selection (-f/-d) is mutually exclusive and validated in RunE;
// the command itself is presumably attached to the root command in
// root.go — not visible in this file.
func init() {
	exportCmd.Flags().StringVarP(&exportFile, "file", "f", "", "Source markdown file to export")
	exportCmd.Flags().StringVarP(&exportDir, "dir", "d", "", "Source directory containing markdown files to export")
	exportCmd.Flags().StringVarP(&siteType, "site-type", "s", "basic", "Site type (basic, mkdocs, hugo, docusaurus)")
	exportCmd.Flags().StringVarP(&exportOutput, "output", "o", "", "Output file path")
	exportCmd.Flags().StringVarP(&exportTemplate, "template", "t", "", "Word template file path")
	exportCmd.Flags().StringVarP(&exportFormat, "format", "F", "docx", "Output format (docx, pdf, epub)")
	exportCmd.Flags().BoolVar(&generateToc, "toc", false, "Generate table of contents")
	exportCmd.Flags().IntVar(&shiftHeadingLevelBy, "shift-heading-level-by", 0, "Shift heading level by N")
	exportCmd.Flags().BoolVar(&fileAsTitle, "file-as-title", false, "Use filename as section title")
	exportCmd.Flags().IntVar(&tocDepth, "toc-depth", 3, "Depth of table of contents (default 3)")
	exportCmd.Flags().StringVarP(&navPath, "nav-path", "n", "", "Specify the navigation path to export (e.g. 'Section1/Subsection2')")
}
125 |
--------------------------------------------------------------------------------
/internal/linter/fixer.go:
--------------------------------------------------------------------------------
1 | package linter
2 |
import (
	"regexp"
	"sort"
	"strings"
)
7 |
// Fixer provides auto-fix functionality for markdown issues
type Fixer struct {
	// rules maps a rule ID (e.g. "MD009") to its fix function. Each
	// function receives the file's lines and returns the (possibly
	// modified) lines plus the number of fixes it applied.
	rules map[string]func([]string) ([]string, int)
}
12 |
13 | // NewFixer creates a new fixer instance
14 | func NewFixer() *Fixer {
15 | f := &Fixer{
16 | rules: make(map[string]func([]string) ([]string, int)),
17 | }
18 |
19 | // Register fix functions for each rule
20 | f.rules["MD009"] = f.fixTrailingSpaces
21 | f.rules["MD010"] = f.fixHardTabs
22 | f.rules["MD012"] = f.fixMultipleBlankLines
23 | f.rules["MD018"] = f.fixNoSpaceAfterHash
24 | f.rules["MD019"] = f.fixMultipleSpacesAfterHash
25 | f.rules["MD023"] = f.fixHeadingIndentation
26 | f.rules["MD032"] = f.fixListSpacing
27 | f.rules["MD047"] = f.fixFileEndNewline
28 |
29 | return f
30 | }
31 |
32 | // ApplyFixes applies fixes for the given issues
33 | func (f *Fixer) ApplyFixes(content string, issues []*Issue) (string, int) {
34 | lines := strings.Split(content, "\n")
35 | totalFixed := 0
36 |
37 | // Group issues by rule for efficient processing
38 | ruleIssues := make(map[string][]*Issue)
39 | for _, issue := range issues {
40 | ruleIssues[issue.Rule] = append(ruleIssues[issue.Rule], issue)
41 | }
42 |
43 | // Apply fixes for each rule
44 | for rule, ruleSpecificIssues := range ruleIssues {
45 | if fixFunc, exists := f.rules[rule]; exists {
46 | var fixed int
47 | lines, fixed = fixFunc(lines)
48 | totalFixed += fixed
49 |
50 | // Mark issues as fixed
51 | for _, issue := range ruleSpecificIssues {
52 | issue.Fixed = true
53 | }
54 | }
55 | }
56 |
57 | return strings.Join(lines, "\n"), totalFixed
58 | }
59 |
60 | // fixTrailingSpaces removes trailing spaces from lines
61 | func (f *Fixer) fixTrailingSpaces(lines []string) ([]string, int) {
62 | fixed := 0
63 | for i, line := range lines {
64 | trimmed := strings.TrimRight(line, " \t")
65 | if trimmed != line {
66 | lines[i] = trimmed
67 | fixed++
68 | }
69 | }
70 | return lines, fixed
71 | }
72 |
73 | // fixHardTabs replaces hard tabs with spaces
74 | func (f *Fixer) fixHardTabs(lines []string) ([]string, int) {
75 | fixed := 0
76 | for i, line := range lines {
77 | if strings.Contains(line, "\t") {
78 | lines[i] = strings.ReplaceAll(line, "\t", " ")
79 | fixed++
80 | }
81 | }
82 | return lines, fixed
83 | }
84 |
85 | // fixMultipleBlankLines removes consecutive blank lines
86 | func (f *Fixer) fixMultipleBlankLines(lines []string) ([]string, int) {
87 | var result []string
88 | fixed := 0
89 | prevBlank := false
90 |
91 | for _, line := range lines {
92 | isBlank := strings.TrimSpace(line) == ""
93 |
94 | if isBlank && prevBlank {
95 | fixed++ // Count removed blank lines
96 | continue
97 | }
98 |
99 | result = append(result, line)
100 | prevBlank = isBlank
101 | }
102 |
103 | return result, fixed
104 | }
105 |
106 | // fixNoSpaceAfterHash adds space after hash in headings
107 | func (f *Fixer) fixNoSpaceAfterHash(lines []string) ([]string, int) {
108 | fixed := 0
109 | re := regexp.MustCompile(`^(#+)([^# ])`)
110 |
111 | for i, line := range lines {
112 | trimmed := strings.TrimSpace(line)
113 | if re.MatchString(trimmed) {
114 | lines[i] = re.ReplaceAllString(trimmed, "$1 $2")
115 | fixed++
116 | }
117 | }
118 |
119 | return lines, fixed
120 | }
121 |
122 | // fixMultipleSpacesAfterHash removes extra spaces after hash in headings
123 | func (f *Fixer) fixMultipleSpacesAfterHash(lines []string) ([]string, int) {
124 | fixed := 0
125 | re := regexp.MustCompile(`^(#+)\s{2,}`)
126 |
127 | for i, line := range lines {
128 | trimmed := strings.TrimSpace(line)
129 | if re.MatchString(trimmed) {
130 | lines[i] = re.ReplaceAllString(trimmed, "$1 ")
131 | fixed++
132 | }
133 | }
134 |
135 | return lines, fixed
136 | }
137 |
138 | // fixHeadingIndentation removes leading spaces from headings
139 | func (f *Fixer) fixHeadingIndentation(lines []string) ([]string, int) {
140 | fixed := 0
141 | re := regexp.MustCompile(`^ +(#.*)`)
142 |
143 | for i, line := range lines {
144 | if re.MatchString(line) {
145 | lines[i] = re.ReplaceAllString(line, "$1")
146 | fixed++
147 | }
148 | }
149 |
150 | return lines, fixed
151 | }
152 |
153 | // fixListSpacing adds blank lines around lists
154 | func (f *Fixer) fixListSpacing(lines []string) ([]string, int) {
155 | fixed := 0
156 | var result []string
157 | listRe := regexp.MustCompile(`^(\s*[*+-] )`)
158 |
159 | for i, line := range lines {
160 | if listRe.MatchString(line) {
161 | // Check if previous line needs a blank line
162 | if i > 0 && strings.TrimSpace(lines[i-1]) != "" && len(result) > 0 {
163 | result = append(result, "")
164 | fixed++
165 | }
166 | }
167 | result = append(result, line)
168 | }
169 |
170 | return result, fixed
171 | }
172 |
173 | // fixFileEndNewline ensures file ends with single newline
174 | func (f *Fixer) fixFileEndNewline(lines []string) ([]string, int) {
175 | if len(lines) == 0 {
176 | return lines, 0
177 | }
178 |
179 | // Remove trailing empty lines
180 | for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" {
181 | lines = lines[:len(lines)-1]
182 | }
183 |
184 | // Add single empty line at the end
185 | lines = append(lines, "")
186 |
187 | return lines, 1
188 | }
189 |
--------------------------------------------------------------------------------
/internal/markdownfmt/formatter.go:
--------------------------------------------------------------------------------
1 | package markdownfmt
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 | "strings"
7 | )
8 |
// Formatter for formatting markdown content
type Formatter struct {
	// Whether formatting is enabled; when false, Format returns its
	// input unchanged.
	enabled bool
}
14 |
15 | // New creates a new formatter
16 | func New(enabled bool) *Formatter {
17 | return &Formatter{
18 | enabled: enabled,
19 | }
20 | }
21 |
// Format formats markdown content.
//
// The pass works line by line: headings are normalized and padded with
// blank lines, ordinary lines get link/parenthesis/CJK-spacing fixes, and a
// final pass collapses any doubled blank lines the padding introduced.
// Returns the input unchanged when the formatter is disabled.
//
// NOTE(review): fenced code blocks are not exempted — a line starting with
// "#" inside a code fence is treated as a heading by isHeading.
func (f *Formatter) Format(content string) string {
	if !f.enabled {
		return content
	}

	// 1. Split content into lines
	lines := strings.Split(content, "\n")

	// 2. Process each line
	var formatted []string
	for i := 0; i < len(lines); i++ {
		line := lines[i]

		// Process headings: ensure there are blank lines before and after
		if isHeading(line) {
			// If not the first line and previous line is not blank, add a blank line
			if i > 0 && len(strings.TrimSpace(lines[i-1])) > 0 {
				formatted = append(formatted, "")
			}
			// Normalize heading format (one space after #)
			line = formatHeading(line)
			formatted = append(formatted, line)
			// If not the last line, add a blank line (extra blanks are
			// collapsed by removeConsecutiveBlankLines below)
			if i < len(lines)-1 {
				formatted = append(formatted, "")
			}
			continue
		}

		// Process spaces in links
		line = formatMarkdownLinks(line)

		// Process content in parentheses
		line = formatParentheses(line)

		// Process spaces between Chinese and English text
		line = formatChineseEnglishSpace(line)

		formatted = append(formatted, line)
	}

	// 3. Handle consecutive blank lines
	formatted = removeConsecutiveBlankLines(formatted)

	// 4. Join lines
	result := strings.Join(formatted, "\n")

	return result
}
72 |
// isHeading reports whether the line is an ATX-style heading, i.e. its
// first non-whitespace character is '#'.
func isHeading(line string) bool {
	trimmed := strings.TrimSpace(line)
	return len(trimmed) > 0 && trimmed[0] == '#'
}
77 |
// headingHashRe matches the leading hash run of an ATX heading together
// with any whitespace that follows it. Compiled once at package scope so
// the pattern is not rebuilt on every call.
var headingHashRe = regexp.MustCompile(`^(#+)\s*`)

// formatHeading normalizes an ATX heading line: surrounding whitespace is
// removed and exactly one space separates the hash run from the text.
func formatHeading(line string) string {
	line = strings.TrimSpace(line)
	return headingHashRe.ReplaceAllString(line, "$1 ")
}
86 |
var (
	// parenURLRe matches a parenthesized group containing an http(s)
	// URL; such groups are protected from reformatting.
	parenURLRe = regexp.MustCompile(`\([^)]*https?://[^)]+\)`)
	// parenContentRe matches any parenthesized group.
	parenContentRe = regexp.MustCompile(`\(([^)]+)\)`)
	// parenWSRe collapses runs of whitespace to one space.
	parenWSRe = regexp.MustCompile(`\s+`)
)

// formatParentheses normalizes whitespace inside parentheses: leading and
// trailing spaces are dropped and internal whitespace runs collapse to a
// single space. Parenthesized http/https links are swapped for
// placeholders first so their content is left untouched.
//
// All regexes are compiled once at package scope — the previous version
// recompiled the whitespace pattern for every parenthesized match.
func formatParentheses(line string) string {
	// Protect links by temporarily replacing them with placeholders.
	links := parenURLRe.FindAllString(line, -1)
	for i, link := range links {
		line = strings.Replace(line, link, fmt.Sprintf("__LINK_PLACEHOLDER_%d__", i), 1)
	}

	// Normalize the remaining parenthesized content.
	line = parenContentRe.ReplaceAllStringFunc(line, func(match string) string {
		content := match[1 : len(match)-1]
		content = strings.TrimSpace(content)
		content = parenWSRe.ReplaceAllString(content, " ")
		return fmt.Sprintf("(%s)", content)
	})

	// Restore the protected links.
	for i, link := range links {
		line = strings.Replace(line, fmt.Sprintf("__LINK_PLACEHOLDER_%d__", i), link, 1)
	}

	return line
}
115 |
var (
	// mdLinkRe matches an inline markdown link [text](url), tolerating
	// whitespace around the URL.
	mdLinkRe = regexp.MustCompile(`\[(.*?)\]\(\s*(.*?)\s*\)`)
	// urlJunkRe strips whitespace and invisible characters from URLs.
	urlJunkRe = regexp.MustCompile(`[\s\p{Zs}\p{C}]+`)
	// linkWSRe collapses whitespace runs (link text) or removes them
	// entirely (anchors).
	linkWSRe = regexp.MustCompile(`\s+`)
	// anchorLinkRe matches the "](#anchor)" tail of an in-page link.
	anchorLinkRe = regexp.MustCompile(`\]\(#(.*?)\)`)
)

// formatMarkdownLinks cleans whitespace inside markdown links: link text is
// trimmed and internal runs collapse to one space, while URLs (and heading
// anchors) have all whitespace and invisible characters removed.
//
// All regexes are compiled once at package scope — the previous version
// recompiled them on every call and inside every replacement closure.
func formatMarkdownLinks(line string) string {
	// Normalize [text](url) links.
	line = mdLinkRe.ReplaceAllStringFunc(line, func(match string) string {
		parts := mdLinkRe.FindStringSubmatch(match)
		if len(parts) != 3 {
			return match
		}

		text := parts[1]
		url := parts[2]

		// URLs must contain no whitespace or invisible characters.
		url = strings.TrimSpace(url)
		url = urlJunkRe.ReplaceAllString(url, "")

		// Link text keeps single spaces but no leading/trailing or
		// repeated whitespace.
		text = strings.TrimSpace(text)
		text = linkWSRe.ReplaceAllString(text, " ")

		return fmt.Sprintf("[%s](%s)", text, url)
	})

	// Remove whitespace from heading anchors, e.g. "](#my anchor)".
	line = anchorLinkRe.ReplaceAllStringFunc(line, func(match string) string {
		parts := anchorLinkRe.FindStringSubmatch(match)
		if len(parts) != 2 {
			return match
		}
		anchor := linkWSRe.ReplaceAllString(parts[1], "")
		return fmt.Sprintf("](#%s)", anchor)
	})

	return line
}
161 |
var (
	// hanThenLatinRe matches a Han character directly followed by an
	// ASCII letter or digit. Compiled once at package scope.
	hanThenLatinRe = regexp.MustCompile(`([\p{Han}])([A-Za-z0-9])`)
	// latinThenHanRe matches the opposite boundary.
	latinThenHanRe = regexp.MustCompile(`([A-Za-z0-9])([\p{Han}])`)
)

// formatChineseEnglishSpace inserts a single space at every boundary
// between Chinese (Han) characters and ASCII letters/digits, in both
// directions.
func formatChineseEnglishSpace(line string) string {
	line = hanThenLatinRe.ReplaceAllString(line, "$1 $2")
	return latinThenHanRe.ReplaceAllString(line, "$1 $2")
}
173 |
// removeConsecutiveBlankLines collapses runs of blank (whitespace-only)
// lines, keeping only the first line of each run.
func removeConsecutiveBlankLines(lines []string) []string {
	var result []string
	prevBlank := false

	for _, line := range lines {
		blank := len(strings.TrimSpace(line)) == 0
		if blank && prevBlank {
			continue // drop repeated blank lines
		}
		result = append(result, line)
		prevBlank = blank
	}

	return result
}
191 |
--------------------------------------------------------------------------------
/docs/features/export.md:
--------------------------------------------------------------------------------
1 | # Export 功能设计文档
2 |
3 | ## 功能概述
4 |
5 | 为 mdctl 工具增加 `export` 子命令,用于将 Markdown 文件导出为其他格式。第一版将优先支持导出为 Word 文档格式(docx),后续可扩展支持更多格式(如 PDF、EPUB 等)。
6 |
7 | 该功能将利用 Pandoc 作为底层导出工具,支持 Pandoc 的模板系统,允许用户配置自定义的导出模板。
8 |
9 | ## 用户需求
10 |
11 | 1. 支持将单个 Markdown 文件导出为 Word 格式
12 | 2. 支持将多个 Markdown 文件合并后导出为单个 Word 文档
13 | 3. 支持按照文件夹中的文件名顺序合并文件
14 | 4. 支持多种文档系统(MkDocs 第一期、Hugo、Docusaurus coming soon)的文件读取方式
15 | 5. 在合并过程中智能调整标题层级,保持文档结构的清晰性
16 | 6. 支持自定义 Word 模板,使最终文档具有一致的样式
17 |
18 | ## 命令设计
19 |
20 | ```
21 | mdctl export [flags]
22 | ```
23 |
24 | ### 参数设计
25 |
26 | - `-f, --file`: 指定单个 Markdown 文件进行导出
27 | - `-d, --dir`: 指定包含多个 Markdown 文件的目录
- `-s, --site-type`: 指定文档站点类型,可选值:basic, mkdocs, hugo, docusaurus(默认:basic)
29 | - `-o, --output`: 指定输出文件路径
30 | - `-t, --template`: 指定 Word 模板文件路径
31 | - `-F, --format`: 指定输出格式,可选值:docx, pdf, epub(默认:docx)
32 | - `--toc`: 是否生成目录(默认:false)
33 | - `--shift-heading-level-by`: 标题层级偏移量(默认:0)
- `--file-as-title`: 是否使用文件名作为章节标题(默认:false)
- `--toc-depth`: 指定生成目录的深度(默认:3)
- `-n, --nav-path`: 指定要导出的导航路径(例如 'Section1/Subsection2')
35 |
36 | ### 使用示例
37 |
38 | ```bash
39 | # 导出单个文件
40 | mdctl export -f README.md -o output.docx
41 |
42 | # 导出整个目录
43 | mdctl export -d docs/ -o documentation.docx
44 |
45 | # 导出 MkDocs 站点
46 | mdctl export -d docs/ -s mkdocs -o site_docs.docx
47 |
48 | # 导出 Hugo 站点
49 | mdctl export -d content/ -s hugo -o hugo_docs.docx
50 |
51 | # 使用自定义模板
52 | mdctl export -d docs/ -o report.docx -t templates/corporate.docx
53 |
54 | # 指定标题层级偏移量
55 | mdctl export -d docs/ -o documentation.docx --shift-heading-level-by 2
56 |
57 | # 导出为 PDF 格式
58 | mdctl export -d docs/ -o documentation.pdf -F pdf
59 | ```
60 |
61 | ## 实现设计
62 |
63 | ### 整体架构
64 |
65 | 按照项目的现有结构,我们将在 `cmd/` 目录下创建 `export.go` 文件定义命令接口,在 `internal/` 目录下创建 `exporter/` 模块实现具体功能。
66 |
67 | ```
68 | mdctl/
69 | ├── cmd/
70 | │ └── export.go # 新增:export 命令定义
71 | ├── internal/
72 | │ └── exporter/ # 新增:导出功能实现
73 | │ ├── exporter.go # 导出器接口定义
74 | │ ├── pandoc.go # Pandoc 导出实现
75 | │ ├── merger.go # Markdown 合并实现
76 | │ ├── sitereader/ # 新增:不同文档系统的站点结构读取
77 | │ │ ├── reader.go # 站点读取器接口
78 | │ │ ├── mkdocs.go # MkDocs 站点读取
79 | │ │ ├── hugo.go # Hugo 站点读取
80 | │ │ └── docusaurus.go # Docusaurus 站点读取
81 | │ └── heading.go # 标题处理实现
82 | ```
83 |
84 | ### 核心组件
85 |
86 | #### 1. 命令处理器 (cmd/export.go)
87 |
88 | 负责解析命令行参数并调用导出功能。
89 |
90 | ```go
91 | var (
92 | exportFile string
93 | exportDir string
94 | siteType string
95 | configFile string
96 | exportOutput string
97 | exportTemplate string
98 | exportFormat string
99 | pandocPath string
100 | generateToc bool
101 | shiftHeadingLevelBy int
102 | fileAsTitle bool
103 |
104 | exportCmd = &cobra.Command{
105 | Use: "export",
106 | Short: "Export markdown files to other formats",
107 | Long: `...`,
108 | RunE: func(cmd *cobra.Command, args []string) error {
109 | // 参数验证和处理逻辑
110 | // 调用 internal/exporter 的功能
111 | },
112 | }
113 | )
114 | ```
115 |
116 | #### 2. 导出器接口 (internal/exporter/exporter.go)
117 |
118 | 定义导出功能的通用接口,支持扩展其他格式。
119 |
120 | ```go
121 | type Exporter interface {
122 | Export(input string, output string, options ExportOptions) error
123 | }
124 |
125 | type ExportOptions struct {
126 | Template string
127 | GenerateToc bool
128 | ShiftHeadingLevelBy int
129 | FileAsTitle bool
130 | Format string
131 | // 其他选项
132 | }
133 | ```
134 |
135 | #### 3. Pandoc 导出实现 (internal/exporter/pandoc.go)
136 |
137 | 使用 Pandoc 工具实现导出功能。
138 |
139 | ```go
140 | type PandocExporter struct {
141 | PandocPath string
142 | }
143 |
144 | func (e *PandocExporter) Export(input, output string, options ExportOptions) error {
145 | // 构建并执行 Pandoc 命令
146 | // 如果 pandoc 不可用,返回明确的错误提示
147 | }
148 | ```
149 |
150 | #### 4. 站点结构读取器 (internal/exporter/sitereader/)
151 |
152 | 负责识别和解析不同文档系统的站点结构。
153 |
154 | ```go
155 | // 站点读取器接口
156 | type SiteReader interface {
157 | // 检测给定目录是否为此类型的站点
158 | Detect(dir string) bool
159 |
160 | // 读取站点结构,返回按顺序排列的文件列表
161 | ReadStructure(dir string, configPath string) ([]string, error)
162 | }
163 |
164 | // 工厂函数,根据站点类型返回相应的读取器
165 | func GetSiteReader(siteType string) (SiteReader, error) {
166 | // 返回对应类型的读取器实现
167 | }
168 | ```
169 |
170 | #### 5. Markdown 合并器 (internal/exporter/merger.go)
171 |
172 | 负责合并多个 Markdown 文件。
173 |
174 | ```go
175 | type Merger struct {
176 | ShiftHeadingLevelBy int
177 | FileAsTitle bool
178 | }
179 |
180 | func (m *Merger) Merge(sources []string, target string) error {
181 | // 合并多个 Markdown 文件的逻辑
182 | // 自动处理标题层级
183 | }
184 | ```
185 |
186 | #### 6. 标题处理器 (internal/exporter/heading.go)
187 |
188 | 处理 Markdown 文件中的标题层级。
189 |
190 | ```go
191 | func ShiftHeadings(content string, levels int) string {
192 | // 调整标题层级的逻辑
193 | }
194 | ```
195 |
196 | ### 工作流程
197 |
198 | 1. **命令解析**:解析用户提供的命令行参数
199 | 2. **文件收集**:根据参数收集需要处理的 Markdown 文件
200 | - 单文件模式:直接使用指定文件
201 | - 目录模式:收集目录中的所有 Markdown 文件并按文件名排序
202 | - 站点模式:使用相应的站点读取器解析站点结构
203 | 3. **文件合并**:如果有多个文件,将它们合并为一个临时 Markdown 文件
204 | - 自动调整每个文件的标题层级
205 | - 可选添加文件名作为章节标题
206 | 4. **格式转换**:使用 Pandoc 将 Markdown 转换为目标格式
207 | - 应用用户指定的模板(如果有)
208 | - 生成目录(如果启用)
209 | 5. **输出处理**:将最终结果输出到用户指定的路径
210 |
211 | ## 标题层级处理策略
212 |
213 | 为了解决多文件合并时标题层级的问题,系统将自动处理标题层级:
214 |
215 | 1. 每个文件的标题层级将按照指定的偏移量调整:
216 | - H1 -> H(1+偏移量)
217 | - H2 -> H(2+偏移量)
218 | - ...
219 | - 如果调整后超过 H6,将转换为加粗文本 (**文本**)
220 |
221 | 2. 如果启用了文件名作为标题功能,会自动在每个文件内容前添加对应层级的标题
222 |
223 | 3. 系统会自动处理标题的相对层级关系,确保文档结构的逻辑性
224 |
225 | ## 依赖条件
226 |
227 | **Pandoc**:需要系统中安装 Pandoc 工具
228 | - 在执行导出命令时检查 Pandoc 是否可用
229 | - 如果找不到 Pandoc,提供明确的错误信息和安装指导
230 |
231 | ## 错误处理
232 |
233 | 1. Pandoc 不可用时提供明确的错误信息和安装指导
234 | 2. 文件不存在或无法访问时的错误处理
235 | 3. 合并过程中可能出现的格式问题处理
236 | 4. 模板文件异常的处理
237 | 5. 不支持的站点类型或配置文件处理
238 |
239 | ## 未来扩展
240 |
241 | 1. 增强模板管理功能,支持模板下载和更新
242 | 2. 支持更多的文档站点系统
243 | 3. 支持更复杂的文档结构处理,如自动生成封面、页眉页脚
244 | 4. 集成图表和公式渲染功能
--------------------------------------------------------------------------------
/internal/processor/processor.go:
--------------------------------------------------------------------------------
1 | package processor
2 |
import (
	"crypto/md5"
	"fmt"
	"io"
	"mime"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strings"
)
13 |
// Processor localizes remote images referenced by markdown files: it
// downloads them and rewrites the links to point at the local copies.
type Processor struct {
	SourceFile     string // single markdown file to process (takes precedence over SourceDir)
	SourceDir      string // directory to walk for .md/.markdown files
	ImageOutputDir string // explicit image output directory; empty means derive one per file
}
19 |
20 | func New(sourceFile, sourceDir, imageOutputDir string) *Processor {
21 | return &Processor{
22 | SourceFile: sourceFile,
23 | SourceDir: sourceDir,
24 | ImageOutputDir: imageOutputDir,
25 | }
26 | }
27 |
28 | func (p *Processor) Process() error {
29 | if p.SourceFile != "" {
30 | return p.processFile(p.SourceFile)
31 | }
32 | return p.processDirectory(p.SourceDir)
33 | }
34 |
35 | func (p *Processor) processDirectory(dir string) error {
36 | fmt.Printf("Processing directory: %s\n", dir)
37 | return filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
38 | if err != nil {
39 | return err
40 | }
41 | if !info.IsDir() && (strings.HasSuffix(path, ".md") || strings.HasSuffix(path, ".markdown")) {
42 | return p.processFile(path)
43 | }
44 | return nil
45 | })
46 | }
47 |
48 | func (p *Processor) processFile(filePath string) error {
49 | fmt.Printf("Processing file: %s\n", filePath)
50 | content, err := os.ReadFile(filePath)
51 | if err != nil {
52 | return fmt.Errorf("failed to read file %s: %v", filePath, err)
53 | }
54 |
55 | // Determine image output directory
56 | imgDir := p.determineImageDir(filePath)
57 | if err := os.MkdirAll(imgDir, 0755); err != nil {
58 | return fmt.Errorf("failed to create image directory %s: %v", imgDir, err)
59 | }
60 |
61 | // Find all image links
62 | imgRegex := regexp.MustCompile(`!\[([^\]]*)\]\(([^)]+)\)`)
63 | matches := imgRegex.FindAllStringSubmatch(string(content), -1)
64 |
65 | fmt.Printf("Found %d images in file %s\n", len(matches), filePath)
66 |
67 | newContent := string(content)
68 | for _, match := range matches {
69 | imgAlt := match[1]
70 | imgURL := match[2]
71 |
72 | // Replace image URL starting with "//" to "https://"
73 | if strings.HasPrefix(imgURL, "//") {
74 | imgURL = strings.Replace(imgURL, "//", "https://", 1)
75 | }
76 | // Skip local images
77 | if !strings.HasPrefix(imgURL, "http://") && !strings.HasPrefix(imgURL, "https://") {
78 | continue
79 | }
80 |
81 | // Download and save image
82 | localPath, err := p.downloadImage(imgURL, imgDir)
83 | if err != nil {
84 | fmt.Printf("Warning: Failed to download image %s: %v\n", imgURL, err)
85 | continue
86 | }
87 |
88 | // Calculate relative path
89 | relPath, err := filepath.Rel(filepath.Dir(filePath), localPath)
90 | if err != nil {
91 | fmt.Printf("Warning: Failed to calculate relative path: %v\n", err)
92 | continue
93 | }
94 |
95 | // Replace image link
96 | oldLink := fmt.Sprintf("", match[1], match[2])
97 | newLink := fmt.Sprintf("", imgAlt, relPath)
98 | newContent = strings.Replace(newContent, oldLink, newLink, 1)
99 | }
100 |
101 | // Write back to file
102 | if err := os.WriteFile(filePath, []byte(newContent), 0644); err != nil {
103 | return fmt.Errorf("failed to write file %s: %v", filePath, err)
104 | }
105 |
106 | return nil
107 | }
108 |
109 | func (p *Processor) determineImageDir(filePath string) string {
110 | if p.ImageOutputDir != "" {
111 | return p.ImageOutputDir
112 | }
113 | if p.SourceDir != "" {
114 | return filepath.Join(p.SourceDir, "images")
115 | }
116 | return filepath.Join(filepath.Dir(filePath), "images")
117 | }
118 |
119 | func (p *Processor) downloadImage(url string, destDir string) (string, error) {
120 | resp, err := http.Get(url)
121 | if err != nil {
122 | return "", err
123 | }
124 | defer resp.Body.Close()
125 |
126 | // Get filename from URL or Content-Disposition
127 | filename := getFilenameFromURL(url, resp)
128 |
129 | // If no extension, try to get from Content-Type
130 | if filepath.Ext(filename) == "" {
131 | contentType := resp.Header.Get("Content-Type")
132 | ext := getExtensionFromContentType(contentType)
133 | if ext != "" {
134 | filename += ext
135 | }
136 | }
137 |
138 | // Ensure filename is unique
139 | hash := md5.New()
140 | io.WriteString(hash, url)
141 | urlHash := fmt.Sprintf("%x", hash.Sum(nil))[:8]
142 |
143 | ext := filepath.Ext(filename)
144 | basename := strings.TrimSuffix(filename, ext)
145 | filename = fmt.Sprintf("%s_%s%s", basename, urlHash, ext)
146 |
147 | localPath := filepath.Join(destDir, filename)
148 |
149 | // Create target file
150 | out, err := os.Create(localPath)
151 | if err != nil {
152 | return "", err
153 | }
154 | defer out.Close()
155 |
156 | // Write to file
157 | _, err = io.Copy(out, resp.Body)
158 | if err != nil {
159 | return "", err
160 | }
161 |
162 | fmt.Printf("Downloaded image to: %s\n", localPath)
163 | return localPath, nil
164 | }
165 |
// getFilenameFromURL derives a filename for a downloaded image, preferring
// the Content-Disposition header and falling back to the last URL path
// segment; returns "image" when neither yields a usable name.
func getFilenameFromURL(url string, resp *http.Response) string {
	if cd := resp.Header.Get("Content-Disposition"); cd != "" {
		// Parse the header properly so parameters after the filename
		// (e.g. `filename="a.jpg"; size=5`) are not glued onto the
		// name, as the naive "filename=" split used to do.
		if _, params, err := mime.ParseMediaType(cd); err == nil {
			if filename := params["filename"]; filename != "" {
				return filename
			}
		}
		// Fallback for malformed headers: the original manual split.
		if strings.Contains(cd, "filename=") {
			parts := strings.Split(cd, "filename=")
			if len(parts) > 1 {
				filename := strings.Trim(parts[1], `"'`)
				if filename != "" {
					return filename
				}
			}
		}
	}

	// Get from URL path
	parts := strings.Split(url, "/")
	if len(parts) > 0 {
		filename := parts[len(parts)-1]
		// Remove URL query parameters
		if idx := strings.Index(filename, "?"); idx != -1 {
			filename = filename[:idx]
		}
		// Remove CDN-style "@variant" suffixes that follow the
		// extension, e.g. "img.png@small" -> "img.png".
		if idx := strings.LastIndex(filename, "@"); idx != -1 {
			if idx > strings.LastIndex(filename, ".") {
				filename = filename[:idx]
			}
		}
		if filename != "" {
			return filename
		}
	}

	// Use default name
	return "image"
}
202 |
// getExtensionFromContentType maps an image MIME type to a file extension,
// or "" when the type is unknown. The media type is normalized first:
// parameters (e.g. "; charset=binary") are stripped and matching is
// case-insensitive, both of which the previous exact-match switch missed.
func getExtensionFromContentType(contentType string) string {
	// Drop media-type parameters and normalize case/whitespace.
	if idx := strings.Index(contentType, ";"); idx != -1 {
		contentType = contentType[:idx]
	}
	contentType = strings.ToLower(strings.TrimSpace(contentType))

	switch contentType {
	case "image/jpeg", "image/jpg":
		return ".jpg"
	case "image/png":
		return ".png"
	case "image/gif":
		return ".gif"
	case "image/webp":
		return ".webp"
	default:
		return ""
	}
}
217 |
--------------------------------------------------------------------------------
/internal/linter/linter.go:
--------------------------------------------------------------------------------
1 | package linter
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "os"
7 | "strings"
8 |
9 | "github.com/samzong/mdctl/internal/markdownfmt"
10 | )
11 |
// Config holds the linter configuration
type Config struct {
	AutoFix      bool     // rewrite files in place with auto-fixes (a .orig backup is made first)
	OutputFormat string   // output format selector (consumer not visible in this file)
	RulesFile    string   // optional rules config path; empty triggers the default config search
	EnableRules  []string // when non-empty, only these rule IDs stay enabled
	DisableRules []string // rule IDs to disable
	Verbose      bool     // emit warnings, e.g. for an unreadable rules file
}
21 |
// Issue represents a linting issue
type Issue struct {
	Line    int    `json:"line"`              // line number the issue was reported on
	Column  int    `json:"column,omitempty"`  // optional column position
	Rule    string `json:"rule"`              // rule ID, e.g. "MD009"
	Message string `json:"message"`           // human-readable description
	Context string `json:"context,omitempty"` // offending source excerpt, when available
	Fixed   bool   `json:"fixed,omitempty"`   // set when auto-fix addressed this issue
}
31 |
// Result holds the linting results for a file
type Result struct {
	Filename   string   `json:"filename"`    // the file that was linted
	Issues     []*Issue `json:"issues"`      // all issues reported by enabled rules
	FixedCount int      `json:"fixed_count"` // number of fixes applied when auto-fix ran
}
38 |
// Linter performs markdown linting
type Linter struct {
	config    *Config                // runtime options (auto-fix, verbosity, rule selection)
	rules     *RuleSet               // enabled rule set, possibly filtered by config
	formatter *markdownfmt.Formatter // general formatter applied during auto-fix
	fixer     *Fixer                 // rule-specific auto-fixer
}
46 |
47 | // New creates a new linter instance
48 | func New(config *Config) *Linter {
49 | rules := NewRuleSet()
50 |
51 | // Load configuration file if specified
52 | if config.RulesFile != "" {
53 | if configFile, err := LoadConfigFile(config.RulesFile); err == nil {
54 | configFile.ApplyToRuleSet(rules)
55 | } else if config.Verbose {
56 | fmt.Printf("Warning: Could not load rules file %s: %v\n", config.RulesFile, err)
57 | }
58 | } else {
59 | // Try to find and load default config file
60 | if configFile, err := LoadConfigFile(""); err == nil {
61 | configFile.ApplyToRuleSet(rules)
62 | }
63 | }
64 |
65 | // Apply rule configuration from command line
66 | if len(config.EnableRules) > 0 {
67 | rules.EnableOnly(config.EnableRules)
68 | }
69 |
70 | if len(config.DisableRules) > 0 {
71 | rules.Disable(config.DisableRules)
72 | }
73 |
74 | return &Linter{
75 | config: config,
76 | rules: rules,
77 | formatter: markdownfmt.New(true), // Enable formatter for auto-fix
78 | fixer: NewFixer(),
79 | }
80 | }
81 |
82 | // LintFile lints a single markdown file
83 | func (l *Linter) LintFile(filename string) (*Result, error) {
84 | // Check file size limit (10MB)
85 | const maxFileSize = 10 * 1024 * 1024
86 | if info, err := os.Stat(filename); err == nil {
87 | if info.Size() > maxFileSize {
88 | return nil, fmt.Errorf("file too large: %s (max %d bytes)", filename, maxFileSize)
89 | }
90 | }
91 |
92 | content, err := os.ReadFile(filename)
93 | if err != nil {
94 | return nil, fmt.Errorf("failed to read file: %v", err)
95 | }
96 |
97 | return l.LintContent(filename, string(content))
98 | }
99 |
// LintContent lints markdown content associated with filename. When
// auto-fix is enabled and fixes were applied, the file on disk is rewritten
// with the fixed content after a ".orig" backup is created.
func (l *Linter) LintContent(filename, content string) (*Result, error) {
	result := &Result{
		Filename: filename,
		Issues:   []*Issue{},
	}

	lines := strings.Split(content, "\n")

	// Apply all enabled rules
	for _, rule := range l.rules.GetEnabledRules() {
		issues := rule.Check(lines)
		result.Issues = append(result.Issues, issues...)
	}

	// Apply auto-fix if requested
	if l.config.AutoFix && len(result.Issues) > 0 {
		fixedContent, fixedCount := l.applyFixes(content, result.Issues)
		result.FixedCount = fixedCount

		// Write fixed content back to file with backup
		if fixedCount > 0 {
			// Create backup before modifying the file
			if err := l.createBackup(filename); err != nil {
				return nil, fmt.Errorf("failed to create backup: %v", err)
			}

			if err := os.WriteFile(filename, []byte(fixedContent), 0644); err != nil {
				return nil, fmt.Errorf("failed to write fixed content: %v", err)
			}

			// Mark issues as fixed. NOTE(review): this marks every issue
			// except MD013 as fixed, including issues no fixer handles,
			// so the reported fixed state may overstate what changed.
			for _, issue := range result.Issues {
				if issue.Rule != "MD013" { // Don't mark line length issues as fixed automatically
					issue.Fixed = true
				}
			}
		}
	}

	return result, nil
}
142 |
143 | // applyFixes applies automatic fixes to the content
144 | func (l *Linter) applyFixes(content string, issues []*Issue) (string, int) {
145 | // Use the dedicated fixer for rule-specific fixes
146 | fixedContent, fixedCount := l.fixer.ApplyFixes(content, issues)
147 |
148 | // Then apply general formatting fixes
149 | finalContent := l.formatter.Format(fixedContent)
150 |
151 | // If formatter made additional changes, count them
152 | if finalContent != fixedContent && fixedCount == 0 {
153 | fixedCount = l.countFixableIssues(issues)
154 | }
155 |
156 | return finalContent, fixedCount
157 | }
158 |
159 | // createBackup creates a backup of the file before modification
160 | func (l *Linter) createBackup(filename string) error {
161 | backupFilename := filename + ".orig"
162 |
163 | // Open source file
164 | src, err := os.Open(filename)
165 | if err != nil {
166 | return fmt.Errorf("failed to open source file: %v", err)
167 | }
168 | defer src.Close()
169 |
170 | // Create backup file
171 | dst, err := os.Create(backupFilename)
172 | if err != nil {
173 | return fmt.Errorf("failed to create backup file: %v", err)
174 | }
175 | defer dst.Close()
176 |
177 | // Copy content
178 | _, err = io.Copy(dst, src)
179 | if err != nil {
180 | return fmt.Errorf("failed to copy content to backup: %v", err)
181 | }
182 |
183 | return nil
184 | }
185 |
186 | // countFixableIssues counts how many issues can be automatically fixed
187 | func (l *Linter) countFixableIssues(issues []*Issue) int {
188 | fixableRules := map[string]bool{
189 | "MD009": true, // Trailing spaces
190 | "MD010": true, // Hard tabs
191 | "MD012": true, // Multiple consecutive blank lines
192 | "MD018": true, // No space after hash on atx style heading
193 | "MD019": true, // Multiple spaces after hash on atx style heading
194 | "MD023": true, // Headings must start at the beginning of the line
195 | "MD047": true, // Files should end with a single newline character
196 | }
197 |
198 | count := 0
199 | for _, issue := range issues {
200 | if fixableRules[issue.Rule] {
201 | count++
202 | }
203 | }
204 | return count
205 | }
206 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI=
2 | github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
3 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
4 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
5 | github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk=
6 | github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
7 | github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
8 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
9 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
10 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
11 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
12 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
13 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
14 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
15 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
16 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
17 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
18 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
19 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
20 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
21 | github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
22 | github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
23 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
24 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
25 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
26 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
27 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
28 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
29 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
30 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
31 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
32 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
33 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
34 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
35 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
36 | golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
37 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
38 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
39 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
40 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
41 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
42 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
43 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
44 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
45 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
46 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
47 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
48 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
49 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
50 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
51 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
52 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
53 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
54 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
55 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
56 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
57 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
58 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
59 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
60 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
61 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
62 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
63 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
64 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
65 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
66 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
67 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
68 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
69 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
70 |
--------------------------------------------------------------------------------
/internal/linter/linter_test.go:
--------------------------------------------------------------------------------
1 | package linter
2 |
3 | import (
4 | "os"
5 | "testing"
6 | )
7 |
// TestLinter_LintContent feeds small markdown snippets to LintContent
// and checks both the total number of reported issues and that every
// expected rule ID appears among them.
func TestLinter_LintContent(t *testing.T) {
	tests := []struct {
		name        string
		content     string
		expectRules []string // Expected rule IDs that should trigger
		expectCount int      // Expected number of issues
	}{
		{
			name:        "valid markdown",
			content:     "# Title\n\nThis is valid markdown.\n",
			expectRules: []string{},
			expectCount: 0,
		},
		{
			name:        "trailing spaces",
			content:     "# Title \n\nContent with trailing spaces. \n",
			expectRules: []string{"MD009"},
			expectCount: 2,
		},
		{
			name:        "hard tabs",
			content:     "# Title\n\n\tContent with hard tab.\n",
			expectRules: []string{"MD010"},
			expectCount: 1,
		},
		{
			name:        "multiple blank lines",
			content:     "# Title\n\n\n\nContent after multiple blank lines.\n",
			expectRules: []string{"MD012"},
			expectCount: 2, // MD012 triggers for each set of consecutive blank lines
		},
		{
			name:        "no space after hash",
			content:     "#Title\n\nContent.\n",
			expectRules: []string{"MD018"},
			expectCount: 1,
		},
		{
			name:        "multiple spaces after hash",
			content:     "# Title\n\nContent.\n",
			expectRules: []string{"MD019"},
			expectCount: 1,
		},
		{
			name:        "heading not at start of line",
			content:     "Some text\n # Title\n\nContent.\n",
			expectRules: []string{"MD023"},
			expectCount: 1,
		},
		{
			name:        "list without blank line before",
			content:     "# Title\nSome text\n- List item\n\nContent.\n",
			expectRules: []string{"MD032"},
			expectCount: 1,
		},
		{
			name:        "list without blank line after",
			content:     "# Title\n\n- List item\nSome text\n",
			expectRules: []string{"MD032"},
			expectCount: 1,
		},
		{
			name:        "file not ending with newline",
			content:     "# Title\n\nContent without final newline",
			expectRules: []string{"MD047"},
			expectCount: 1,
		},
		{
			name:        "file ending with multiple newlines",
			content:     "# Title\n\nContent.\n\n",
			expectRules: []string{"MD047", "MD012"},
			expectCount: 2, // Both MD047 and MD012 trigger
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Zero-value Config: no auto-fix, default rule set.
			linter := New(&Config{})
			result, err := linter.LintContent("test.md", tt.content)

			if err != nil {
				t.Fatalf("LintContent failed: %v", err)
			}

			if len(result.Issues) != tt.expectCount {
				t.Errorf("Expected %d issues, got %d", tt.expectCount, len(result.Issues))
				// Dump what actually triggered to ease debugging failures.
				for _, issue := range result.Issues {
					t.Logf("Issue: %s - %s", issue.Rule, issue.Message)
				}
			}

			// Check that expected rules are triggered (count match alone
			// would not catch one rule firing in place of another).
			foundRules := make(map[string]bool)
			for _, issue := range result.Issues {
				foundRules[issue.Rule] = true
			}

			for _, expectedRule := range tt.expectRules {
				if !foundRules[expectedRule] {
					t.Errorf("Expected rule %s to be triggered, but it wasn't", expectedRule)
				}
			}
		})
	}
}
113 |
// TestLinter_AutoFix runs LintFile with AutoFix enabled on temp files
// and verifies the reported fix count and that a ".orig" backup file is
// created next to the input.
func TestLinter_AutoFix(t *testing.T) {
	tests := []struct {
		name           string
		content        string
		expectFixed    bool
		expectFixCount int
		expectRules    []string
	}{
		{
			name:           "fix trailing spaces",
			content:        "# Title \n\nContent with trailing spaces. \n",
			expectFixed:    true,
			expectFixCount: 2,
			expectRules:    []string{"MD009"},
		},
		{
			name:           "fix hard tabs",
			content:        "# Title\n\n\tContent with hard tab.\n",
			expectFixed:    true,
			expectFixCount: 1,
			expectRules:    []string{"MD010"},
		},
		{
			name:           "fix multiple blank lines",
			content:        "# Title\n\n\n\nContent after multiple blank lines.\n",
			expectFixed:    true,
			expectFixCount: 2, // MD012 triggers multiple times
			expectRules:    []string{"MD012"},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Create a temporary file
			tmpFile, err := os.CreateTemp("", "test_*.md")
			if err != nil {
				t.Fatalf("Failed to create temp file: %v", err)
			}
			defer os.Remove(tmpFile.Name())
			defer os.Remove(tmpFile.Name() + ".orig") // Remove backup file

			// Write content to temp file
			if _, err := tmpFile.WriteString(tt.content); err != nil {
				t.Fatalf("Failed to write to temp file: %v", err)
			}
			tmpFile.Close()

			// Run linter with auto-fix
			linter := New(&Config{AutoFix: true})
			result, err := linter.LintFile(tmpFile.Name())

			if err != nil {
				t.Fatalf("LintFile failed: %v", err)
			}

			if tt.expectFixed && result.FixedCount != tt.expectFixCount {
				t.Errorf("Expected %d fixes, got %d", tt.expectFixCount, result.FixedCount)
			}

			// Check that backup file was created
			if tt.expectFixed {
				if _, err := os.Stat(tmpFile.Name() + ".orig"); os.IsNotExist(err) {
					t.Error("Expected backup file to be created, but it wasn't")
				}
			}
		})
	}
}
182 |
183 | func TestLinter_BackupCreation(t *testing.T) {
184 | // Create a temporary file
185 | tmpFile, err := os.CreateTemp("", "test_*.md")
186 | if err != nil {
187 | t.Fatalf("Failed to create temp file: %v", err)
188 | }
189 | defer os.Remove(tmpFile.Name())
190 | defer os.Remove(tmpFile.Name() + ".orig")
191 |
192 | originalContent := "# Title \n\nContent with trailing spaces. \n"
193 | if _, err := tmpFile.WriteString(originalContent); err != nil {
194 | t.Fatalf("Failed to write to temp file: %v", err)
195 | }
196 | tmpFile.Close()
197 |
198 | // Run linter with auto-fix
199 | linter := New(&Config{AutoFix: true})
200 | _, err = linter.LintFile(tmpFile.Name())
201 |
202 | if err != nil {
203 | t.Fatalf("LintFile failed: %v", err)
204 | }
205 |
206 | // Check that backup file exists and contains original content
207 | backupContent, err := os.ReadFile(tmpFile.Name() + ".orig")
208 | if err != nil {
209 | t.Fatalf("Failed to read backup file: %v", err)
210 | }
211 |
212 | if string(backupContent) != originalContent {
213 | t.Errorf("Backup content doesn't match original.\nExpected: %q\nGot: %q", originalContent, string(backupContent))
214 | }
215 | }
216 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Binary name and build metadata.
BINARY=mdctl
# := (simple expansion) evaluates the shell command once at parse time;
# with plain = the `git describe` / `date` / `sed` subshells would re-run
# on every single reference to the variable.
VERSION:=$(shell git describe --tags || echo "unknown version")
BUILDTIME:=$(shell date -u)
# Static, reproducible build with version info injected via -ldflags.
GOBUILD=CGO_ENABLED=0 go build -trimpath -ldflags '-X "github.com/samzong/mdctl/cmd.Version=$(VERSION)" -X "github.com/samzong/mdctl/cmd.BuildTime=$(BUILDTIME)"'

# Homebrew related variables
CLEAN_VERSION:=$(shell echo $(VERSION) | sed 's/^v//')
DOWNLOAD_URL=https://github.com/samzong/mdctl/releases/download/$(VERSION)/mdctl-$(CLEAN_VERSION)-darwin-amd64.tar.gz
HOMEBREW_TAP_REPO=homebrew-tap
FORMULA_FILE=Formula/mdctl.rb
BRANCH_NAME=update-mdctl-$(CLEAN_VERSION)

# Adjust architecture definitions to match goreleaser output
SUPPORTED_ARCHS = Darwin_x86_64 Darwin_arm64 Linux_x86_64 Linux_arm64
15 |
.PHONY: deps
# deps: download and verify Go module dependencies.
deps:
	@echo "Installing Go dependencies..."
	go mod download
	go mod verify

.PHONY: build
# build: compile the binary into bin/ with version metadata (see GOBUILD).
build: deps
	$(GOBUILD) -o bin/$(BINARY)

.PHONY: test
# test: run the full test suite verbosely.
test:
	go test -v ./...

.PHONY: clean
# clean: remove build artifacts and installed packages/object files.
clean:
	rm -rf bin/
	go clean -i ./...

.PHONY: fmt
# fmt: format sources and tidy go.mod/go.sum.
fmt:
	go fmt ./...
	go mod tidy

.PHONY: all
# all: full local pipeline - clean, format, build, then test.
all: clean fmt build test
42 |
.PHONY: update-homebrew
# update-homebrew: bump the Homebrew tap formula to the current VERSION.
# Requires GH_PAT (a GitHub token with push access to the tap repo).
# Set DRY_RUN=1 to print what would happen without downloading, editing,
# or opening a pull request.
update-homebrew:
	@echo "==> Starting Homebrew formula update process..."
	@if [ -z "$(GH_PAT)" ]; then \
		echo "❌ Error: GH_PAT environment variable is required"; \
		exit 1; \
	fi

	@echo "==> Current version information:"
	@echo "  - VERSION: $(VERSION)"
	@echo "  - CLEAN_VERSION: $(CLEAN_VERSION)"

	@echo "==> Preparing working directory..."
	@rm -rf tmp && mkdir -p tmp

# Clone via HTTPS with the token embedded so the later push needs no prompt.
	@echo "==> Cloning Homebrew tap repository..."
	@cd tmp && git clone https://$(GH_PAT)@github.com/samzong/$(HOMEBREW_TAP_REPO).git
	@cd tmp/$(HOMEBREW_TAP_REPO) && echo "  - Creating new branch: $(BRANCH_NAME)" && git checkout -b $(BRANCH_NAME)

# One shell invocation for the whole pipeline (backslash continuations) so
# the per-arch SHA variables survive into the sed/PR steps below.
	@echo "==> Processing architectures and calculating checksums..."
	@cd tmp/$(HOMEBREW_TAP_REPO) && \
	for arch in $(SUPPORTED_ARCHS); do \
		echo "  - Processing $$arch..."; \
		if [ "$(DRY_RUN)" = "1" ]; then \
			echo "    [DRY_RUN] Would download: https://github.com/samzong/mdctl/releases/download/v$(CLEAN_VERSION)/mdctl_$${arch}.tar.gz"; \
			case "$$arch" in \
				Darwin_x86_64) DARWIN_AMD64_SHA="fake_sha_amd64" ;; \
				Darwin_arm64) DARWIN_ARM64_SHA="fake_sha_arm64" ;; \
				Linux_x86_64) LINUX_AMD64_SHA="fake_sha_linux_amd64" ;; \
				Linux_arm64) LINUX_ARM64_SHA="fake_sha_linux_arm64" ;; \
			esac; \
		else \
			echo "    - Downloading release archive..."; \
			curl -L -sSfO "https://github.com/samzong/mdctl/releases/download/v$(CLEAN_VERSION)/mdctl_$${arch}.tar.gz" || { echo "❌ Failed to download $$arch archive"; exit 1; }; \
			echo "    - Calculating SHA256..."; \
			sha=$$(shasum -a 256 "mdctl_$${arch}.tar.gz" | cut -d' ' -f1); \
			case "$$arch" in \
				Darwin_x86_64) DARWIN_AMD64_SHA="$$sha"; echo "    ✓ Darwin AMD64 SHA: $$sha" ;; \
				Darwin_arm64) DARWIN_ARM64_SHA="$$sha"; echo "    ✓ Darwin ARM64 SHA: $$sha" ;; \
				Linux_x86_64) LINUX_AMD64_SHA="$$sha"; echo "    ✓ Linux AMD64 SHA: $$sha" ;; \
				Linux_arm64) LINUX_ARM64_SHA="$$sha"; echo "    ✓ Linux ARM64 SHA: $$sha" ;; \
			esac; \
		fi; \
	done; \
	\
	if [ "$(DRY_RUN)" = "1" ]; then \
		echo "==> [DRY_RUN] Would update formula with:"; \
		echo "  - Darwin AMD64 SHA: $$DARWIN_AMD64_SHA"; \
		echo "  - Darwin ARM64 SHA: $$DARWIN_ARM64_SHA"; \
		echo "  - Linux AMD64 SHA: $$LINUX_AMD64_SHA"; \
		echo "  - Linux ARM64 SHA: $$LINUX_ARM64_SHA"; \
		echo "  - Would commit and push changes"; \
		echo "  - Would create PR"; \
	else \
		echo "==> Updating formula file..."; \
		echo "  - Updating version to $(CLEAN_VERSION)"; \
		sed -i '' -e 's|version ".*"|version "$(CLEAN_VERSION)"|' $(FORMULA_FILE); \
		\
		echo "  - Updating URLs and checksums"; \
		sed -i '' \
			-e '/on_macos/,/end/ { \
				/if Hardware::CPU.arm?/,/else/ { \
					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Darwin_arm64.tar.gz"|; \
					s|sha256 ".*"|sha256 "'"$$DARWIN_ARM64_SHA"'"|; \
				}; \
				/else/,/end/ { \
					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Darwin_x86_64.tar.gz"|; \
					s|sha256 ".*"|sha256 "'"$$DARWIN_AMD64_SHA"'"|; \
				}; \
			}' \
			-e '/on_linux/,/end/ { \
				/if Hardware::CPU.arm?/,/else/ { \
					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Linux_arm64.tar.gz"|; \
					s|sha256 ".*"|sha256 "'"$$LINUX_ARM64_SHA"'"|; \
				}; \
				/else/,/end/ { \
					s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Linux_x86_64.tar.gz"|; \
					s|sha256 ".*"|sha256 "'"$$LINUX_AMD64_SHA"'"|; \
				}; \
			}' $(FORMULA_FILE); \
		\
		echo "  - Checking for changes..."; \
		if ! git diff --quiet $(FORMULA_FILE); then \
			echo "==> Changes detected, creating pull request..."; \
			echo "  - Adding changes to git"; \
			git add $(FORMULA_FILE); \
			echo "  - Committing changes"; \
			git commit -m "chore: bump to $(VERSION)"; \
			echo "  - Pushing to remote"; \
			git push -u origin $(BRANCH_NAME); \
			echo "  - Preparing pull request data"; \
			pr_data=$$(jq -n \
				--arg title "chore: update mdctl to $(VERSION)" \
				--arg body "Auto-generated PR\nSHAs:\n- Darwin(amd64): $$DARWIN_AMD64_SHA\n- Darwin(arm64): $$DARWIN_ARM64_SHA" \
				--arg head "$(BRANCH_NAME)" \
				--arg base "main" \
				'{title: $$title, body: $$body, head: $$head, base: $$base}'); \
			echo "  - Creating pull request"; \
			curl -X POST \
				-H "Authorization: token $(GH_PAT)" \
				-H "Content-Type: application/json" \
				https://api.github.com/repos/samzong/$(HOMEBREW_TAP_REPO)/pulls \
				-d "$$pr_data"; \
			echo "✅ Pull request created successfully"; \
		else \
			echo "❌ No changes detected in formula file"; \
			exit 1; \
		fi; \
	fi

	@echo "==> Cleaning up temporary files..."
	@rm -rf tmp
	@echo "✅ Homebrew formula update process completed"
156 |
.PHONY: help
# help: list available targets; also the default goal when make is run bare.
help:
	@echo "Usage: make <target>"
	@echo "Targets:"
	@echo "  deps: Install Go dependencies"
	@echo "  build: Build the binary"
	@echo "  test: Run tests"
	@echo "  clean: Clean up build artifacts"
	@echo "  fmt: Format the code"
	@echo "  all: Clean, format, build, and test"
	@echo "  update-homebrew: Update Homebrew formula (requires GH_PAT)"

.DEFAULT_GOAL := help
170 |
--------------------------------------------------------------------------------
/internal/llmstxt/extractor.go:
--------------------------------------------------------------------------------
1 | package llmstxt
2 |
import (
	"fmt"
	"net/http"
	"net/url"
	"path"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
)
12 |
13 | // Extract page information from HTML content
14 | func (g *Generator) extractPageInfo(urlStr string, resp *http.Response) (PageInfo, error) {
15 | // Create PageInfo object
16 | pageInfo := PageInfo{
17 | URL: urlStr,
18 | Section: parseSection(urlStr),
19 | }
20 |
21 | // Parse HTML
22 | doc, err := goquery.NewDocumentFromReader(resp.Body)
23 | if err != nil {
24 | return pageInfo, err
25 | }
26 |
27 | // Extract title
28 | pageInfo.Title = extractTitle(doc)
29 | if g.config.VeryVerbose {
30 | g.logger.Printf("Extracted title from %s: %s", urlStr, pageInfo.Title)
31 | }
32 |
33 | if pageInfo.Title == "" {
34 | // If title cannot be extracted, use the last segment of the URL as the title
35 | pageInfo.Title = extractTitleFromURL(urlStr)
36 | if g.config.VeryVerbose {
37 | g.logger.Printf("Could not extract title, using URL-based title instead: %s", pageInfo.Title)
38 | }
39 | }
40 |
41 | // Extract description
42 | pageInfo.Description = extractDescription(doc)
43 | if g.config.VeryVerbose {
44 | g.logger.Printf("Extracted description from %s: %s", urlStr, truncateString(pageInfo.Description, 100))
45 | }
46 |
47 | // Extract content in full mode
48 | if g.config.FullMode {
49 | if g.config.VeryVerbose {
50 | g.logger.Printf("Extracting full content from %s", urlStr)
51 | }
52 | pageInfo.Content = extractContent(doc)
53 | if g.config.VeryVerbose {
54 | contentLen := len(pageInfo.Content)
55 | preview := truncateString(pageInfo.Content, 100)
56 | g.logger.Printf("Extracted content from %s (%d chars): %s", urlStr, contentLen, preview)
57 | }
58 | }
59 |
60 | return pageInfo, nil
61 | }
62 |
// truncateString trims surrounding whitespace and shortens s to at most
// maxLen bytes, appending "..." when anything was cut. The cut position
// is backed off to a rune boundary so the result is always valid UTF-8
// (a plain byte slice could split a multi-byte rune and emit garbage
// into the logs that use this helper).
func truncateString(s string, maxLen int) string {
	s = strings.TrimSpace(s)
	if len(s) <= maxLen {
		return s
	}
	cut := maxLen
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut] + "..."
}
71 |
// parseSection derives a section label from a URL: the first segment of
// its path, or "ROOT" when the URL is unparsable or has an empty path.
func parseSection(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return "ROOT"
	}

	trimmed := strings.Trim(parsed.Path, "/")
	if trimmed == "" {
		return "ROOT"
	}

	// Only the leading segment matters; cut at the first separator.
	if idx := strings.IndexByte(trimmed, '/'); idx >= 0 {
		return trimmed[:idx]
	}
	return trimmed
}
91 |
92 | // Extract title from HTML document
93 | func extractTitle(doc *goquery.Document) string {
94 | // Try to extract from title tag
95 | title := doc.Find("title").First().Text()
96 | title = strings.TrimSpace(title)
97 |
98 | // If no title tag, try to extract from h1 tag
99 | if title == "" {
100 | title = doc.Find("h1").First().Text()
101 | title = strings.TrimSpace(title)
102 | }
103 |
104 | return title
105 | }
106 |
// extractTitleFromURL derives a human-readable title from a URL: the
// last path segment with its extension stripped and hyphens/underscores
// turned into spaces, title-cased. Falls back to the hostname for root
// paths and to the raw input when the URL does not parse.
func extractTitleFromURL(urlStr string) string {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	// Get the last segment of the path
	basename := path.Base(parsedURL.Path)

	// Remove file extension
	basename = strings.TrimSuffix(basename, path.Ext(basename))

	// If basename is empty or is "/", use hostname
	if basename == "" || basename == "." || basename == "/" {
		return parsedURL.Hostname()
	}

	// Replace hyphens and underscores with spaces
	basename = strings.ReplaceAll(basename, "-", " ")
	basename = strings.ReplaceAll(basename, "_", " ")

	// strings.Title is deprecated (it mishandles Unicode word boundaries);
	// capitalize the first rune of each whitespace-separated word instead.
	// Note this also collapses runs of spaces, which is desirable for a title.
	words := strings.Fields(basename)
	for i, w := range words {
		r := []rune(w)
		r[0] = unicode.ToUpper(r[0])
		words[i] = string(r)
	}
	return strings.Join(words, " ")
}
132 |
133 | // Extract description from HTML document
134 | func extractDescription(doc *goquery.Document) string {
135 | var description string
136 |
137 | // Try meta description
138 | description, _ = doc.Find("meta[name='description']").Attr("content")
139 | if description != "" {
140 | return strings.TrimSpace(description)
141 | }
142 |
143 | // Try og:description
144 | description, _ = doc.Find("meta[property='og:description']").Attr("content")
145 | if description != "" {
146 | return strings.TrimSpace(description)
147 | }
148 |
149 | // Try twitter:description
150 | description, _ = doc.Find("meta[name='twitter:description']").Attr("content")
151 | if description != "" {
152 | return strings.TrimSpace(description)
153 | }
154 |
155 | // If none found, extract first text
156 | description = doc.Find("p").First().Text()
157 | if description != "" {
158 | // Limit length
159 | if len(description) > 200 {
160 | description = description[:197] + "..."
161 | }
162 | return strings.TrimSpace(description)
163 | }
164 |
165 | return "No description available"
166 | }
167 |
// extractContent converts the page's main content area into a markdown
// string. It walks headings, paragraphs, blockquotes and lists inside
// the first matching content container (falling back to <body>) and
// caps the result at roughly 10000 bytes, preferring to cut at a
// paragraph boundary.
func extractContent(doc *goquery.Document) string {
	var content strings.Builder

	// Try to find main content area
	mainContent := doc.Find("article, main, #content, .content, .post-content").First()

	// If no specific content area found, use body
	if mainContent.Length() == 0 {
		mainContent = doc.Find("body")
	}

	// Walk block-level elements in document order and emit markdown.
	mainContent.Find("p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote").Each(func(i int, s *goquery.Selection) {
		// Get tag name
		tagName := goquery.NodeName(s)
		text := strings.TrimSpace(s.Text())

		// Skip elements with no visible text.
		if text == "" {
			return
		}

		// Format according to tag type
		switch tagName {
		case "h1":
			content.WriteString("# " + text + "\n\n")
		case "h2":
			content.WriteString("## " + text + "\n\n")
		case "h3":
			content.WriteString("### " + text + "\n\n")
		case "h4":
			content.WriteString("#### " + text + "\n\n")
		case "h5":
			content.WriteString("##### " + text + "\n\n")
		case "h6":
			content.WriteString("###### " + text + "\n\n")
		case "p":
			content.WriteString(text + "\n\n")
		case "blockquote":
			content.WriteString("> " + text + "\n\n")
		case "ul", "ol":
			// One markdown item per <li>; ordered lists are numbered by
			// the item's position within this list element.
			s.Find("li").Each(func(j int, li *goquery.Selection) {
				liText := strings.TrimSpace(li.Text())
				if liText != "" {
					if tagName == "ul" {
						content.WriteString("- " + liText + "\n")
					} else {
						content.WriteString(fmt.Sprintf("%d. %s\n", j+1, liText))
					}
				}
			})
			content.WriteString("\n")
		}
	})

	// Limit content length (bytes, not runes). Prefer cutting at the last
	// paragraph break before the limit. NOTE(review): the hard-cut
	// fallback at byte 10000 could split a multi-byte rune — confirm
	// whether that matters for consumers of this text.
	contentStr := content.String()
	if len(contentStr) > 10000 {
		// Find last paragraph end position
		lastParaEnd := strings.LastIndex(contentStr[:10000], "\n\n")
		if lastParaEnd == -1 {
			lastParaEnd = 10000
		}
		contentStr = contentStr[:lastParaEnd] + "\n\n... (content truncated)"
	}

	return contentStr
}
236 |
--------------------------------------------------------------------------------
/internal/llmstxt/sitemap.go:
--------------------------------------------------------------------------------
1 | package llmstxt
2 |
3 | import (
4 | "encoding/xml"
5 | "fmt"
6 | "io"
7 | "net/http"
8 | "strings"
9 | "time"
10 |
11 | "github.com/gobwas/glob"
12 | )
13 |
// Sitemap models a standard sitemap <urlset> XML document for decoding
// with encoding/xml.
type Sitemap struct {
	XMLName xml.Name `xml:"urlset"`
	// URLs collects every <url> entry; only Loc is consumed by
	// extractURLsFromSitemap, the other fields are decoded for completeness.
	URLs []struct {
		Loc        string `xml:"loc"`
		LastMod    string `xml:"lastmod,omitempty"`
		ChangeFreq string `xml:"changefreq,omitempty"`
		Priority   string `xml:"priority,omitempty"`
	} `xml:"url"`
}
24 |
// SitemapIndex models a <sitemapindex> XML document, which lists the
// locations of child sitemaps rather than page URLs directly.
type SitemapIndex struct {
	XMLName xml.Name `xml:"sitemapindex"`
	// Sitemaps holds one entry per referenced child sitemap.
	Sitemaps []struct {
		Loc     string `xml:"loc"`
		LastMod string `xml:"lastmod,omitempty"`
	} `xml:"sitemap"`
}
33 |
// parseSitemap downloads the sitemap at config.SitemapURL and returns
// the page URLs it lists. Three formats are attempted in order: a
// standard <urlset> sitemap, a <sitemapindex> (whose child sitemaps are
// then fetched), and finally a plain-text sitemap with one URL per line.
func (g *Generator) parseSitemap() ([]string, error) {
	g.logger.Printf("Parsing sitemap from %s", g.config.SitemapURL)

	// HTTP client with the configured timeout (config.Timeout is seconds).
	client := &http.Client{
		Timeout: time.Duration(g.config.Timeout) * time.Second,
	}

	// Build request
	req, err := http.NewRequest("GET", g.config.SitemapURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	// Set User-Agent
	req.Header.Set("User-Agent", g.config.UserAgent)

	// Send request
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch sitemap: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to fetch sitemap, status code: %d", resp.StatusCode)
	}

	// Read response body
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read sitemap content: %w", err)
	}

	// Try to parse as standard sitemap
	var sitemap Sitemap
	if err := xml.Unmarshal(body, &sitemap); err == nil && len(sitemap.URLs) > 0 {
		g.logger.Println("Parsed standard sitemap")
		return g.extractURLsFromSitemap(sitemap), nil
	}

	// Try to parse as sitemap index
	var sitemapIndex SitemapIndex
	if err := xml.Unmarshal(body, &sitemapIndex); err == nil && len(sitemapIndex.Sitemaps) > 0 {
		g.logger.Println("Parsed sitemap index, fetching child sitemaps")
		return g.fetchSitemapIndex(sitemapIndex, client)
	}

	// Fallback: any non-empty body that failed XML parsing is treated as a
	// text sitemap; non-URL lines are filtered out by parseTextSitemap.
	lines := string(body)
	if len(lines) > 0 {
		g.logger.Println("Parsing as text sitemap")
		return g.parseTextSitemap(lines), nil
	}

	return nil, fmt.Errorf("could not parse sitemap, unknown format")
}
92 |
93 | // Extract URLs from standard sitemap
94 | func (g *Generator) extractURLsFromSitemap(sitemap Sitemap) []string {
95 | urls := make([]string, 0, len(sitemap.URLs))
96 | for _, urlEntry := range sitemap.URLs {
97 | if urlEntry.Loc != "" {
98 | urls = append(urls, urlEntry.Loc)
99 | }
100 | }
101 | return urls
102 | }
103 |
// fetchSitemapIndex downloads every child sitemap referenced by a
// <sitemapindex> document and merges their URLs. Failures on individual
// children are logged as warnings and skipped rather than aborting the
// whole run, so the returned error is currently always nil.
func (g *Generator) fetchSitemapIndex(index SitemapIndex, client *http.Client) ([]string, error) {
	var allURLs []string

	for _, sitemapEntry := range index.Sitemaps {
		if sitemapEntry.Loc == "" {
			continue
		}

		g.logger.Printf("Fetching child sitemap: %s", sitemapEntry.Loc)

		// Build request
		req, err := http.NewRequest("GET", sitemapEntry.Loc, nil)
		if err != nil {
			g.logger.Printf("Warning: failed to create request for child sitemap %s: %v", sitemapEntry.Loc, err)
			continue
		}

		// Reuse the caller's client (and its timeout) with the same UA.
		req.Header.Set("User-Agent", g.config.UserAgent)

		// Send request
		resp, err := client.Do(req)
		if err != nil {
			g.logger.Printf("Warning: failed to fetch child sitemap %s: %v", sitemapEntry.Loc, err)
			continue
		}

		// Close explicitly (not deferred) so each iteration releases its
		// body immediately instead of holding them all until return.
		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			g.logger.Printf("Warning: failed to read child sitemap %s: %v", sitemapEntry.Loc, err)
			continue
		}

		// NOTE(review): resp.StatusCode is not checked here; a non-200
		// body is expected to fail XML parsing below — confirm intended.
		var childSitemap Sitemap
		if err := xml.Unmarshal(body, &childSitemap); err != nil {
			g.logger.Printf("Warning: failed to parse child sitemap %s: %v", sitemapEntry.Loc, err)
			continue
		}

		// Extract URLs
		childURLs := g.extractURLsFromSitemap(childSitemap)
		g.logger.Printf("Found %d URLs in child sitemap %s", len(childURLs), sitemapEntry.Loc)
		allURLs = append(allURLs, childURLs...)
	}

	return allURLs, nil
}
155 |
156 | // Parse text sitemap (one URL per line)
157 | func (g *Generator) parseTextSitemap(content string) []string {
158 | lines := splitLines(content)
159 | var urls []string
160 |
161 | for _, line := range lines {
162 | line = normalizeURL(line)
163 | if isValidURL(line) {
164 | urls = append(urls, line)
165 | }
166 | }
167 |
168 | return urls
169 | }
170 |
171 | // Filter URLs based on include/exclude mode
172 | func (g *Generator) filterURLs(urls []string) []string {
173 | if len(g.config.IncludePaths) == 0 && len(g.config.ExcludePaths) == 0 {
174 | return urls // No filtering rules, return directly
175 | }
176 |
177 | // Compile include/exclude mode
178 | var includeMatchers, excludeMatchers []glob.Glob
179 | for _, pattern := range g.config.IncludePaths {
180 | matcher, err := glob.Compile(pattern)
181 | if err != nil {
182 | g.logger.Printf("Warning: invalid include pattern '%s': %v", pattern, err)
183 | continue
184 | }
185 | includeMatchers = append(includeMatchers, matcher)
186 | }
187 |
188 | for _, pattern := range g.config.ExcludePaths {
189 | matcher, err := glob.Compile(pattern)
190 | if err != nil {
191 | g.logger.Printf("Warning: invalid exclude pattern '%s': %v", pattern, err)
192 | continue
193 | }
194 | excludeMatchers = append(excludeMatchers, matcher)
195 | }
196 |
197 | var filteredURLs []string
198 | for _, url := range urls {
199 | // If there are include rules, one of them must match
200 | if len(includeMatchers) > 0 {
201 | matched := false
202 | for _, matcher := range includeMatchers {
203 | if matcher.Match(url) {
204 | matched = true
205 | break
206 | }
207 | }
208 | if !matched {
209 | continue
210 | }
211 | }
212 |
213 | // If any exclude rules match, exclude
214 | excluded := false
215 | for _, matcher := range excludeMatchers {
216 | if matcher.Match(url) {
217 | excluded = true
218 | break
219 | }
220 | }
221 | if excluded {
222 | continue
223 | }
224 |
225 | filteredURLs = append(filteredURLs, url)
226 | }
227 |
228 | return filteredURLs
229 | }
230 |
// splitLines breaks text into individual lines on "\n". Empty entries are
// preserved (including a trailing one when text ends with a newline),
// matching strings.Split semantics.
func splitLines(text string) []string {
	return strings.Split(text, "\n")
}
235 |
// normalizeURL trims surrounding whitespace from a URL line, including the
// trailing "\r" left behind when a text sitemap uses CRLF line endings.
// (The previous implementation was a no-op despite its stated purpose.)
func normalizeURL(url string) string {
	return strings.TrimSpace(url)
}
240 |
// isValidURL reports whether a candidate sitemap line looks like a usable
// URL: it must carry an http or https scheme, as the sitemaps.org plain-text
// format requires one absolute URL per line. (The previous check accepted
// any non-empty string, letting junk lines through to the fetcher.)
func isValidURL(url string) bool {
	return strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
}
245 |
--------------------------------------------------------------------------------
/internal/linter/rules_test.go:
--------------------------------------------------------------------------------
1 | package linter
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestMD047_FileEndingCheck(t *testing.T) {
8 | tests := []struct {
9 | name string
10 | lines []string
11 | expectIssue bool
12 | description string
13 | }{
14 | {
15 | name: "file ends with single newline",
16 | lines: []string{"# Title", "Content", ""},
17 | expectIssue: false,
18 | description: "should not trigger issue when file ends with single newline",
19 | },
20 | {
21 | name: "file does not end with newline",
22 | lines: []string{"# Title", "Content"},
23 | expectIssue: true,
24 | description: "should trigger issue when file doesn't end with newline",
25 | },
26 | {
27 | name: "file ends with multiple newlines",
28 | lines: []string{"# Title", "Content", "", ""},
29 | expectIssue: true,
30 | description: "should trigger issue when file ends with multiple newlines",
31 | },
32 | {
33 | name: "empty file",
34 | lines: []string{},
35 | expectIssue: false,
36 | description: "should not trigger issue for empty file",
37 | },
38 | }
39 |
40 | rule := &MD047{BaseRule: BaseRule{id: "MD047", description: "Files should end with a single newline character", enabled: true}}
41 |
42 | for _, tt := range tests {
43 | t.Run(tt.name, func(t *testing.T) {
44 | issues := rule.Check(tt.lines)
45 | hasIssue := len(issues) > 0
46 |
47 | if hasIssue != tt.expectIssue {
48 | t.Errorf("%s: expected issue=%t, got issue=%t", tt.description, tt.expectIssue, hasIssue)
49 | if hasIssue {
50 | for _, issue := range issues {
51 | t.Logf("Issue: %s", issue.Message)
52 | }
53 | }
54 | }
55 | })
56 | }
57 | }
58 |
59 | func TestMD032_ListBlankLines(t *testing.T) {
60 | tests := []struct {
61 | name string
62 | lines []string
63 | expectCount int
64 | description string
65 | }{
66 | {
67 | name: "list with proper blank lines",
68 | lines: []string{
69 | "# Title",
70 | "",
71 | "- Item 1",
72 | "- Item 2",
73 | "",
74 | "Content after list",
75 | },
76 | expectCount: 0,
77 | description: "should not trigger issue when list has proper blank lines",
78 | },
79 | {
80 | name: "list without blank line before",
81 | lines: []string{
82 | "# Title",
83 | "Some text",
84 | "- Item 1",
85 | "",
86 | "Content after list",
87 | },
88 | expectCount: 1,
89 | description: "should trigger issue when list doesn't have blank line before",
90 | },
91 | {
92 | name: "list without blank line after",
93 | lines: []string{
94 | "# Title",
95 | "",
96 | "- Item 1",
97 | "Content after list",
98 | },
99 | expectCount: 1,
100 | description: "should trigger issue when list doesn't have blank line after",
101 | },
102 | {
103 | name: "list without blank lines before and after",
104 | lines: []string{
105 | "# Title",
106 | "Some text",
107 | "- Item 1",
108 | "Content after list",
109 | },
110 | expectCount: 2,
111 | description: "should trigger 2 issues when list doesn't have blank lines before and after",
112 | },
113 | }
114 |
115 | rule := &MD032{BaseRule: BaseRule{id: "MD032", description: "Lists should be surrounded by blank lines", enabled: true}}
116 |
117 | for _, tt := range tests {
118 | t.Run(tt.name, func(t *testing.T) {
119 | issues := rule.Check(tt.lines)
120 |
121 | if len(issues) != tt.expectCount {
122 | t.Errorf("%s: expected %d issues, got %d issues", tt.description, tt.expectCount, len(issues))
123 | for i, issue := range issues {
124 | t.Logf("Issue %d: Line %d - %s", i+1, issue.Line, issue.Message)
125 | }
126 | }
127 | })
128 | }
129 | }
130 |
131 | func TestRegexPrecompilation(t *testing.T) {
132 | tests := []struct {
133 | name string
134 | rule Rule
135 | }{
136 | {"MD018", &MD018{BaseRule: BaseRule{id: "MD018", enabled: true}}},
137 | {"MD019", &MD019{BaseRule: BaseRule{id: "MD019", enabled: true}}},
138 | {"MD023", &MD023{BaseRule: BaseRule{id: "MD023", enabled: true}}},
139 | {"MD032", &MD032{BaseRule: BaseRule{id: "MD032", enabled: true}}},
140 | }
141 |
142 | for _, tt := range tests {
143 | t.Run(tt.name, func(t *testing.T) {
144 | // Call Check method to trigger regex compilation
145 | _ = tt.rule.Check([]string{"# Test", "Content"})
146 |
147 | // Check that pattern was compiled for rules that have patterns
148 | switch rule := tt.rule.(type) {
149 | case *MD018:
150 | if rule.pattern == nil {
151 | t.Error("MD018 pattern was not compiled")
152 | }
153 | case *MD019:
154 | if rule.pattern == nil {
155 | t.Error("MD019 pattern was not compiled")
156 | }
157 | case *MD023:
158 | if rule.pattern == nil {
159 | t.Error("MD023 pattern was not compiled")
160 | }
161 | case *MD032:
162 | if rule.pattern == nil {
163 | t.Error("MD032 pattern was not compiled")
164 | }
165 | }
166 | })
167 | }
168 | }
169 |
170 | func TestMD018_NoSpaceAfterHash(t *testing.T) {
171 | rule := &MD018{BaseRule: BaseRule{id: "MD018", enabled: true}}
172 |
173 | tests := []struct {
174 | line string
175 | expectIssue bool
176 | }{
177 | {"# Proper heading", false},
178 | {"#Bad heading", true},
179 | {"## Another proper heading", false},
180 | {"##Bad heading", true},
181 | {"### Yet another proper heading", false},
182 | {"###Bad heading", true},
183 | {"Not a heading", false},
184 | {"", false},
185 | }
186 |
187 | for _, tt := range tests {
188 | t.Run(tt.line, func(t *testing.T) {
189 | issues := rule.Check([]string{tt.line})
190 | hasIssue := len(issues) > 0
191 |
192 | if hasIssue != tt.expectIssue {
193 | t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue)
194 | }
195 | })
196 | }
197 | }
198 |
// TestMD019_MultipleSpacesAfterHash verifies MD019: ATX headings must have
// exactly one space after the hash marks.
// NOTE(review): the rows marked expectIssue=true should contain MULTIPLE
// spaces after the hashes, yet they render here identically to the valid
// rows — verify the literal spacing was not collapsed by formatting.
func TestMD019_MultipleSpacesAfterHash(t *testing.T) {
	rule := &MD019{BaseRule: BaseRule{id: "MD019", enabled: true}}

	tests := []struct {
		line        string
		expectIssue bool
	}{
		{"# Proper heading", false},
		{"# Bad heading", true},
		{"## Another proper heading", false},
		{"## Bad heading", true},
		{"### Very bad heading", true},
		{"Not a heading", false},
		{"", false},
	}

	for _, tt := range tests {
		t.Run(tt.line, func(t *testing.T) {
			issues := rule.Check([]string{tt.line})
			hasIssue := len(issues) > 0

			if hasIssue != tt.expectIssue {
				t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue)
			}
		})
	}
}
226 |
227 | func TestMD023_HeadingAtStartOfLine(t *testing.T) {
228 | rule := &MD023{BaseRule: BaseRule{id: "MD023", enabled: true}}
229 |
230 | tests := []struct {
231 | line string
232 | expectIssue bool
233 | }{
234 | {"# Proper heading", false},
235 | {" # Bad heading", true},
236 | {" ## Very bad heading", true},
237 | {"Not a heading", false},
238 | {"", false},
239 | }
240 |
241 | for _, tt := range tests {
242 | t.Run(tt.line, func(t *testing.T) {
243 | issues := rule.Check([]string{tt.line})
244 | hasIssue := len(issues) > 0
245 |
246 | if hasIssue != tt.expectIssue {
247 | t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue)
248 | }
249 | })
250 | }
251 | }
252 |
--------------------------------------------------------------------------------
/internal/config/config.go:
--------------------------------------------------------------------------------
1 | package config
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | "path/filepath"
8 | )
9 |
// CloudConfig holds the settings for one cloud storage target with an
// S3-compatible API (used by the upload command).
type CloudConfig struct {
	Provider     string            `json:"provider"`       // backend name, e.g. "s3", "r2", "minio", "b2"
	Region       string            `json:"region"`         // bucket region; "auto" for providers that ignore it
	Endpoint     string            `json:"endpoint"`       // custom API endpoint for S3-compatible services
	AccessKey    string            `json:"access_key"`     // credential: access key ID
	SecretKey    string            `json:"secret_key"`     // credential: secret key
	Bucket       string            `json:"bucket"`         // target bucket name
	AccountID    string            `json:"account_id,omitempty"`    // Cloudflare R2 account ID (for r2.dev public URLs)
	CustomDomain string            `json:"custom_domain,omitempty"` // domain used when rewriting image URLs
	PathPrefix   string            `json:"path_prefix,omitempty"`   // key prefix prepended to uploaded objects
	ProviderOpts map[string]string `json:"provider_opts,omitempty"` // provider-specific extra options
	Concurrency  int               `json:"concurrency"`    // number of parallel uploads (default 5)
	SkipVerify   bool              `json:"skip_verify"`    // skip TLS certificate verification
	CACertPath   string            `json:"ca_cert_path,omitempty"`  // path to a custom CA certificate
	ConflictPolicy string          `json:"conflict_policy"`         // "rename", "version", or "overwrite"
	CacheDir     string            `json:"cache_dir,omitempty"`     // directory for the upload cache
}
27 |
// Config is the root mdctl configuration, persisted as JSON at the path
// returned by GetConfigPath (~/.config/mdctl/config.json).
type Config struct {
	TranslatePrompt   string                 `json:"translate_prompt"` // prompt template; "{TARGET_LANG}" is substituted at use
	OpenAIEndpointURL string                 `json:"endpoint"`         // OpenAI-compatible API base URL
	OpenAIAPIKey      string                 `json:"api_key"`          // API key for the endpoint above
	ModelName         string                 `json:"model"`            // model identifier sent to the API
	Temperature       float64                `json:"temperature"`      // sampling temperature
	TopP              float64                `json:"top_p"`            // nucleus sampling parameter
	CloudStorages     map[string]CloudConfig `json:"cloud_storages,omitempty"` // named upload targets
	DefaultStorage    string                 `json:"default_storage,omitempty"` // key into CloudStorages used by default
}
38 |
// DefaultCloudConfig is the empty fallback storage configuration returned by
// GetActiveCloudConfig when no storage has been configured.
var DefaultCloudConfig = CloudConfig{
	Provider:       "",
	Region:         "auto",
	Endpoint:       "",
	AccessKey:      "",
	SecretKey:      "",
	Bucket:         "",
	Concurrency:    5,
	SkipVerify:     false,
	ConflictPolicy: "rename",
}

// DefaultConfig is written to disk on first run and used as the fallback
// whenever the config file is missing or cannot be parsed (see LoadConfig).
var DefaultConfig = Config{
	TranslatePrompt:   "Translate the markdown to {TARGET_LANG} as a native speaker - preserve code/YAML/links/cli commands (e.g. `kubectl apply` or `pip install langchain`) and tech terms (CRDs, Helm charts, RAG). Output ONLY fluently localized text with natural technical phrasing that doesn't read machine-generated.",
	OpenAIEndpointURL: "https://api.openai.com/v1",
	OpenAIAPIKey:      "",
	ModelName:         "gpt-3.5-turbo",
	Temperature:       0.0,
	TopP:              1.0,
	CloudStorages:     make(map[string]CloudConfig),
}
60 |
// GetConfigPath returns the location of the mdctl configuration file,
// ~/.config/mdctl/config.json, or "" when the user's home directory cannot
// be determined.
func GetConfigPath() string {
	home, err := os.UserHomeDir()
	if err != nil {
		return ""
	}
	return filepath.Join(home, ".config", "mdctl", "config.json")
}
68 |
69 | func LoadConfig() (*Config, error) {
70 | configPath := GetConfigPath()
71 | if configPath == "" {
72 | return &DefaultConfig, nil
73 | }
74 |
75 | if _, err := os.Stat(configPath); os.IsNotExist(err) {
76 | if err := SaveConfig(&DefaultConfig); err != nil {
77 | return &DefaultConfig, fmt.Errorf("failed to create default config: %v", err)
78 | }
79 | return &DefaultConfig, nil
80 | }
81 |
82 | data, err := os.ReadFile(configPath)
83 | if err != nil {
84 | return &DefaultConfig, fmt.Errorf("failed to read config file: %v", err)
85 | }
86 |
87 | var config Config
88 | if err := json.Unmarshal(data, &config); err != nil {
89 | os.Remove(configPath)
90 | if err := SaveConfig(&DefaultConfig); err != nil {
91 | return &DefaultConfig, fmt.Errorf("failed to create new config after invalid file: %v", err)
92 | }
93 | return &DefaultConfig, fmt.Errorf("invalid config file (recreated with defaults): %v", err)
94 | }
95 |
96 | if config.TranslatePrompt == "" {
97 | config.TranslatePrompt = DefaultConfig.TranslatePrompt
98 | }
99 | if config.OpenAIEndpointURL == "" {
100 | config.OpenAIEndpointURL = DefaultConfig.OpenAIEndpointURL
101 | }
102 | if config.ModelName == "" {
103 | config.ModelName = DefaultConfig.ModelName
104 | }
105 |
106 | // Ensure CloudStorages is non-nil
107 | if config.CloudStorages == nil {
108 | config.CloudStorages = make(map[string]CloudConfig)
109 | }
110 |
111 | // Check if default storage exists
112 | if config.DefaultStorage != "" {
113 | if _, exists := config.CloudStorages[config.DefaultStorage]; !exists {
114 | // If specified default storage doesn't exist, use the first available one
115 | if len(config.CloudStorages) > 0 {
116 | for name := range config.CloudStorages {
117 | config.DefaultStorage = name
118 | break
119 | }
120 | } else {
121 | config.DefaultStorage = ""
122 | }
123 | }
124 | } else if len(config.CloudStorages) > 0 {
125 | // If no default storage is set but there are storage configurations, set the first one as default
126 | for name := range config.CloudStorages {
127 | config.DefaultStorage = name
128 | break
129 | }
130 | }
131 |
132 | return &config, nil
133 | }
134 |
135 | func SaveConfig(config *Config) error {
136 | configPath := GetConfigPath()
137 | if configPath == "" {
138 | return fmt.Errorf("failed to get config path")
139 | }
140 |
141 | configDir := filepath.Dir(configPath)
142 | if err := os.MkdirAll(configDir, 0755); err != nil {
143 | return fmt.Errorf("failed to create config directory: %v", err)
144 | }
145 |
146 | data, err := json.MarshalIndent(config, "", " ")
147 | if err != nil {
148 | return fmt.Errorf("failed to marshal config: %v", err)
149 | }
150 |
151 | if err := os.WriteFile(configPath, data, 0644); err != nil {
152 | return fmt.Errorf("failed to write config file: %v", err)
153 | }
154 |
155 | return nil
156 | }
157 |
158 | // ApplyCloudConfig applies platform-specific settings to the cloud configuration
159 | func (c *Config) ApplyCloudConfig() {
160 | // Ensure CloudStorages is non-nil
161 | if c.CloudStorages == nil {
162 | c.CloudStorages = make(map[string]CloudConfig)
163 | }
164 |
165 | // Check if default storage exists
166 | if c.DefaultStorage != "" {
167 | if _, exists := c.CloudStorages[c.DefaultStorage]; !exists {
168 | // If specified default storage doesn't exist, use the first available one
169 | for name := range c.CloudStorages {
170 | c.DefaultStorage = name
171 | break
172 | }
173 | }
174 | }
175 |
176 | // If no default storage is set but there are storage configurations, set the first one as default
177 | if c.DefaultStorage == "" && len(c.CloudStorages) > 0 {
178 | for name := range c.CloudStorages {
179 | c.DefaultStorage = name
180 | break
181 | }
182 | }
183 | }
184 |
185 | // GetActiveCloudConfig returns the current active cloud storage configuration
186 | // The storageName parameter can specify which configuration to use, if empty the default configuration is used
187 | func (c *Config) GetActiveCloudConfig(storageName string) CloudConfig {
188 | // If a storage name is specified, try to get that configuration
189 | if storageName != "" {
190 | if storage, exists := c.CloudStorages[storageName]; exists {
191 | return storage
192 | }
193 | }
194 |
195 | // If there's a default configuration, use that
196 | if c.DefaultStorage != "" {
197 | if storage, exists := c.CloudStorages[c.DefaultStorage]; exists {
198 | return storage
199 | }
200 | }
201 |
202 | // If any configuration is available, return the first one found
203 | if len(c.CloudStorages) > 0 {
204 | for _, storage := range c.CloudStorages {
205 | return storage
206 | }
207 | }
208 |
209 | // Return default empty configuration
210 | return DefaultCloudConfig
211 | }
212 |
--------------------------------------------------------------------------------
/cmd/lint.go:
--------------------------------------------------------------------------------
1 | package cmd
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | "path/filepath"
8 | "strings"
9 |
10 | "github.com/samzong/mdctl/internal/linter"
11 | "github.com/spf13/cobra"
12 | )
13 |
// Flag storage for the lint command; bound to lintCmd in init() below.
var (
	autoFix      bool     // --fix: rewrite files with automatic fixes where possible
	configRules  []string // NOTE(review): declared but not bound to any flag in this file — confirm it is used elsewhere
	outputFormat string   // --format: "default", "json", or "github"
	rulesFile    string   // --config: path to a markdownlint configuration file
	enableRules  []string // --enable: rule IDs to force-enable
	disableRules []string // --disable: rule IDs to force-disable
	initConfig   bool     // --init: create a default .markdownlint.json and exit
	configOutput string   // --init-config: output path used with --init
)
24 |
// lintCmd implements "mdctl lint": it expands file arguments, runs the
// markdownlint rules over each markdown file, optionally auto-fixes issues
// (--fix), and can scaffold a default configuration file (--init).
var lintCmd = &cobra.Command{
	Use:   "lint [files...]",
	Short: "Lint markdown files for syntax issues",
	Long: `Lint markdown files using markdownlint rules to find syntax issues.

This command will scan markdown files and report any syntax issues found.
It can also automatically fix issues when --fix flag is used.

Examples:
  # Lint a single file
  mdctl lint README.md

  # Lint multiple files
  mdctl lint docs/*.md

  # Lint with auto-fix
  mdctl lint --fix README.md

  # Lint with custom rules configuration
  mdctl lint --config .markdownlint.json README.md

  # Enable specific rules
  mdctl lint --enable MD001,MD003 README.md

  # Disable specific rules
  mdctl lint --disable MD013,MD033 README.md

  # Create a default configuration file
  mdctl lint --init

  # Create a configuration file with custom name
  mdctl lint --init --init-config my-rules.json`,
	RunE: func(cmd *cobra.Command, args []string) error {
		// Handle config initialization: --init writes a default rules file
		// and exits without linting anything.
		if initConfig {
			configFile := configOutput
			if configFile == "" {
				configFile = ".markdownlint.json"
			}

			if err := linter.CreateDefaultConfig(configFile); err != nil {
				return fmt.Errorf("failed to create config file: %v", err)
			}

			fmt.Printf("Created markdownlint configuration file: %s\n", configFile)
			return nil
		}

		if len(args) == 0 {
			return fmt.Errorf("at least one markdown file must be specified")
		}

		// Expand file patterns
		var files []string
		for _, arg := range args {
			// Basic security validation - prevent path traversal.
			// NOTE(review): this also rejects legitimate relative paths such
			// as "../docs/a.md" — confirm that trade-off is intended.
			if strings.Contains(arg, "..") {
				return fmt.Errorf("path traversal not allowed: %s", arg)
			}

			matches, err := filepath.Glob(arg)
			if err != nil {
				return fmt.Errorf("invalid file pattern %s: %v", arg, err)
			}
			if len(matches) == 0 {
				// If no glob matches, check if it's a direct file
				if _, err := os.Stat(arg); err == nil {
					files = append(files, arg)
				} else {
					fmt.Printf("Warning: No files found matching pattern: %s\n", arg)
				}
			} else {
				files = append(files, matches...)
			}
		}

		// Filter for markdown files (.md / .markdown, case-insensitive)
		var markdownFiles []string
		for _, file := range files {
			if strings.HasSuffix(strings.ToLower(file), ".md") || strings.HasSuffix(strings.ToLower(file), ".markdown") {
				markdownFiles = append(markdownFiles, file)
			}
		}

		if len(markdownFiles) == 0 {
			return fmt.Errorf("no markdown files found")
		}

		// Create linter configuration.
		// NOTE(review): `verbose` is not declared in this file — presumably a
		// persistent flag on the root command; confirm.
		config := &linter.Config{
			AutoFix:      autoFix,
			OutputFormat: outputFormat,
			RulesFile:    rulesFile,
			EnableRules:  enableRules,
			DisableRules: disableRules,
			Verbose:      verbose,
		}

		// Create linter instance
		mdLinter := linter.New(config)

		// Process files, accumulating totals for the summary below.
		var totalIssues int
		var totalFixed int

		for _, file := range markdownFiles {
			if verbose {
				fmt.Printf("Linting: %s\n", file)
			}

			result, err := mdLinter.LintFile(file)
			if err != nil {
				// A failure on one file does not abort the whole run.
				fmt.Printf("Error linting %s: %v\n", file, err)
				continue
			}

			totalIssues += len(result.Issues)
			totalFixed += result.FixedCount

			// Display results based on output format
			if err := displayResults(file, result, config); err != nil {
				return fmt.Errorf("error displaying results: %v", err)
			}
		}

		// Summary
		if verbose || len(markdownFiles) > 1 {
			fmt.Printf("\nSummary:\n")
			fmt.Printf("  Files processed: %d\n", len(markdownFiles))
			fmt.Printf("  Total issues: %d\n", totalIssues)
			if autoFix {
				fmt.Printf("  Issues fixed: %d\n", totalFixed)
			}
		}

		// Exit with error code if issues found and not in fix mode, so CI
		// pipelines fail on violations. Note: os.Exit skips deferred cleanup.
		if totalIssues > 0 && !autoFix {
			os.Exit(1)
		}

		return nil
	},
}
168 |
169 | func displayResults(filename string, result *linter.Result, config *linter.Config) error {
170 | switch config.OutputFormat {
171 | case "json":
172 | return displayJSONResults(filename, result)
173 | case "github":
174 | return displayGitHubResults(filename, result)
175 | default:
176 | return displayDefaultResults(filename, result, config)
177 | }
178 | }
179 |
180 | func displayDefaultResults(filename string, result *linter.Result, config *linter.Config) error {
181 | if len(result.Issues) == 0 {
182 | if config.Verbose {
183 | fmt.Printf("✓ %s: No issues found\n", filename)
184 | }
185 | return nil
186 | }
187 |
188 | fmt.Printf("%s:\n", filename)
189 | for _, issue := range result.Issues {
190 | status := "✗"
191 | if issue.Fixed {
192 | status = "✓"
193 | }
194 |
195 | fmt.Printf(" %s Line %d: %s (%s)\n",
196 | status, issue.Line, issue.Message, issue.Rule)
197 |
198 | if config.Verbose && issue.Context != "" {
199 | fmt.Printf(" Context: %s\n", issue.Context)
200 | }
201 | }
202 |
203 | if config.AutoFix && result.FixedCount > 0 {
204 | fmt.Printf(" Fixed %d issues\n", result.FixedCount)
205 | }
206 |
207 | return nil
208 | }
209 |
210 | func displayJSONResults(filename string, result *linter.Result) error {
211 | output := map[string]interface{}{
212 | "filename": result.Filename,
213 | "issues": result.Issues,
214 | "fixed_count": result.FixedCount,
215 | }
216 |
217 | data, err := json.MarshalIndent(output, "", " ")
218 | if err != nil {
219 | return err
220 | }
221 |
222 | fmt.Println(string(data))
223 | return nil
224 | }
225 |
226 | func displayGitHubResults(filename string, result *linter.Result) error {
227 | // GitHub Actions workflow commands format
228 | for _, issue := range result.Issues {
229 | level := "error"
230 | if issue.Fixed {
231 | level = "notice"
232 | }
233 |
234 | fmt.Printf("::%s file=%s,line=%d::%s (%s)\n",
235 | level, filename, issue.Line, issue.Message, issue.Rule)
236 | }
237 | return nil
238 | }
239 |
// init binds the lint command's flags and places it in the "core" command
// group. NOTE(review): the command is presumably attached to the root
// command elsewhere (e.g. cmd/root.go) — not visible in this file.
func init() {
	lintCmd.Flags().BoolVar(&autoFix, "fix", false, "Automatically fix issues where possible")
	lintCmd.Flags().StringVar(&outputFormat, "format", "default", "Output format: default, json, github")
	lintCmd.Flags().StringVar(&rulesFile, "config", "", "Path to markdownlint configuration file")
	lintCmd.Flags().StringSliceVar(&enableRules, "enable", []string{}, "Enable specific rules (comma-separated)")
	lintCmd.Flags().StringSliceVar(&disableRules, "disable", []string{}, "Disable specific rules (comma-separated)")
	lintCmd.Flags().BoolVar(&initConfig, "init", false, "Create a default .markdownlint.json configuration file")
	lintCmd.Flags().StringVar(&configOutput, "init-config", "", "Path for the configuration file when using --init (default: .markdownlint.json)")

	lintCmd.GroupID = "core"
}
251 |
--------------------------------------------------------------------------------
/cmd/upload.go:
--------------------------------------------------------------------------------
1 | package cmd
2 |
3 | import (
4 | "fmt"
5 | "strings"
6 |
7 | "github.com/samzong/mdctl/internal/config"
8 | "github.com/samzong/mdctl/internal/uploader"
9 | "github.com/spf13/cobra"
10 | )
11 |
12 | var (
13 | // Upload command flags
14 | uploadSourceFile string
15 | uploadSourceDir string
16 | uploadProvider string
17 | uploadBucket string
18 | uploadCustomDomain string
19 | uploadPathPrefix string
20 | uploadDryRun bool
21 | uploadConcurrency int
22 | uploadForceUpload bool
23 | uploadSkipVerify bool
24 | uploadCACertPath string
25 | uploadConflictPolicy string
26 | uploadCacheDir string
27 | uploadIncludeExts string
28 | uploadStorageName string
29 |
30 | uploadCmd = &cobra.Command{
31 | Use: "upload",
32 | Short: "Upload local images in markdown files to cloud storage",
33 | Long: `Upload local images in markdown files to cloud storage and rewrite URLs.
34 | Supports multiple cloud storage providers with S3-compatible APIs.
35 |
36 | Examples:
37 | mdctl upload -d docs/
38 | mdctl upload -f post.md
39 | mdctl upload -f post.md --storage my-s3`,
40 | RunE: func(cmd *cobra.Command, args []string) error {
41 | if uploadSourceFile == "" && uploadSourceDir == "" {
42 | return fmt.Errorf("either source file (-f) or source directory (-d) must be specified")
43 | }
44 | if uploadSourceFile != "" && uploadSourceDir != "" {
45 | return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)")
46 | }
47 |
48 | // Load configuration file first
49 | cfg, err := config.LoadConfig()
50 | if err != nil {
51 | return fmt.Errorf("failed to load config: %v", err)
52 | }
53 |
54 | // Get active cloud storage configuration
55 | cloudConfig := cfg.GetActiveCloudConfig(uploadStorageName)
56 |
57 | // Command line parameters take precedence over configuration
58 | if uploadProvider == "" {
59 | uploadProvider = cloudConfig.Provider
60 | }
61 |
62 | if uploadBucket == "" {
63 | uploadBucket = cloudConfig.Bucket
64 | }
65 |
66 | // Check for empty values after using configuration file values
67 | if uploadProvider == "" {
68 | return fmt.Errorf("provider (-p) must be specified or set in configuration file")
69 | }
70 |
71 | if uploadBucket == "" {
72 | return fmt.Errorf("bucket (-b) must be specified or set in configuration file")
73 | }
74 |
75 | // Set default region for S3-compatible services
76 | // If region is not set or empty, set default region
77 | if cloudConfig.Region == "" {
78 | switch strings.ToLower(uploadProvider) {
79 | case "s3":
80 | // For AWS S3, default to us-east-1
81 | cloudConfig.Region = "us-east-1"
82 | case "r2", "minio", "b2":
83 | // For S3-compatible services, region can be any value but must be provided
84 | cloudConfig.Region = "auto"
85 | }
86 | }
87 |
88 | // If not specified in command line, get other configuration parameters
89 | if uploadCustomDomain == "" {
90 | uploadCustomDomain = cloudConfig.CustomDomain
91 | }
92 |
93 | if uploadPathPrefix == "" {
94 | uploadPathPrefix = cloudConfig.PathPrefix
95 | }
96 |
97 | if uploadConcurrency == 5 && cloudConfig.Concurrency != 0 { // 5 is default value
98 | uploadConcurrency = cloudConfig.Concurrency
99 | }
100 |
101 | if uploadCACertPath == "" {
102 | uploadCACertPath = cloudConfig.CACertPath
103 | }
104 |
105 | if uploadSkipVerify == false && cloudConfig.SkipVerify {
106 | uploadSkipVerify = true
107 | }
108 |
109 | if uploadConflictPolicy == "rename" && cloudConfig.ConflictPolicy != "" {
110 | uploadConflictPolicy = cloudConfig.ConflictPolicy
111 | }
112 |
113 | if uploadCacheDir == "" {
114 | uploadCacheDir = cloudConfig.CacheDir
115 | }
116 |
117 | // Parse include extensions
118 | var exts []string
119 | if uploadIncludeExts != "" {
120 | exts = strings.Split(uploadIncludeExts, ",")
121 | for i, ext := range exts {
122 | exts[i] = strings.TrimSpace(ext)
123 | }
124 | }
125 |
126 | // Validate conflict policy
127 | var conflictPolicy uploader.ConflictPolicy
128 | switch strings.ToLower(uploadConflictPolicy) {
129 | case "rename":
130 | conflictPolicy = uploader.ConflictPolicyRename
131 | case "version":
132 | conflictPolicy = uploader.ConflictPolicyVersion
133 | case "overwrite":
134 | conflictPolicy = uploader.ConflictPolicyOverwrite
135 | case "":
136 | conflictPolicy = uploader.ConflictPolicyRename // Default
137 | default:
138 | return fmt.Errorf("invalid conflict policy: %s (must be rename, version, or overwrite)", uploadConflictPolicy)
139 | }
140 |
141 | // For R2, use account ID from configuration file
142 | if strings.ToLower(uploadProvider) == "r2" && cloudConfig.AccountID == "" {
143 | fmt.Printf("Note: R2 account ID not found in configuration, please set account_id in config file if you want to use r2.dev public URLs\n")
144 | }
145 |
146 | // Create uploader
147 | up, err := uploader.New(uploader.UploaderConfig{
148 | SourceFile: uploadSourceFile,
149 | SourceDir: uploadSourceDir,
150 | Provider: uploadProvider,
151 | Bucket: uploadBucket,
152 | CustomDomain: uploadCustomDomain,
153 | PathPrefix: uploadPathPrefix,
154 | DryRun: uploadDryRun,
155 | Concurrency: uploadConcurrency,
156 | ForceUpload: uploadForceUpload,
157 | SkipVerify: uploadSkipVerify,
158 | CACertPath: uploadCACertPath,
159 | ConflictPolicy: conflictPolicy,
160 | CacheDir: uploadCacheDir,
161 | FileExtensions: exts,
162 | })
163 | if err != nil {
164 | return fmt.Errorf("failed to create uploader: %v", err)
165 | }
166 |
167 | // Process files
168 | stats, err := up.Process()
169 | if err != nil {
170 | return fmt.Errorf("failed to process files: %v", err)
171 | }
172 |
173 | // Print statistics
174 | fmt.Printf("\nUpload Statistics:\n")
175 | fmt.Printf(" Total Files Processed: %d\n", stats.ProcessedFiles)
176 | fmt.Printf(" Images Uploaded: %d\n", stats.UploadedImages)
177 | fmt.Printf(" Images Skipped: %d\n", stats.SkippedImages)
178 | fmt.Printf(" Failed Uploads: %d\n", stats.FailedImages)
179 | fmt.Printf(" Files Changed: %d\n", stats.ChangedFiles)
180 |
181 | return nil
182 | },
183 | }
184 | )
185 |
// init binds the upload command's flags to the package-level variables
// above. NOTE(review): uploadCmd is presumably registered on the root
// command elsewhere (e.g. cmd/root.go) — not visible in this file.
func init() {
	// Add flags
	uploadCmd.Flags().StringVarP(&uploadSourceFile, "file", "f", "", "Source markdown file to process")
	uploadCmd.Flags().StringVarP(&uploadSourceDir, "dir", "d", "", "Source directory containing markdown files to process")
	uploadCmd.Flags().StringVarP(&uploadProvider, "provider", "p", "", "Cloud storage provider (s3, r2, minio)")
	uploadCmd.Flags().StringVarP(&uploadBucket, "bucket", "b", "", "Cloud storage bucket name")
	uploadCmd.Flags().StringVarP(&uploadCustomDomain, "custom-domain", "c", "", "Custom domain for generated URLs")
	uploadCmd.Flags().StringVar(&uploadPathPrefix, "prefix", "", "Path prefix for uploaded files")
	uploadCmd.Flags().BoolVar(&uploadDryRun, "dry-run", false, "Preview changes without uploading")
	uploadCmd.Flags().IntVar(&uploadConcurrency, "concurrency", 5, "Number of concurrent uploads")
	uploadCmd.Flags().BoolVarP(&uploadForceUpload, "force", "F", false, "Force upload even if file exists")
	uploadCmd.Flags().BoolVar(&uploadSkipVerify, "skip-verify", false, "Skip SSL verification")
	uploadCmd.Flags().StringVar(&uploadCACertPath, "ca-cert", "", "Path to CA certificate")
	uploadCmd.Flags().StringVar(&uploadConflictPolicy, "conflict", "rename", "Conflict policy (rename, version, overwrite)")
	uploadCmd.Flags().StringVar(&uploadCacheDir, "cache-dir", "", "Cache directory path")
	uploadCmd.Flags().StringVar(&uploadIncludeExts, "include", "", "Comma-separated list of file extensions to include")
	uploadCmd.Flags().StringVar(&uploadStorageName, "storage", "", "Storage name to use")
}
204 |
--------------------------------------------------------------------------------
/docs/features/upload.md:
--------------------------------------------------------------------------------
1 | ## Design Document: Image Upload Feature for mdctl
2 |
3 | ### Overview
4 |
5 | Add a new feature to mdctl that uploads local images in markdown files to cloud storage services (S3-compatible APIs like Cloudflare R2, AWS S3, etc.) and rewrites the URLs in the markdown content.
6 |
7 | ### Goals
8 |
9 | 1. Upload local images to cloud storage services
10 | 2. Support multiple storage providers with S3-compatible APIs
11 | 3. Rewrite image URLs in markdown files to point to the cloud storage
12 | 4. Maintain the existing design patterns and code structure
13 | 5. Implement idempotent operations with content verification
14 | 6. Support concurrent uploads for performance optimization
15 | 7. Handle custom SSL certificates for various cloud providers
16 |
17 | ### Architecture
18 |
19 | Following the existing architecture pattern of mdctl, the upload feature will be implemented with these components:
20 |
21 | #### 1. Command Layer (`cmd/upload.go`)
22 |
23 | - Define CLI parameters:
24 | - Source file/directory (`-f/--file` or `-d/--dir`)
25 | - Cloud provider (`-p/--provider`)
26 | - Bucket name (`-b/--bucket`)
27 | - Custom domain (optional, `-c/--custom-domain`)
28 | - Path prefix (optional, `--prefix`)
29 | - File extensions to include (optional, `--include`)
30 | - Dry run mode (optional, `--dry-run`)
31 | - Concurrency level (optional, `--concurrency`)
32 | - Force upload (optional, `-F/--force`)
33 | - Skip SSL verification (optional, `--skip-verify`)
34 | - CA certificate path (optional, `--ca-cert`)
35 | - Conflict policy (optional, `--conflict=rename|version|overwrite`)
36 | - Cache directory (optional, `--cache-dir`)
37 |
38 | - Validate input parameters
39 | - Create and configure uploader component
40 | - Add to the "core" command group alongside download and translate
41 |
42 | #### 2. Uploader Module (`internal/uploader/uploader.go`)
43 |
44 | - Core business logic for uploading files
45 | - Methods for:
46 | - Processing single files or directories recursively
47 | - Identifying local images in markdown
48 | - Uploading files to cloud storage
49 | - Rewriting URLs in markdown content
50 | - Generating appropriate cloud storage paths
51 | - Managing the worker pool for concurrent uploads
52 | - Tracking upload progress with statistics
53 | - Calculating and verifying content hashes
54 | - Handling conflict resolution
55 | - Managing the local cache of uploaded files
56 |
57 | #### 3. Storage Provider Interface (`internal/storage/provider.go`)
58 |
59 | - Define a provider interface with methods:
60 | - `Upload(localPath, remotePath string, metadata map[string]string) (url string, err error)`
61 | - `Configure(config CloudConfig) error`
62 | - `GetPublicURL(remotePath string) string`
63 | - `ObjectExists(remotePath string) (bool, error)`
64 | - `CompareHash(remotePath, localHash string) (bool, error)`
65 | - `SetObjectMetadata(remotePath string, metadata map[string]string) error`
66 | - `GetObjectMetadata(remotePath string) (map[string]string, error)`
67 |
68 | #### 4. Storage Provider Implementations
69 |
70 | - S3-compatible provider (`internal/storage/s3.go`):
71 | - Implementation for AWS S3, Cloudflare R2, Minio, etc.
72 | - Configure region, endpoint, credentials
73 | - Handle authentication and uploads
74 | - Support custom certificates and SSL verification options
75 | - Implement content verification with ETag/MD5 hash comparison
76 | - Support object tagging for metadata
77 |
78 | #### 5. Cache Management (`internal/cache/cache.go`)
79 |
80 | - Maintain record of uploaded files with their hash values
81 | - Cache structure with file path, remote URL, and hash
82 | - Support for serializing/deserializing cache to disk
83 | - Methods for lookup, update, and verification
84 |
85 | #### 6. Configuration Extensions (`internal/config/config.go`)
86 |
87 | Add new configuration fields:
88 | ```go
89 | type CloudConfig struct {
90 | Provider string `json:"provider"`
91 | Region string `json:"region"`
92 | Endpoint string `json:"endpoint"`
93 | AccessKey string `json:"access_key"`
94 | SecretKey string `json:"secret_key"`
95 | Bucket string `json:"bucket"`
96 | CustomDomain string `json:"custom_domain,omitempty"`
97 | PathPrefix string `json:"path_prefix,omitempty"`
98 | ProviderOpts map[string]string `json:"provider_opts,omitempty"`
99 | Concurrency int `json:"concurrency"`
100 | SkipVerify bool `json:"skip_verify"`
101 | CACertPath string `json:"ca_cert_path,omitempty"`
102 | ConflictPolicy string `json:"conflict_policy"`
103 | CacheDir string `json:"cache_dir,omitempty"`
104 | }
105 |
106 | // Add to Config struct
107 | type Config struct {
108 | // Existing fields...
109 | CloudStorage CloudConfig `json:"cloud_storage"`
110 | }
111 | ```
112 |
113 | ### Implementation Plan
114 |
115 | 1. Add cloud storage config section to config.go
116 | 2. Implement cache management module
117 | 3. Create storage provider interface
118 | 4. Implement S3-compatible provider with SSL handling
119 | 5. Create worker pool for concurrent uploads
120 | 6. Create uploader module implementation with verification logic
121 | 7. Implement idempotency and conflict resolution strategies
122 | 8. Add upload command to cmd package
123 | 9. Create comprehensive tests
124 | 10. Update help text and documentation
125 | 11. Add sample usage to README
126 |
127 | ### Command Usage Examples
128 |
129 | ```bash
130 | # Upload images from a single file
131 | mdctl upload -f path/to/file.md -p s3 -b my-bucket
132 |
133 | # Upload images from a directory
134 | mdctl upload -d path/to/dir -p r2 -b my-images --prefix blog/
135 |
136 | # Use with a custom domain
137 | mdctl upload -f post.md -p s3 -b media-bucket -c assets.example.com
138 |
139 | # Use custom concurrency setting
140 | mdctl upload -f blog-post.md -p s3 -b my-bucket --concurrency 10
141 |
142 | # Force upload (bypass hash verification)
143 | mdctl upload -f readme.md -p r2 -b my-images -F
144 |
145 | # Specify conflict resolution strategy
146 | mdctl upload -d docs/ -p s3 -b media --conflict=version
147 |
148 | # Use custom SSL certificate
149 | mdctl upload -f doc.md -p s3 -b media --ca-cert /path/to/cert.pem
150 |
151 | # Skip SSL verification for self-signed certificates
152 | mdctl upload -f doc.md -p minio -b local --skip-verify
153 |
154 | # Configure cloud provider
155 | mdctl config set -k cloud_storage.provider -v "r2"
156 | mdctl config set -k cloud_storage.endpoint -v "https://xxxx.r2.cloudflarestorage.com"
157 | mdctl config set -k cloud_storage.access_key -v "YOUR_ACCESS_KEY"
158 | mdctl config set -k cloud_storage.secret_key -v "YOUR_SECRET_KEY"
159 | mdctl config set -k cloud_storage.bucket -v "my-images"
160 | mdctl config set -k cloud_storage.concurrency -v 5
161 | mdctl config set -k cloud_storage.conflict_policy -v "rename"
162 | ```
163 |
164 | ### Technical Considerations
165 |
166 | 1. **S3 SDK**: Use the AWS SDK for Go to interact with S3-compatible APIs
167 | 2. **Image Processing**: Optional compression/resizing before upload
168 | 3. **Error Handling**: Provide detailed error messages for failed uploads
169 | 4. **URL Generation**:
170 |    - Support both direct S3 URLs and custom domain URLs
171 | - Handle path prefixing correctly
172 | 5. **Idempotency & Verification**:
173 | - Calculate content hashes (MD5/SHA) for each file
174 | - Store metadata in the object tags for verification
175 | - Skip uploads for identical content (check hash before upload)
176 | - Optional force upload flag to override verification
177 | - Maintain a local cache of uploaded files with their hashes
178 | 6. **Concurrency & Reliability**:
179 | - Implement worker pool for parallel uploads
180 | - Configurable concurrency level (default: 5)
181 | - Progress tracking for concurrent operations
182 | - Built-in retry mechanism for failed uploads (hardcoded 3 retry attempts)
183 | - Exponential backoff between retries (starting at 1s, doubling each retry)
184 | - Standard timeout for upload operations
185 | 7. **SSL/Certificate Handling**:
186 | - Support custom CA certificates
187 | - Option to skip verification for self-signed certificates
188 | - Configurable TLS settings per provider
189 | 8. **Conflict Resolution**:
190 | - Strategies for handling name collisions (rename, version, overwrite)
191 | - Option to preserve original filenames or use hashed names
192 | 9. **Incremental Uploads**:
193 | - Track already uploaded files to avoid redundant operations
194 | - Support for resuming interrupted batch uploads
195 |
196 | ### Testing Strategy
197 |
198 | 1. Unit tests for URL parsing and rewriting
199 | 2. Mocked storage provider for testing upload logic
200 | 3. Verification tests for hash calculation and comparison
201 | 4. Concurrency tests to ensure worker pool functions correctly
202 | 5. SSL/TLS configuration tests with mock certificates
203 | 6. Cache management tests for serialization/deserialization
204 | 7. Conflict resolution strategy tests
205 | 8. Integration tests with a local MinIO server
206 | 9. End-to-end tests with actual markdown files
207 | 10. Idempotency tests to verify repeated executions
208 | 11. Performance benchmarks for concurrent uploads
--------------------------------------------------------------------------------
/internal/exporter/sitereader/mkdocs.go:
--------------------------------------------------------------------------------
1 | package sitereader
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "os"
8 | "path/filepath"
9 | "regexp"
10 | "strings"
11 |
12 | "gopkg.in/yaml.v3"
13 | )
14 |
// MkDocsReader reads the structure of an MkDocs documentation site.
// Logger may be left nil; methods lazily install an io.Discard logger.
type MkDocsReader struct {
	Logger *log.Logger
}

// MkDocsConfig mirrors a subset of mkdocs.yml fields.
//
// NOTE(review): this struct appears unused within this file —
// ReadStructure decodes the config into a generic map instead, and a real
// `nav` key is a nested structure that []string would not capture.
// Confirm whether anything references it before extending it.
type MkDocsConfig struct {
	Docs []string `yaml:"nav"`
	DocsDir string `yaml:"docs_dir"`
	Inherit string `yaml:"INHERIT"`
}
24 |
25 | func (r *MkDocsReader) Detect(dir string) bool {
26 | // Setting up the Logger
27 | if r.Logger == nil {
28 | r.Logger = log.New(io.Discard, "", 0)
29 | }
30 |
31 | // Check if mkdocs.yml file exists
32 | mkdocsPath := filepath.Join(dir, "mkdocs.yml")
33 | if _, err := os.Stat(mkdocsPath); os.IsNotExist(err) {
34 | // Try mkdocs.yaml
35 | mkdocsPath = filepath.Join(dir, "mkdocs.yaml")
36 | if _, err := os.Stat(mkdocsPath); os.IsNotExist(err) {
37 | r.Logger.Printf("No mkdocs.yml or mkdocs.yaml found in %s", dir)
38 | return false
39 | }
40 | }
41 |
42 | r.Logger.Printf("Found MkDocs configuration file: %s", mkdocsPath)
43 | return true
44 | }
45 |
// ReadStructure resolves the ordered list of markdown files that make up
// the MkDocs site rooted at dir.
//
// configPath may be empty, in which case mkdocs.yml / mkdocs.yaml is
// located under dir via FindConfigFile. navPath optionally restricts the
// result to a single navigation subtree (e.g. "Section/Subsection"); an
// empty navPath keeps the whole nav. If the merged config has no `nav`
// key at all, every markdown file under the docs directory is returned.
func (r *MkDocsReader) ReadStructure(dir string, configPath string, navPath string) ([]string, error) {
	// Setting up the Logger
	if r.Logger == nil {
		r.Logger = log.New(io.Discard, "", 0)
	}

	r.Logger.Printf("Reading MkDocs site structure from: %s", dir)
	if navPath != "" {
		r.Logger.Printf("Filtering by navigation path: %s", navPath)
	}

	// Find config file
	if configPath == "" {
		configNames := []string{"mkdocs.yml", "mkdocs.yaml"}
		var err error
		configPath, err = FindConfigFile(dir, configNames)
		if err != nil {
			r.Logger.Printf("Failed to find MkDocs config file: %s", err)
			return nil, fmt.Errorf("failed to find MkDocs config file: %s", err)
		}
	}
	r.Logger.Printf("Using config file: %s", configPath)

	// Read and parse config file, including handling INHERIT
	config, err := r.readAndMergeConfig(configPath, dir)
	if err != nil {
		r.Logger.Printf("Failed to read config file: %s", err)
		return nil, fmt.Errorf("failed to read config file: %s", err)
	}

	// Get docs directory; MkDocs defaults to "docs" when docs_dir is
	// absent or not a string.
	docsDir := "docs"
	if docsDirValue, ok := config["docs_dir"]; ok {
		if docsDirStr, ok := docsDirValue.(string); ok {
			docsDir = docsDirStr
		}
	}
	docsDir = filepath.Join(dir, docsDir)
	r.Logger.Printf("Using docs directory: %s", docsDir)

	// Parse navigation structure
	var nav interface{}
	if navValue, ok := config["nav"]; ok {
		nav = navValue
	} else {
		// If no navigation config, try to find all Markdown files
		r.Logger.Println("No navigation configuration found, searching for all markdown files")
		return getAllMarkdownFiles(docsDir)
	}

	// Parse navigation structure, get file list
	files, err := parseNavigation(nav, docsDir, navPath)
	if err != nil {
		r.Logger.Printf("Failed to parse navigation: %s", err)
		return nil, fmt.Errorf("failed to parse navigation: %s", err)
	}

	r.Logger.Printf("Found %d files in navigation", len(files))
	return files, nil
}
106 |
107 | // readAndMergeConfig Read and merge MkDocs config file, handling INHERIT directive
108 | func (r *MkDocsReader) readAndMergeConfig(configPath string, baseDir string) (map[string]interface{}, error) {
109 | r.Logger.Printf("Reading and merging config file: %s", configPath)
110 |
111 | // Read main config file
112 | configData, err := os.ReadFile(configPath)
113 | if err != nil {
114 | r.Logger.Printf("Failed to read MkDocs config file: %s", err)
115 | return nil, fmt.Errorf("failed to read MkDocs config file: %s", err)
116 | }
117 |
118 | // Parse config file
119 | var config map[string]interface{}
120 | if err := yaml.Unmarshal(configData, &config); err != nil {
121 | r.Logger.Printf("Failed to parse MkDocs config file: %s", err)
122 | return nil, fmt.Errorf("failed to parse MkDocs config file: %s", err)
123 | }
124 |
125 | // Check if there's an INHERIT directive
126 | inheritValue, hasInherit := config["INHERIT"]
127 | if !hasInherit {
128 | // No inherit, return current config
129 | return config, nil
130 | }
131 |
132 | // Handle INHERIT directive
133 | inheritPath, ok := inheritValue.(string)
134 | if !ok {
135 | r.Logger.Printf("Invalid INHERIT value, expected string but got: %T", inheritValue)
136 | return nil, fmt.Errorf("invalid INHERIT value, expected string")
137 | }
138 |
139 | r.Logger.Printf("Found INHERIT directive pointing to: %s", inheritPath)
140 |
141 | // Parse inherit path, may be relative to current config file
142 | configDir := filepath.Dir(configPath)
143 | inheritFullPath := filepath.Join(configDir, inheritPath)
144 |
145 | // Read inherited config file
146 | inheritConfig, err := r.readAndMergeConfig(inheritFullPath, baseDir)
147 | if err != nil {
148 | return nil, fmt.Errorf("failed to read inherited config file %s: %s", inheritFullPath, err)
149 | }
150 |
151 | // Merge config, current config takes precedence
152 | mergedConfig := make(map[string]interface{})
153 |
154 | // Copy inherit config first
155 | for k, v := range inheritConfig {
156 | mergedConfig[k] = v
157 | }
158 |
159 | // Override current config
160 | for k, v := range config {
161 | if k != "INHERIT" { // Don't copy INHERIT directive
162 | mergedConfig[k] = v
163 | }
164 | }
165 |
166 | r.Logger.Printf("Successfully merged config with inherited file")
167 | return mergedConfig, nil
168 | }
169 |
170 | // preprocessMarkdownFile Preprocess Markdown file, remove YAML front matter that may cause problems
171 | func preprocessMarkdownFile(filePath string) error {
172 | // Read file content
173 | content, err := os.ReadFile(filePath)
174 | if err != nil {
175 | return err
176 | }
177 |
178 | // Check if there's YAML front matter
179 | contentStr := string(content)
180 | yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)
181 |
182 | // If there's YAML front matter, remove it
183 | if yamlFrontMatterRegex.MatchString(contentStr) {
184 | // Create temp file
185 | tempFile, err := os.CreateTemp("", "mdctl-*.md")
186 | if err != nil {
187 | return err
188 | }
189 | tempFilePath := tempFile.Name()
190 | tempFile.Close()
191 |
192 | // Remove YAML front matter
193 | processedContent := yamlFrontMatterRegex.ReplaceAllString(contentStr, "")
194 |
195 | // Write processed content to temp file
196 | if err := os.WriteFile(tempFilePath, []byte(processedContent), 0644); err != nil {
197 | os.Remove(tempFilePath)
198 | return err
199 | }
200 |
201 | // Replace original file
202 | if err := os.Rename(tempFilePath, filePath); err != nil {
203 | os.Remove(tempFilePath)
204 | return err
205 | }
206 | }
207 |
208 | return nil
209 | }
210 |
// parseNavigation recursively flattens an MkDocs `nav` value into a list
// of absolute markdown file paths.
//
// nav may be (as produced by decoding YAML into interface{}):
//   - []interface{}: an ordered list of nav entries, recursed in order;
//   - map[string]interface{}: title -> value entries. When navPath is
//     non-empty, only entries whose title matches the first "/"-separated
//     segment are followed, with the remaining segments passed down;
//   - string: a markdown file path relative to docsDir. Paths are only
//     appended when the file exists on disk.
//
// NOTE(review): if a nav map ever has more than one key, Go's random map
// iteration order makes the relative order of those siblings
// nondeterministic — confirm whether the YAML decoder ever produces
// multi-key maps here (single-key maps per nav entry are typical).
//
// NOTE(review): in the string case, files are appended only when navPath
// is empty — a bare string entry is silently dropped while a filter is
// active. This looks intentional (matching happens at map level), but verify.
func parseNavigation(nav interface{}, docsDir string, navPath string) ([]string, error) {
	var files []string

	switch v := nav.(type) {
	case []interface{}:
		// Navigation is a list
		for _, item := range v {
			itemFiles, err := parseNavigation(item, docsDir, navPath)
			if err != nil {
				return nil, err
			}
			files = append(files, itemFiles...)
		}
	case map[string]interface{}:
		// Navigation is a map
		for title, value := range v {
			// If nav path is specified, check if current node title matches
			if navPath != "" {
				// Support simple path matching, e.g. "Section1/Subsection2"
				navParts := strings.Split(navPath, "/")
				if strings.TrimSpace(title) == strings.TrimSpace(navParts[0]) {
					// If it's a multi-level path, continue matching the next level
					if len(navParts) > 1 {
						subNavPath := strings.Join(navParts[1:], "/")
						itemFiles, err := parseNavigation(value, docsDir, subNavPath)
						if err != nil {
							return nil, err
						}
						files = append(files, itemFiles...)
						continue
					} else {
						// If it's a single-level path and matches, only handle this node
						itemFiles, err := parseNavigation(value, docsDir, "")
						if err != nil {
							return nil, err
						}
						files = append(files, itemFiles...)
						continue
					}
				} else {
					// Title doesn't match, skip this node
					continue
				}
			}

			// If no nav path is specified or already matched the path, handle normally
			itemFiles, err := parseNavigation(value, docsDir, "")
			if err != nil {
				return nil, err
			}
			files = append(files, itemFiles...)
		}
	case string:
		// Navigation item is a file path; silently skipped if missing on disk.
		if strings.HasSuffix(v, ".md") {
			filePath := filepath.Join(docsDir, v)
			if _, err := os.Stat(filePath); err == nil {
				// If no nav path is specified or already handled in nav path filtering, add file
				if navPath == "" {
					files = append(files, filePath)
				}
			}
		}
	}

	return files, nil
}
279 |
280 | // getAllMarkdownFiles Get all Markdown files in a directory
281 | func getAllMarkdownFiles(dir string) ([]string, error) {
282 | var files []string
283 |
284 | err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
285 | if err != nil {
286 | return err
287 | }
288 | if !info.IsDir() {
289 | ext := strings.ToLower(filepath.Ext(path))
290 | if ext == ".md" || ext == ".markdown" {
291 | files = append(files, path)
292 | }
293 | }
294 | return nil
295 | })
296 |
297 | if err != nil {
298 | return nil, fmt.Errorf("failed to walk directory %s: %s", dir, err)
299 | }
300 |
301 | return files, nil
302 | }
303 |
--------------------------------------------------------------------------------
/internal/exporter/exporter.go:
--------------------------------------------------------------------------------
1 | package exporter
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "os"
8 | "path/filepath"
9 | "sort"
10 | "strings"
11 |
12 | "github.com/samzong/mdctl/internal/exporter/sitereader"
13 | )
14 |
// ExportOptions defines export options shared by single-file and
// directory export. The zero value is usable: no template, no TOC, no
// heading shift, basic (non-site) directory handling.
type ExportOptions struct {
	Template string // Word template file path
	GenerateToc bool // Whether to generate table of contents
	ShiftHeadingLevelBy int // Heading level offset
	FileAsTitle bool // Whether to use filename as section title
	Format string // Output format (docx, pdf, epub)
	SiteType string // Site type (mkdocs, hugo, docusaurus); "" or "basic" means plain directory
	Verbose bool // Whether to enable verbose logging
	Logger *log.Logger // Logger; when nil and Verbose is false, output is discarded
	SourceDirs []string // List of source directories for processing image paths
	TocDepth int // Table of contents depth, default is 3
	NavPath string // Specified navigation path to export ("" = whole site)
}
29 |
// Exporter defines exporter interface
//
// NOTE(review): DefaultExporter does not implement this interface — it
// exposes ExportFile/ExportDirectory rather than a single Export method.
// Confirm whether any type implements Exporter before relying on it.
type Exporter interface {
	Export(input string, output string, options ExportOptions) error
}
34 |
// DefaultExporter is the default exporter implementation. It shells out
// to a pandoc binary for the actual conversion work.
type DefaultExporter struct {
	pandocPath string
	logger *log.Logger
}

// NewExporter builds a DefaultExporter wired to the "pandoc" executable
// found on the system PATH, logging to stdout until a caller-supplied
// logger replaces it.
func NewExporter() *DefaultExporter {
	exporter := &DefaultExporter{}
	exporter.pandocPath = "pandoc" // Default to pandoc in system PATH
	exporter.logger = log.New(os.Stdout, "[EXPORTER] ", log.LstdFlags)
	return exporter
}
48 |
49 | // ExportFile exports a single Markdown file
50 | func (e *DefaultExporter) ExportFile(input, output string, options ExportOptions) error {
51 | // Set logger
52 | if options.Logger != nil {
53 | e.logger = options.Logger
54 | } else if !options.Verbose {
55 | e.logger = log.New(io.Discard, "", 0)
56 | }
57 |
58 | e.logger.Printf("Exporting file: %s -> %s", input, output)
59 |
60 | // Check if file exists
61 | if _, err := os.Stat(input); os.IsNotExist(err) {
62 | e.logger.Printf("Error: input file does not exist: %s", input)
63 | return fmt.Errorf("input file does not exist: %s", input)
64 | }
65 | e.logger.Printf("Input file exists: %s", input)
66 |
67 | // Create output directory (if it doesn't exist)
68 | outputDir := filepath.Dir(output)
69 | if err := os.MkdirAll(outputDir, 0755); err != nil {
70 | e.logger.Printf("Error: failed to create output directory: %s", err)
71 | return fmt.Errorf("failed to create output directory: %s", err)
72 | }
73 | e.logger.Printf("Output directory created/verified: %s", outputDir)
74 |
75 | // Add source directory to SourceDirs
76 | sourceDir := filepath.Dir(input)
77 | if options.SourceDirs == nil {
78 | options.SourceDirs = []string{sourceDir}
79 | } else {
80 | // Check if already exists
81 | found := false
82 | for _, dir := range options.SourceDirs {
83 | if dir == sourceDir {
84 | found = true
85 | break
86 | }
87 | }
88 | if !found {
89 | options.SourceDirs = append(options.SourceDirs, sourceDir)
90 | }
91 | }
92 | e.logger.Printf("Added source directory to resource paths: %s", sourceDir)
93 |
94 | // Use Pandoc to export
95 | e.logger.Println("Starting Pandoc export process...")
96 | pandocExporter := &PandocExporter{
97 | PandocPath: e.pandocPath,
98 | Logger: e.logger,
99 | }
100 | err := pandocExporter.Export(input, output, options)
101 | if err != nil {
102 | e.logger.Printf("Pandoc export failed: %s", err)
103 | return err
104 | }
105 |
106 | e.logger.Printf("File export completed successfully: %s", output)
107 | return nil
108 | }
109 |
110 | // ExportDirectory exports Markdown files in a directory
111 | func (e *DefaultExporter) ExportDirectory(inputDir, output string, options ExportOptions) error {
112 | // Set logger
113 | if options.Logger != nil {
114 | e.logger = options.Logger
115 | } else if !options.Verbose {
116 | e.logger = log.New(io.Discard, "", 0)
117 | }
118 |
119 | e.logger.Printf("Exporting directory: %s -> %s", inputDir, output)
120 |
121 | // Check if directory exists
122 | if _, err := os.Stat(inputDir); os.IsNotExist(err) {
123 | e.logger.Printf("Error: input directory does not exist: %s", inputDir)
124 | return fmt.Errorf("input directory does not exist: %s", inputDir)
125 | }
126 | e.logger.Printf("Input directory exists: %s", inputDir)
127 |
128 | // Create output directory (if it doesn't exist)
129 | outputDir := filepath.Dir(output)
130 | if err := os.MkdirAll(outputDir, 0755); err != nil {
131 | e.logger.Printf("Error: failed to create output directory: %s", err)
132 | return fmt.Errorf("failed to create output directory: %s", err)
133 | }
134 | e.logger.Printf("Output directory created/verified: %s", outputDir)
135 |
136 | // Initialize SourceDirs (if nil)
137 | if options.SourceDirs == nil {
138 | options.SourceDirs = []string{inputDir}
139 | } else {
140 | // Check if already exists
141 | found := false
142 | for _, dir := range options.SourceDirs {
143 | if dir == inputDir {
144 | found = true
145 | break
146 | }
147 | }
148 | if !found {
149 | options.SourceDirs = append(options.SourceDirs, inputDir)
150 | }
151 | }
152 | e.logger.Printf("Added input directory to resource paths: %s", inputDir)
153 |
154 | // Depending on site type, choose different processing
155 | var files []string
156 | var err error
157 |
158 | if options.SiteType != "" && options.SiteType != "basic" {
159 | // Use site reader to get file list
160 | e.logger.Printf("Using site reader for site type: %s", options.SiteType)
161 | reader, err := sitereader.GetSiteReader(options.SiteType, options.Verbose, e.logger)
162 | if err != nil {
163 | e.logger.Printf("Error getting site reader: %s", err)
164 | return err
165 | }
166 |
167 | // Detect if it's the specified type of site
168 | e.logger.Printf("Detecting if directory is a %s site...", options.SiteType)
169 | if !reader.Detect(inputDir) {
170 | e.logger.Printf("Error: directory %s does not appear to be a %s site", inputDir, options.SiteType)
171 | return fmt.Errorf("directory %s does not appear to be a %s site", inputDir, options.SiteType)
172 | }
173 | e.logger.Printf("Directory confirmed as %s site", options.SiteType)
174 |
175 | e.logger.Println("Reading site structure...")
176 | files, err = reader.ReadStructure(inputDir, "", options.NavPath)
177 | if err != nil {
178 | e.logger.Printf("Error reading site structure: %s", err)
179 | return err
180 | }
181 | e.logger.Printf("Found %d files in site structure", len(files))
182 | } else {
183 | // Basic directory mode: sort files by name
184 | e.logger.Println("Using basic directory mode, sorting files by name")
185 | files, err = GetMarkdownFilesInDir(inputDir)
186 | if err != nil {
187 | e.logger.Printf("Error getting markdown files: %s", err)
188 | return err
189 | }
190 | e.logger.Printf("Found %d markdown files in directory", len(files))
191 | }
192 |
193 | if len(files) == 0 {
194 | e.logger.Printf("Error: no markdown files found in directory: %s", inputDir)
195 | return fmt.Errorf("no markdown files found in directory: %s", inputDir)
196 | }
197 |
198 | // If there's only one file, export directly
199 | if len(files) == 1 {
200 | e.logger.Printf("Only one file found, exporting directly: %s", files[0])
201 | return e.ExportFile(files[0], output, options)
202 | }
203 |
204 | // Merge multiple files
205 | e.logger.Printf("Merging %d files...", len(files))
206 | merger := &Merger{
207 | ShiftHeadingLevelBy: options.ShiftHeadingLevelBy,
208 | FileAsTitle: options.FileAsTitle,
209 | Logger: e.logger,
210 | SourceDirs: make([]string, 0),
211 | Verbose: options.Verbose,
212 | }
213 |
214 | // Create temporary file
215 | e.logger.Println("Creating temporary file for merged content...")
216 | tempFile, err := os.CreateTemp("", "mdctl-merged-*.md")
217 | if err != nil {
218 | e.logger.Printf("Error creating temporary file: %s", err)
219 | return fmt.Errorf("failed to create temporary file: %s", err)
220 | }
221 | tempFilePath := tempFile.Name()
222 | tempFile.Close()
223 | defer os.Remove(tempFilePath)
224 | e.logger.Printf("Temporary file created: %s", tempFilePath)
225 |
226 | // Merge files
227 | e.logger.Println("Merging files...")
228 | if err := merger.Merge(files, tempFilePath); err != nil {
229 | e.logger.Printf("Error merging files: %s", err)
230 | return fmt.Errorf("failed to merge files: %s", err)
231 | }
232 | e.logger.Println("Files merged successfully")
233 |
234 | // Add merger collected source directories to options
235 | if merger.SourceDirs != nil && len(merger.SourceDirs) > 0 {
236 | e.logger.Printf("Adding %d source directories from merger", len(merger.SourceDirs))
237 | for _, dir := range merger.SourceDirs {
238 | // Check if already exists
239 | found := false
240 | for _, existingDir := range options.SourceDirs {
241 | if existingDir == dir {
242 | found = true
243 | break
244 | }
245 | }
246 | if !found {
247 | options.SourceDirs = append(options.SourceDirs, dir)
248 | e.logger.Printf("Added source directory: %s", dir)
249 | }
250 | }
251 | }
252 |
253 | // Export merged file
254 | e.logger.Println("Starting Pandoc export process...")
255 | pandocExporter := &PandocExporter{
256 | PandocPath: e.pandocPath,
257 | Logger: e.logger,
258 | }
259 | err = pandocExporter.Export(tempFilePath, output, options)
260 | if err != nil {
261 | e.logger.Printf("Pandoc export failed: %s", err)
262 | return err
263 | }
264 |
265 | e.logger.Printf("Directory export completed successfully: %s", output)
266 | return nil
267 | }
268 |
// SiteReader defines site reader interface
//
// NOTE(review): this local interface appears stale — the readers actually
// used by ExportDirectory come from the sitereader package, and the call
// there passes THREE arguments (dir, configPath, navPath) to
// ReadStructure, which this two-argument signature does not match.
// Confirm whether anything implements or consumes this interface before
// relying on it; if not, align it with the sitereader signature or
// remove it.
type SiteReader interface {
	// Detect if given directory is this type of site
	Detect(dir string) bool
	// Read site structure, return sorted list of files
	ReadStructure(dir string, configPath string) ([]string, error)
}
276 |
277 | // GetMarkdownFilesInDir gets all Markdown files in a directory and sorts them by filename
278 | func GetMarkdownFilesInDir(dir string) ([]string, error) {
279 | // Check if directory exists
280 | info, err := os.Stat(dir)
281 | if err != nil {
282 | return nil, err
283 | }
284 | if !info.IsDir() {
285 | return nil, fmt.Errorf("%s is not a directory", dir)
286 | }
287 |
288 | // Recursively find all Markdown files
289 | var files []string
290 | err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
291 | if err != nil {
292 | return err
293 | }
294 | if !info.IsDir() {
295 | ext := strings.ToLower(filepath.Ext(path))
296 | if ext == ".md" || ext == ".markdown" {
297 | files = append(files, path)
298 | }
299 | }
300 | return nil
301 | })
302 |
303 | if err != nil {
304 | return nil, fmt.Errorf("failed to walk directory %s: %s", dir, err)
305 | }
306 |
307 | // Sort by filename
308 | sort.Strings(files)
309 |
310 | return files, nil
311 | }
312 |
--------------------------------------------------------------------------------
/internal/exporter/merger.go:
--------------------------------------------------------------------------------
1 | package exporter
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "log"
8 | "os"
9 | "path/filepath"
10 | "regexp"
11 | "strings"
12 | "unicode/utf8"
13 |
14 | "golang.org/x/text/encoding/simplifiedchinese"
15 | "golang.org/x/text/transform"
16 | )
17 |
// Merger merges multiple Markdown files into a single target document,
// adjusting heading levels and image paths along the way.
type Merger struct {
	// ShiftHeadingLevelBy is added to every heading level in the merged
	// output (0 leaves headings unchanged).
	ShiftHeadingLevelBy int
	// FileAsTitle inserts each source file's name as a section title.
	FileAsTitle bool
	// Logger may be nil; Merge lazily installs a default based on Verbose.
	Logger *log.Logger
	// Store all source directories, used to set Pandoc's resource paths
	SourceDirs []string
	// Whether to enable verbose logging
	Verbose bool
}
28 |
29 | // Merge Merge multiple Markdown files into a single target file
30 | func (m *Merger) Merge(sources []string, target string) error {
31 | // If no logger is provided, create a default one
32 | if m.Logger == nil {
33 | if m.Verbose {
34 | m.Logger = log.New(os.Stdout, "[MERGER] ", log.LstdFlags)
35 | } else {
36 | m.Logger = log.New(io.Discard, "", 0)
37 | }
38 | }
39 |
40 | if len(sources) == 0 {
41 | m.Logger.Println("Error: no source files provided")
42 | return fmt.Errorf("no source files provided")
43 | }
44 |
45 | m.Logger.Printf("Merging %d files into: %s", len(sources), target)
46 | var mergedContent strings.Builder
47 |
48 | // Initialize source directory list
49 | m.SourceDirs = make([]string, 0, len(sources))
50 | sourceDirsMap := make(map[string]bool) // Used for deduplication
51 |
52 | // Process each source file
53 | for i, source := range sources {
54 | m.Logger.Printf("Processing file %d/%d: %s", i+1, len(sources), source)
55 |
56 | // Get source file's directory and add to list (deduplication)
57 | sourceDir := filepath.Dir(source)
58 | if !sourceDirsMap[sourceDir] {
59 | sourceDirsMap[sourceDir] = true
60 | m.SourceDirs = append(m.SourceDirs, sourceDir)
61 | }
62 |
63 | // Read file content
64 | content, err := os.ReadFile(source)
65 | if err != nil {
66 | m.Logger.Printf("Error reading file %s: %s", source, err)
67 | return fmt.Errorf("failed to read file %s: %s", source, err)
68 | }
69 |
70 | // Process content
71 | processedContent := string(content)
72 |
73 | // Ensure content is valid UTF-8
74 | if !utf8.ValidString(processedContent) {
75 | m.Logger.Printf("File %s contains invalid UTF-8, attempting to convert from GBK", source)
76 | // Attempt to convert content from GBK to UTF-8
77 | reader := transform.NewReader(bytes.NewReader(content), simplifiedchinese.GBK.NewDecoder())
78 | decodedContent, err := io.ReadAll(reader)
79 | if err != nil {
80 | m.Logger.Printf("Failed to decode content from file %s: %s", source, err)
81 | return fmt.Errorf("failed to decode content from file %s: %s", source, err)
82 | }
83 | processedContent = string(decodedContent)
84 | m.Logger.Printf("Successfully converted content from GBK to UTF-8")
85 | }
86 |
87 | // Remove YAML front matter
88 | m.Logger.Println("Removing YAML front matter...")
89 | processedContent = removeYAMLFrontMatter(processedContent)
90 |
91 | // Process image paths
92 | m.Logger.Println("Processing image paths...")
93 | processedContent, err = processImagePaths(processedContent, source, m.Logger, m.Verbose)
94 | if err != nil {
95 | m.Logger.Printf("Error processing image paths: %s", err)
96 | return fmt.Errorf("failed to process image paths: %s", err)
97 | }
98 |
99 | // Adjust heading levels
100 | if m.ShiftHeadingLevelBy != 0 {
101 | m.Logger.Printf("Shifting heading levels by %d", m.ShiftHeadingLevelBy)
102 | processedContent = ShiftHeadings(processedContent, m.ShiftHeadingLevelBy)
103 | }
104 |
105 | // Add filename as title
106 | if m.FileAsTitle {
107 | filename := filepath.Base(source)
108 | m.Logger.Printf("Adding filename as title: %s", filename)
109 | processedContent = AddTitleFromFilename(processedContent, filename, 1+m.ShiftHeadingLevelBy)
110 | }
111 |
112 | // Add to merged content
113 | m.Logger.Printf("Adding processed content to merged result (length: %d bytes)", len(processedContent))
114 | mergedContent.WriteString(processedContent)
115 |
116 | // If not the last file, add separator
117 | if i < len(sources)-1 {
118 | mergedContent.WriteString("\n\n")
119 | }
120 | }
121 |
122 | // Final content
123 | finalContent := mergedContent.String()
124 |
125 | // Check again for any YAML-related issues
126 | m.Logger.Println("Sanitizing final content...")
127 | finalContent = sanitizeContent(finalContent)
128 |
129 | // Write target file, ensuring UTF-8 encoding
130 | m.Logger.Printf("Writing merged content to target file: %s (size: %d bytes)", target, len(finalContent))
131 | err := os.WriteFile(target, []byte(finalContent), 0644)
132 | if err != nil {
133 | m.Logger.Printf("Error writing merged content: %s", err)
134 | return fmt.Errorf("failed to write merged content to %s: %s", target, err)
135 | }
136 |
137 | m.Logger.Printf("Successfully merged %d files into: %s", len(sources), target)
138 | return nil
139 | }
140 |
141 | // processImagePaths Process image paths in Markdown, converting relative paths to paths relative to the command execution location
142 | func processImagePaths(content, sourcePath string, logger *log.Logger, verbose bool) (string, error) {
143 | // If no logger is provided, create a default one
144 | if logger == nil {
145 | if verbose {
146 | logger = log.New(os.Stdout, "[IMAGE] ", log.LstdFlags)
147 | } else {
148 | logger = log.New(io.Discard, "", 0)
149 | }
150 | }
151 |
152 | // Get source file's directory
153 | sourceDir := filepath.Dir(sourcePath)
154 | if verbose {
155 | logger.Printf("Processing image paths: source file directory = %s", sourceDir)
156 | }
157 |
158 | // Get current working directory (location of command execution)
159 | workingDir, err := os.Getwd()
160 | if err != nil {
161 | return "", fmt.Errorf("unable to get current working directory: %v", err)
162 | }
163 | if verbose {
164 | logger.Printf("Current working directory = %s", workingDir)
165 | }
166 |
167 | // Get absolute path of source file's directory
168 | absSourceDir, err := filepath.Abs(sourceDir)
169 | if err != nil {
170 | return "", fmt.Errorf("unable to get absolute path of source file's directory: %v", err)
171 | }
172 | if verbose {
173 | logger.Printf("Source file's directory absolute path = %s", absSourceDir)
174 | }
175 |
176 | // Match Markdown image syntax: 
177 | imageRegex := regexp.MustCompile(`!\[(.*?)\]\((.*?)\)`)
178 |
179 | // Replace all image paths
180 | processedContent := imageRegex.ReplaceAllStringFunc(content, func(match string) string {
181 | // Extract image path
182 | submatches := imageRegex.FindStringSubmatch(match)
183 | if len(submatches) < 3 {
184 | return match // If match is incorrect, keep as-is
185 | }
186 |
187 | altText := submatches[1]
188 | imagePath := submatches[2]
189 | if verbose {
190 | logger.Printf("Found image: alt = %s, path = %s", altText, imagePath)
191 | }
192 |
193 | // If image is a web image (starts with http:// or https://), keep as-is
194 | if strings.HasPrefix(imagePath, "http://") || strings.HasPrefix(imagePath, "https://") {
195 | if verbose {
196 | logger.Printf("Keeping web image path: %s", imagePath)
197 | }
198 | return match
199 | }
200 |
201 | // Parse image's absolute path
202 | var absoluteImagePath string
203 | if filepath.IsAbs(imagePath) {
204 | absoluteImagePath = imagePath
205 | } else {
206 | // For relative paths, convert to absolute path first
207 | absoluteImagePath = filepath.Join(absSourceDir, imagePath)
208 | }
209 | if verbose {
210 | logger.Printf("Image path: relative path = %s, absolute path = %s", imagePath, absoluteImagePath)
211 | }
212 |
213 | // Check if image file exists
214 | if _, err := os.Stat(absoluteImagePath); os.IsNotExist(err) {
215 | if verbose {
216 | logger.Printf("Image does not exist: %s", absoluteImagePath)
217 | }
218 | // Image does not exist, try to find it in adjacent directories
219 | // For example, if path is ../images/image.png, try to find it in the images subdirectory of the parent directory of the source file's directory
220 | if strings.HasPrefix(imagePath, "../") {
221 | parentDir := filepath.Dir(absSourceDir)
222 | relPath := strings.TrimPrefix(imagePath, "../")
223 | alternativePath := filepath.Join(parentDir, relPath)
224 | if verbose {
225 | logger.Printf("Trying alternative path: %s", alternativePath)
226 | }
227 | if _, err := os.Stat(alternativePath); err == nil {
228 | absoluteImagePath = alternativePath
229 | if verbose {
230 | logger.Printf("Found image in alternative path: %s", absoluteImagePath)
231 | }
232 | } else {
233 | // Still not found, keep as-is
234 | if verbose {
235 | logger.Printf("Image does not exist in alternative path: %s", alternativePath)
236 | }
237 | return match
238 | }
239 | } else {
240 | // Image not found, keep as-is
241 | return match
242 | }
243 | }
244 |
245 | // Calculate image's path relative to current working directory
246 | relPath, err := filepath.Rel(workingDir, absoluteImagePath)
247 | if err != nil {
248 | if verbose {
249 | logger.Printf("Unable to calculate relative path, keeping original path: %s, error: %v", imagePath, err)
250 | }
251 | return match
252 | }
253 |
254 | // Update image reference with path relative to current working directory
255 | newRef := fmt.Sprintf("", altText, relPath)
256 | if verbose {
257 | logger.Printf("Updating image reference: %s -> %s", match, newRef)
258 | }
259 | return newRef
260 | })
261 |
262 | return processedContent, nil
263 | }
264 |
// yamlFrontMatterRE matches a leading YAML front matter block delimited by
// "---" lines. Compiled once at package scope rather than on every call.
var yamlFrontMatterRE = regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)

// removeYAMLFrontMatter strips a leading YAML front matter block from
// content; content without front matter is returned unchanged.
func removeYAMLFrontMatter(content string) string {
	return yamlFrontMatterRE.ReplaceAllString(content, "")
}
271 |
// sanitizeContent patches lines that commonly break Pandoc's YAML metadata
// parsing: a top-level "key:value" (no space after the colon) and list items
// written as "-item" (no space after the dash).
//
// URL-bearing lines ("://") and dash runs such as "---" horizontal rules are
// left untouched — the previous version corrupted "https://x" into
// "https: //x" and "---" into "- --".
func sanitizeContent(content string) string {
	lines := strings.Split(content, "\n")
	cleanedLines := make([]string, 0, len(lines))

	for _, line := range lines {
		switch {
		case strings.Contains(line, ":") && !strings.Contains(line, ": ") && !strings.Contains(line, "://") &&
			!strings.HasPrefix(line, " ") && !strings.HasPrefix(line, "\t"):
			// Insert the missing space after the first colon.
			cleanedLines = append(cleanedLines, strings.Replace(line, ":", ": ", 1))
		case strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "- ") && !strings.HasPrefix(line, "--") && len(line) > 1:
			// Insert the missing space after the leading dash.
			cleanedLines = append(cleanedLines, strings.Replace(line, "-", "- ", 1))
		default:
			cleanedLines = append(cleanedLines, line)
		}
	}

	return strings.Join(cleanedLines, "\n")
}
297 |
--------------------------------------------------------------------------------
/internal/exporter/pandoc.go:
--------------------------------------------------------------------------------
1 | package exporter
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "os"
8 | "os/exec"
9 | "path/filepath"
10 | "regexp"
11 | "strings"
12 | )
13 |
// PandocExporter runs the external `pandoc` binary to convert a (merged)
// Markdown file into the requested output format.
type PandocExporter struct {
	PandocPath string      // path to the pandoc executable
	Logger     *log.Logger // lazily initialized by Export when nil
}
19 |
// Export converts the Markdown file at input into output using Pandoc.
//
// The input is first copied to a sanitized temporary file (YAML quirks that
// break Pandoc's metadata parser are patched there), then Pandoc is invoked
// with --resource-path entries covering the input, output, working and
// options.SourceDirs directories so relative image references resolve.
// Format-specific flags are appended for "pdf" (CJK font settings) and
// "epub" (chapter level). On failure the returned error embeds Pandoc's
// combined output and a preview of the sanitized input for debugging.
func (e *PandocExporter) Export(input, output string, options ExportOptions) error {
	// If no logger is provided, create a default one
	if e.Logger == nil {
		if options.Verbose {
			e.Logger = log.New(os.Stdout, "[PANDOC] ", log.LstdFlags)
		} else {
			e.Logger = log.New(io.Discard, "", 0)
		}
	}

	e.Logger.Printf("Starting Pandoc export: %s -> %s", input, output)

	// Ensure output path is absolute (the command later runs with cmd.Dir set
	// to the input directory, so a relative output would land there instead).
	absOutput, err := filepath.Abs(output)
	if err != nil {
		e.Logger.Printf("Failed to get absolute path for output: %s", err)
		return fmt.Errorf("failed to get absolute path for output: %s", err)
	}
	e.Logger.Printf("Using absolute output path: %s", absOutput)

	// Create a temporary file for sanitized content
	e.Logger.Println("Creating sanitized copy of input file...")
	tempFile, err := createSanitizedCopy(input, e.Logger)
	if err != nil {
		e.Logger.Printf("Failed to create sanitized copy: %s", err)
		return fmt.Errorf("failed to create sanitized copy: %s", err)
	}
	defer os.Remove(tempFile)
	e.Logger.Printf("Sanitized copy created: %s", tempFile)

	// Build Pandoc command arguments
	e.Logger.Println("Building Pandoc command arguments...")
	args := []string{
		tempFile,
		"-o", absOutput,
		"--standalone",
		"--pdf-engine=xelatex",
		"-V", "mainfont=SimSun", // Use SimSun as the main font
		"--wrap=preserve",
		"--embed-resources", // Embed resources into output file
	}

	// Add resource path parameters, helping Pandoc find images.
	// Collected in a set to deduplicate before appending.
	resourcePaths := make(map[string]bool)

	// Add input file directory
	inputDir := filepath.Dir(input)
	resourcePaths[inputDir] = true
	e.Logger.Printf("Added input file directory to resource paths: %s", inputDir)

	// Add current working directory (merged image links are relative to it).
	// A Getwd failure is tolerated: the path is merely omitted.
	workingDir, err := os.Getwd()
	if err == nil {
		resourcePaths[workingDir] = true
		e.Logger.Printf("Added current working directory to resource paths: %s", workingDir)
	}

	// Add output file directory
	outputDir := filepath.Dir(absOutput)
	resourcePaths[outputDir] = true
	e.Logger.Printf("Added output file directory to resource paths: %s", outputDir)

	// Add source file directories to resource paths
	if len(options.SourceDirs) > 0 {
		for _, dir := range options.SourceDirs {
			resourcePaths[dir] = true
			e.Logger.Printf("Added source file directory to resource paths: %s", dir)
		}
	}

	// Add all resource paths to Pandoc arguments
	// (map iteration order is random; Pandoc does not care about ordering).
	for path := range resourcePaths {
		args = append(args, "--resource-path", path)
	}

	// Add template parameter (reference doc, e.g. for docx styling)
	if options.Template != "" {
		e.Logger.Printf("Using template: %s", options.Template)
		args = append(args, "--reference-doc", options.Template)
	}

	// Add table-of-contents parameter
	if options.GenerateToc {
		e.Logger.Println("Generating table of contents")
		args = append(args, "--toc")

		// Add table-of-contents depth parameter
		if options.TocDepth > 0 {
			e.Logger.Printf("Setting table of contents depth to: %d", options.TocDepth)
			args = append(args, "--toc-depth", fmt.Sprintf("%d", options.TocDepth))
		}
	}

	// Add heading level offset parameter
	if options.ShiftHeadingLevelBy != 0 {
		e.Logger.Printf("Shifting heading levels by: %d", options.ShiftHeadingLevelBy)
		args = append(args, "--shift-heading-level-by", fmt.Sprintf("%d", options.ShiftHeadingLevelBy))
	}

	// Add specific parameters based on output format
	e.Logger.Printf("Using output format: %s", options.Format)
	switch options.Format {
	case "pdf":
		// PDF format needs special handling for Chinese
		e.Logger.Println("Adding PDF-specific parameters for CJK support")
		args = append(args,
			"-V", "CJKmainfont=SimSun", // CJK font settings
			"-V", "documentclass=article",
			"-V", "geometry=margin=1in")
	case "epub":
		// EPUB format specific parameters
		e.Logger.Println("Adding EPUB-specific parameters")
		args = append(args, "--epub-chapter-level=1")
	}

	// Execute Pandoc command
	e.Logger.Printf("Executing Pandoc command: %s %s", e.PandocPath, strings.Join(args, " "))
	cmd := exec.Command(e.PandocPath, args...)

	// Set working directory to input file directory, which helps Pandoc find relative paths for images
	cmd.Dir = inputDir

	outputBytes, err := cmd.CombinedOutput()
	if err != nil {
		// If execution fails, include input file content in the error for debugging
		e.Logger.Printf("Pandoc execution failed: %s", err)
		e.Logger.Printf("Pandoc output: %s", string(outputBytes))

		inputContent, readErr := os.ReadFile(tempFile)
		if readErr == nil {
			// Only show the first 500 characters to avoid too much output
			contentPreview := string(inputContent)
			if len(contentPreview) > 500 {
				contentPreview = contentPreview[:500] + "..."
			}
			e.Logger.Printf("Input file preview:\n%s", contentPreview)
			return fmt.Errorf("pandoc execution failed: %s\nOutput: %s\nCommand: %s\nInput file preview:\n%s",
				err, string(outputBytes), strings.Join(cmd.Args, " "), contentPreview)
		}

		return fmt.Errorf("pandoc execution failed: %s\nOutput: %s\nCommand: %s",
			err, string(outputBytes), strings.Join(cmd.Args, " "))
	}

	e.Logger.Printf("Pandoc export completed successfully: %s", output)
	return nil
}
169 |
170 | // createSanitizedCopy Create a sanitized temporary file copy
171 | func createSanitizedCopy(inputFile string, logger *log.Logger) (string, error) {
172 | if logger == nil {
173 | logger = log.New(io.Discard, "", 0)
174 | }
175 |
176 | // Read input file content
177 | logger.Printf("Reading input file: %s", inputFile)
178 | content, err := os.ReadFile(inputFile)
179 | if err != nil {
180 | return "", fmt.Errorf("failed to read input file: %s", err)
181 | }
182 |
183 | // Convert content to string
184 | contentStr := string(content)
185 |
186 | // Remove YAML front matter
187 | logger.Println("Removing YAML front matter...")
188 | yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)
189 | if yamlFrontMatterRegex.MatchString(contentStr) {
190 | logger.Println("YAML front matter found, removing it")
191 | contentStr = yamlFrontMatterRegex.ReplaceAllString(contentStr, "")
192 | }
193 |
194 | // Fix lines that may cause YAML parsing errors
195 | logger.Println("Fixing potential YAML parsing issues...")
196 | lines := strings.Split(contentStr, "\n")
197 | var cleanedLines []string
198 | fixedLines := 0
199 |
200 | for _, line := range lines {
201 | // Skip lines that may cause YAML parsing errors
202 | if strings.Contains(line, ":") && !strings.Contains(line, ": ") && !strings.HasPrefix(line, " ") && !strings.HasPrefix(line, "\t") {
203 | // In this case, there should be a space after the colon, but there isn't, which may cause YAML parsing errors
204 | // Try to fix it
205 | fixedLine := strings.Replace(line, ":", ": ", 1)
206 | cleanedLines = append(cleanedLines, fixedLine)
207 | fixedLines++
208 | logger.Printf("Fixed line with missing space after colon: %s -> %s", line, fixedLine)
209 | } else if strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "- ") && len(line) > 1 {
210 | // In this case, there should be a space after the dash, but there isn't, which may cause YAML parsing errors
211 | // Try to fix it
212 | fixedLine := strings.Replace(line, "-", "- ", 1)
213 | cleanedLines = append(cleanedLines, fixedLine)
214 | fixedLines++
215 | logger.Printf("Fixed line with missing space after dash: %s -> %s", line, fixedLine)
216 | } else {
217 | cleanedLines = append(cleanedLines, line)
218 | }
219 | }
220 |
221 | logger.Printf("Fixed %d lines with potential YAML issues", fixedLines)
222 |
223 | // Create a temporary file
224 | tempDir := os.TempDir()
225 | tempFilePath := filepath.Join(tempDir, "mdctl-sanitized-"+filepath.Base(inputFile))
226 |
227 | // Write sanitized content to temporary file
228 | logger.Printf("Writing sanitized content to temporary file: %s", tempFilePath)
229 | err = os.WriteFile(tempFilePath, []byte(strings.Join(cleanedLines, "\n")), 0644)
230 | if err != nil {
231 | return "", err
232 | }
233 |
234 | return tempFilePath, nil
235 | }
236 |
237 | // preprocessInputFile Preprocess input file, removing content that may cause Pandoc parsing errors
238 | func preprocessInputFile(filePath string) error {
239 | // Read file content
240 | content, err := os.ReadFile(filePath)
241 | if err != nil {
242 | return err
243 | }
244 |
245 | contentStr := string(content)
246 |
247 | // Check for unconventional YAML front matter
248 | yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`)
249 | if yamlFrontMatterRegex.MatchString(contentStr) {
250 | // Extract YAML front matter content
251 | matches := yamlFrontMatterRegex.FindStringSubmatch(contentStr)
252 | if len(matches) > 1 {
253 | yamlContent := matches[1]
254 |
255 | // Check if YAML content has formatting issues
256 | if strings.Contains(yamlContent, "\n-") && !strings.Contains(yamlContent, "\n- ") {
257 | // Fix formatting issue: ensure there's a space after the dash
258 | fixedYaml := strings.ReplaceAll(yamlContent, "\n-", "\n- ")
259 | fixedContent := strings.Replace(contentStr, yamlContent, fixedYaml, 1)
260 |
261 | // Write back to file
262 | return os.WriteFile(filePath, []byte(fixedContent), 0644)
263 | }
264 | }
265 |
266 | // If YAML format has other issues, remove entire front matter
267 | processedContent := yamlFrontMatterRegex.ReplaceAllString(contentStr, "")
268 | return os.WriteFile(filePath, []byte(processedContent), 0644)
269 | }
270 |
271 | return nil
272 | }
273 |
// CheckPandocAvailability reports whether a usable `pandoc` binary can be
// executed. It runs `pandoc --version` and verifies the output mentions
// pandoc; a descriptive installation hint is returned when it cannot run.
func CheckPandocAvailability() error {
	out, err := exec.Command("pandoc", "--version").CombinedOutput()
	if err != nil {
		return fmt.Errorf("pandoc is not available: %s\n\nPlease install Pandoc to use the export feature:\n\n"+
			"macOS: brew install pandoc\n"+
			"Ubuntu/Debian: sudo apt-get install pandoc\n"+
			"Windows: choco install pandoc\n\n"+
			"For more information, visit: https://pandoc.org/installing.html", err)
	}

	// Sanity-check that the binary we ran really is pandoc.
	if versionStr := string(out); !strings.Contains(versionStr, "pandoc") {
		return fmt.Errorf("unexpected pandoc version output: %s", versionStr)
	}

	return nil
}
294 |
--------------------------------------------------------------------------------
/internal/translator/translator.go:
--------------------------------------------------------------------------------
1 | package translator
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "fmt"
7 | "io"
8 | "net/http"
9 | "os"
10 | "path/filepath"
11 | "regexp"
12 | "sort"
13 | "strings"
14 |
15 | "github.com/samzong/mdctl/internal/config"
16 | "github.com/samzong/mdctl/internal/markdownfmt"
17 | "gopkg.in/yaml.v3"
18 | )
19 |
20 | // SupportedLanguages defines the mapping of supported languages
21 | var SupportedLanguages = map[string]string{
22 | "zh": "中文",
23 | "en": "English",
24 | "ja": "日本語",
25 | "ko": "한국어",
26 | "fr": "Français",
27 | "de": "Deutsch",
28 | "es": "Español",
29 | "it": "Italiano",
30 | "ru": "Русский",
31 | "pt": "Português",
32 | "vi": "Tiếng Việt",
33 | "th": "ไทย",
34 | "ar": "العربية",
35 | "hi": "हिन्दी",
36 | }
37 |
38 | // IsLanguageSupported checks if the language is supported
39 | func IsLanguageSupported(lang string) bool {
40 | _, ok := SupportedLanguages[lang]
41 | return ok
42 | }
43 |
44 | // GetSupportedLanguages returns a list of supported languages
45 | func GetSupportedLanguages() string {
46 | var langs []string
47 | for code, name := range SupportedLanguages {
48 | langs = append(langs, fmt.Sprintf("%s (%s)", code, name))
49 | }
50 | sort.Strings(langs)
51 | return strings.Join(langs, ", ")
52 | }
53 |
// OpenAIMessage is a single chat message in an OpenAI-compatible request.
type OpenAIMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

// OpenAIRequest is the request body sent to an OpenAI-compatible
// /chat/completions endpoint.
type OpenAIRequest struct {
	Model       string          `json:"model"`
	Messages    []OpenAIMessage `json:"messages"`
	Temperature float64         `json:"temperature"`
	TopP        float64         `json:"top_p"`
}

// OpenAIResponse is the subset of the /chat/completions response that the
// translator reads (only the message content of each choice).
type OpenAIResponse struct {
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
}

// Progress is used to track translation progress
type Progress struct {
	Total      int    // total number of files to translate
	Current    int    // 1-based index of the file currently being translated
	SourceFile string // path of the file being read
	TargetFile string // path the translation will be written to
}

// ProgressCallback defines the progress callback function type
type ProgressCallback func(progress Progress)

// Translator struct for the translator
type Translator struct {
	config   *config.Config   // endpoint, API key, model and prompt settings
	format   bool             // when true, run markdownfmt on translated output
	progress ProgressCallback // invoked once per file during directory runs
}
91 |
92 | // New creates a new translator instance
93 | func New(cfg *config.Config, format bool) *Translator {
94 | return &Translator{
95 | config: cfg,
96 | format: format,
97 | progress: func(p Progress) {
98 | if p.Total > 1 {
99 | fmt.Printf("Translating file [%d/%d]: %s\n", p.Current, p.Total, p.SourceFile)
100 | }
101 | },
102 | }
103 | }
104 |
105 | var (
106 | // RegexPatterns defines patterns for removing special content blocks
107 | RegexPatterns = []struct {
108 | Pattern string
109 | Replace string
110 | }{
111 | {`(?s).*?\n?`, ""}, // Remove ollama deepthink thinking process
112 | }
113 | )
114 |
115 | // TranslateContent translates the content
116 | func (t *Translator) TranslateContent(content string, lang string) (string, error) {
117 | // Remove potential front matter
118 | content = removeFrontMatter(content)
119 |
120 | prompt := strings.Replace(t.config.TranslatePrompt, "{TARGET_LANG}", lang, 1)
121 |
122 | messages := []OpenAIMessage{
123 | {Role: "system", Content: prompt},
124 | {Role: "user", Content: content},
125 | }
126 |
127 | reqBody := OpenAIRequest{
128 | Model: t.config.ModelName,
129 | Messages: messages,
130 | Temperature: t.config.Temperature,
131 | TopP: t.config.TopP,
132 | }
133 |
134 | jsonData, err := json.Marshal(reqBody)
135 | if err != nil {
136 | return "", fmt.Errorf("failed to marshal request: %v", err)
137 | }
138 |
139 | req, err := http.NewRequest("POST", t.config.OpenAIEndpointURL+"/chat/completions", bytes.NewBuffer(jsonData))
140 | if err != nil {
141 | return "", fmt.Errorf("failed to create request: %v", err)
142 | }
143 |
144 | req.Header.Set("Content-Type", "application/json")
145 | req.Header.Set("Authorization", "Bearer "+t.config.OpenAIAPIKey)
146 |
147 | client := &http.Client{}
148 | resp, err := client.Do(req)
149 | if err != nil {
150 | return "", fmt.Errorf("failed to send request: %v", err)
151 | }
152 | defer resp.Body.Close()
153 |
154 | body, err := io.ReadAll(resp.Body)
155 | if err != nil {
156 | return "", fmt.Errorf("failed to read response: %v", err)
157 | }
158 |
159 | var response OpenAIResponse
160 | if err := json.Unmarshal(body, &response); err != nil {
161 | return "", fmt.Errorf("failed to parse response: %v\nResponse body: %s", err, string(body))
162 | }
163 |
164 | if len(response.Choices) == 0 {
165 | return "", fmt.Errorf("no translation result\nResponse body: %s", string(body))
166 | }
167 |
168 | // Get translated content
169 | translatedContent := response.Choices[0].Message.Content
170 |
171 | // Remove special content blocks
172 | for _, pattern := range RegexPatterns {
173 | translatedContent = regexp.MustCompile(pattern.Pattern).ReplaceAllString(translatedContent, pattern.Replace)
174 | }
175 |
176 | // Remove potential markdown code block markers
177 | translatedContent = strings.TrimPrefix(translatedContent, "\n")
178 |
179 | // If formatting is enabled, format the translated content
180 | if t.format {
181 | formatter := markdownfmt.New(true)
182 | translatedContent = formatter.Format(translatedContent)
183 | }
184 |
185 | return translatedContent, nil
186 | }
187 |
// removeFrontMatter strips a leading "---"-delimited YAML front matter block
// from content. When no complete block is found, content is returned
// unchanged; otherwise the whitespace-trimmed remainder after the closing
// delimiter is returned.
func removeFrontMatter(content string) string {
	trimmed := strings.TrimSpace(content)
	if !strings.HasPrefix(trimmed, "---") {
		return content
	}
	segments := strings.SplitN(trimmed, "---", 3)
	if len(segments) < 3 {
		// Opening delimiter without a closing one — leave content alone.
		return content
	}
	return strings.TrimSpace(segments[2])
}
200 |
// ProcessFile handles translation of a single file.
//
// srcPath is read, its YAML front matter (if any) is split off, the body is
// translated into targetLang, and the result is written to dstPath with a
// "translated: true" flag added to the front matter. If dstPath is an
// existing directory, the source filename is appended to it. A target whose
// front matter already carries "translated: true" is skipped unless force
// is set.
func ProcessFile(srcPath, dstPath, targetLang string, cfg *config.Config, format bool, force bool) error {
	t := New(cfg, format)

	// Check if target path is a directory
	dstInfo, err := os.Stat(dstPath)
	if err == nil && dstInfo.IsDir() {
		dstPath = filepath.Join(dstPath, filepath.Base(srcPath))
	}

	// Check if target file already exists
	if _, err := os.Stat(dstPath); err == nil {
		dstContent, err := os.ReadFile(dstPath)
		if err != nil {
			return fmt.Errorf("failed to read target file: %v", err)
		}

		// Check if already translated: look for "translated: true" in the
		// existing target's front matter.
		var dstFrontMatter map[string]interface{}
		if strings.HasPrefix(string(dstContent), "---\n") {
			// Slice off the opening "---\n" and split on the closing delimiter.
			parts := strings.SplitN(string(dstContent)[4:], "\n---\n", 2)
			if len(parts) == 2 {
				if err := yaml.Unmarshal([]byte(parts[0]), &dstFrontMatter); err != nil {
					return fmt.Errorf("failed to parse target file front matter: %v", err)
				}
				if translated, ok := dstFrontMatter["translated"].(bool); ok && translated {
					if !force {
						fmt.Printf("Skipping %s (already translated, use -F to force translate)\n", srcPath)
						return nil
					}
					fmt.Printf("Force translating %s\n", srcPath)
				}
			}
		}
	}

	// Read source file content
	content, err := os.ReadFile(srcPath)
	if err != nil {
		return fmt.Errorf("failed to read source file: %v", err)
	}

	// Parse front matter: it is preserved (not translated) and re-emitted
	// onto the output file.
	var frontMatter map[string]interface{}
	contentToTranslate := string(content)

	// Check and parse front matter
	if strings.HasPrefix(contentToTranslate, "---\n") {
		parts := strings.SplitN(contentToTranslate[4:], "\n---\n", 2)
		if len(parts) == 2 {
			if err := yaml.Unmarshal([]byte(parts[0]), &frontMatter); err != nil {
				return fmt.Errorf("failed to parse front matter: %v", err)
			}
			// Only the body (after the closing delimiter) is translated.
			contentToTranslate = parts[1]
		}
	}

	// Translate content
	translatedContent, err := t.TranslateContent(contentToTranslate, targetLang)
	if err != nil {
		return fmt.Errorf("failed to translate content: %v", err)
	}

	// Update front matter: mark the output so re-runs can skip it.
	if frontMatter == nil {
		frontMatter = make(map[string]interface{})
	}
	frontMatter["translated"] = true

	// Generate new file content: front matter block followed by the body.
	frontMatterBytes, err := yaml.Marshal(frontMatter)
	if err != nil {
		return fmt.Errorf("failed to marshal front matter: %v", err)
	}

	newContent := fmt.Sprintf("---\n%s---\n\n%s", string(frontMatterBytes), translatedContent)

	// Create target directory if it doesn't exist
	if err := os.MkdirAll(filepath.Dir(dstPath), 0755); err != nil {
		return fmt.Errorf("failed to create target directory: %v", err)
	}

	// Write translated content to target file
	if err := os.WriteFile(dstPath, []byte(newContent), 0644); err != nil {
		return fmt.Errorf("failed to write target file: %v", err)
	}

	return nil
}
290 |
291 | // ProcessDirectory processes all markdown files in the directory
292 | func ProcessDirectory(srcDir, dstDir string, targetLang string, cfg *config.Config, force bool, format bool) error {
293 | // First calculate the total number of files to process
294 | var total int
295 | err := filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
296 | if err != nil {
297 | return err
298 | }
299 | if !info.IsDir() && filepath.Ext(path) == ".md" {
300 | total++
301 | }
302 | return nil
303 | })
304 | if err != nil {
305 | return fmt.Errorf("failed to count files: %v", err)
306 | }
307 |
308 | fmt.Printf("Found %d markdown files to translate\n", total)
309 |
310 | // Create translator instance
311 | t := New(cfg, format)
312 | current := 0
313 |
314 | // Walk through source directory
315 | return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
316 | if err != nil {
317 | return err
318 | }
319 |
320 | // Skip directories
321 | if info.IsDir() {
322 | return nil
323 | }
324 |
325 | // Only process markdown files
326 | ext := filepath.Ext(path)
327 | if ext != ".md" {
328 | return nil
329 | }
330 |
331 | current++
332 |
333 | // Get relative path
334 | relPath, err := filepath.Rel(srcDir, path)
335 | if err != nil {
336 | return fmt.Errorf("failed to get relative path: %v", err)
337 | }
338 |
339 | var dstPath string
340 | if dstDir == "" {
341 | // If target directory is empty, create translation file in source directory
342 | dir := filepath.Dir(path)
343 | base := filepath.Base(path)
344 | nameWithoutExt := strings.TrimSuffix(base, ext)
345 | dstPath = filepath.Join(dir, nameWithoutExt+"_"+targetLang+ext)
346 | } else {
347 | // If a different target directory is specified, use the specified directory structure
348 | dstPath = filepath.Join(dstDir, relPath)
349 | }
350 |
351 | t.progress(Progress{
352 | Total: total,
353 | Current: current,
354 | SourceFile: path,
355 | TargetFile: dstPath,
356 | })
357 |
358 | // Process file
359 | if err := ProcessFile(path, dstPath, targetLang, cfg, format, force); err != nil {
360 | return fmt.Errorf("failed to process file %s: %v", path, err)
361 | }
362 |
363 | return nil
364 | })
365 | }
366 |
--------------------------------------------------------------------------------