├── mdctl.png ├── main.go ├── action.yml ├── go.mod ├── .gitignore ├── .github └── workflows │ ├── update-homebrew.yml │ ├── release.yml │ ├── pr-review.yml │ ├── idoc.yml │ └── docker-build.yml ├── .goreleaser.yaml ├── LICENSE ├── Dockerfile ├── cmd ├── download.go ├── root.go ├── llmstxt.go ├── translate.go ├── export.go ├── lint.go └── upload.go ├── internal ├── storage │ └── provider.go ├── llmstxt │ ├── formatter.go │ ├── fetcher.go │ ├── generator.go │ ├── extractor.go │ └── sitemap.go ├── exporter │ ├── sitereader │ │ ├── reader.go │ │ └── mkdocs.go │ ├── heading.go │ ├── exporter.go │ ├── merger.go │ └── pandoc.go ├── linter │ ├── config.go │ ├── fixer.go │ ├── linter.go │ ├── linter_test.go │ └── rules_test.go ├── cache │ └── cache.go ├── markdownfmt │ └── formatter.go ├── processor │ └── processor.go ├── config │ └── config.go └── translator │ └── translator.go ├── docs ├── DEVELOPMENT.md └── features │ ├── export.md │ └── upload.md ├── README.md ├── go.sum └── Makefile /mdctl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samzong/mdctl/HEAD/mdctl.png -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/samzong/mdctl/cmd" 5 | ) 6 | 7 | func main() { 8 | cmd.Execute() 9 | } 10 | -------------------------------------------------------------------------------- /action.yml: -------------------------------------------------------------------------------- 1 | name: "mdctl CLI" 2 | description: "Run mdctl (Markdown docs toolkit) in GitHub Actions via Docker." 3 | author: "samzong" 4 | branding: 5 | icon: "book" 6 | color: "blue" 7 | 8 | inputs: 9 | args: 10 | description: "Arguments to pass to mdctl (e.g., \"export -f README.md -o out.docx\")." 
11 | required: false 12 | default: "--help" 13 | 14 | runs: 15 | using: "docker" 16 | image: "Dockerfile" 17 | entrypoint: "/bin/sh" 18 | args: 19 | - -c 20 | - mdctl ${{ inputs.args }} 21 | 22 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/samzong/mdctl 2 | 3 | go 1.23.4 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.9.1 7 | github.com/aws/aws-sdk-go v1.55.6 8 | github.com/gobwas/glob v0.2.3 9 | github.com/spf13/cobra v1.8.1 10 | golang.org/x/text v0.23.0 11 | gopkg.in/yaml.v3 v3.0.1 12 | ) 13 | 14 | require ( 15 | github.com/andybalholm/cascadia v1.3.2 // indirect 16 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 17 | github.com/jmespath/go-jmespath v0.4.0 // indirect 18 | github.com/spf13/pflag v1.0.5 // indirect 19 | golang.org/x/net v0.33.0 // indirect 20 | ) 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | mdctl 8 | bin/ 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool, specifically when used with LiteIDE 14 | *.out 15 | 16 | # Dependency directories (remove the comment below to include it) 17 | vendor/ 18 | 19 | # Go workspace file 20 | go.work 21 | 22 | # IDE specific files 23 | .idea/ 24 | .vscode/ 25 | *.swp 26 | *.swo 27 | 28 | # OS generated files 29 | .DS_Store 30 | .DS_Store? 
31 | ._* 32 | .Spotlight-V100 33 | .Trashes 34 | ehthumbs.db 35 | Thumbs.db 36 | 37 | # Project specific 38 | images/ 39 | dist/ 40 | *.docx 41 | *.pdf -------------------------------------------------------------------------------- /.github/workflows/update-homebrew.yml: -------------------------------------------------------------------------------- 1 | name: Update Homebrew Tap 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'Select the tag to update Homebrew' 8 | required: true 9 | type: string 10 | repository_dispatch: 11 | types: [trigger-homebrew-update] 12 | 13 | jobs: 14 | update-homebrew: 15 | runs-on: macos-latest 16 | steps: 17 | - name: Set version 18 | run: | 19 | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then 20 | echo "VERSION=${{ inputs.tag }}" >> $GITHUB_ENV 21 | else 22 | echo "VERSION=${{ github.event.client_payload.version }}" >> $GITHUB_ENV 23 | fi 24 | 25 | - name: Checkout repository 26 | uses: actions/checkout@v4 27 | with: 28 | fetch-depth: 0 29 | 30 | - name: Update Homebrew Formula 31 | env: 32 | GH_PAT: ${{ secrets.GH_PAT }} 33 | run: make update-homebrew -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: write 10 | packages: write 11 | 12 | jobs: 13 | goreleaser: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Set up Go 22 | uses: actions/setup-go@v4 23 | with: 24 | go-version: '>=1.21.0' 25 | cache: true 26 | 27 | - name: Run GoReleaser 28 | uses: goreleaser/goreleaser-action@v5 29 | with: 30 | distribution: goreleaser 31 | version: latest 32 | args: release --clean 33 | env: 34 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | - name: Trigger Homebrew Update 37 | 
if: success() 38 | uses: peter-evans/repository-dispatch@v2 39 | with: 40 | token: ${{ secrets.GH_PAT }} 41 | event-type: trigger-homebrew-update 42 | client-payload: '{"version": "${{ github.ref_name }}"}' -------------------------------------------------------------------------------- /.github/workflows/pr-review.yml: -------------------------------------------------------------------------------- 1 | name: PR Review 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened] 6 | paths-ignore: 7 | - '**.md' 8 | - 'docs/**' 9 | - '.gitignore' 10 | 11 | jobs: 12 | review: 13 | name: Build & Test 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Set up Go 22 | uses: actions/setup-go@v5 23 | with: 24 | go-version: '1.21' 25 | cache: true 26 | 27 | - name: Install dependencies 28 | run: make deps 29 | 30 | - name: Format check 31 | run: | 32 | make fmt 33 | git diff --exit-code || (echo "Code is not formatted. 
Please run 'make fmt'" && exit 1) 34 | 35 | - name: Run tests 36 | run: make test 37 | 38 | - name: Build 39 | run: make build 40 | 41 | - name: Upload artifact 42 | uses: actions/upload-artifact@v4 43 | with: 44 | name: mdctl 45 | path: bin/mdctl -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | before: 2 | hooks: 3 | - go mod tidy 4 | 5 | builds: 6 | - env: 7 | - CGO_ENABLED=0 8 | goos: 9 | - linux 10 | - windows 11 | - darwin 12 | goarch: 13 | - amd64 14 | - arm64 15 | ignore: 16 | - goos: windows 17 | goarch: arm64 18 | ldflags: 19 | - -s -w -X github.com/samzong/mdctl/cmd.Version={{.Version}} -X github.com/samzong/mdctl/cmd.BuildTime={{.Date}} 20 | binary: mdctl 21 | 22 | archives: 23 | - format: tar.gz 24 | name_template: >- 25 | {{ .ProjectName }}_ 26 | {{- title .Os }}_ 27 | {{- if eq .Arch "amd64" }}x86_64 28 | {{- else if eq .Arch "386" }}i386 29 | {{- else }}{{ .Arch }}{{ end }} 30 | {{- if .Arm }}v{{ .Arm }}{{ end }} 31 | format_overrides: 32 | - goos: windows 33 | format: zip 34 | 35 | changelog: 36 | sort: asc 37 | filters: 38 | exclude: 39 | - '^docs:' 40 | - '^test:' 41 | - '^ci:' 42 | - '^chore:' 43 | 44 | checksum: 45 | name_template: 'checksums.txt' 46 | 47 | snapshot: 48 | name_template: "{{ incpatch .Version }}-next" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 samzong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom 
the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23-alpine AS builder 2 | 3 | WORKDIR /app 4 | 5 | # Copy go mod and sum files 6 | COPY go.mod go.sum ./ 7 | 8 | # Download dependencies 9 | RUN go mod download 10 | 11 | # Copy source code 12 | COPY . . 
13 | 14 | # Install git for version information 15 | RUN apk add --no-cache git 16 | 17 | # Set build arguments with defaults 18 | ARG VERSION=dev 19 | ARG BUILD_TIME 20 | 21 | # Set default build time if not provided 22 | RUN if [ -z "$BUILD_TIME" ]; then BUILD_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ"); fi && \ 23 | echo "Building version: $VERSION, build time: $BUILD_TIME" && \ 24 | CGO_ENABLED=0 go build -trimpath -ldflags "-s -w -X github.com/samzong/mdctl/cmd.Version=${VERSION} -X github.com/samzong/mdctl/cmd.BuildTime=${BUILD_TIME}" -o /app/bin/mdctl 25 | 26 | # Use a minimal alpine image for the final stage 27 | FROM alpine:3.19 28 | 29 | # Install ca-certificates for HTTPS requests 30 | RUN apk --no-cache add ca-certificates 31 | 32 | WORKDIR /root/ 33 | 34 | # Copy the binary from the builder stage 35 | COPY --from=builder /app/bin/mdctl /usr/local/bin/mdctl 36 | 37 | # Create config directory 38 | RUN mkdir -p /root/.config/mdctl 39 | 40 | # Set the entrypoint 41 | ENTRYPOINT ["mdctl"] 42 | 43 | # Default command 44 | CMD ["--help"] 45 | -------------------------------------------------------------------------------- /cmd/download.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/samzong/mdctl/internal/processor" 7 | 8 | "github.com/spf13/cobra" 9 | ) 10 | 11 | var ( 12 | sourceFile string 13 | sourceDir string 14 | imageOutputDir string 15 | 16 | downloadCmd = &cobra.Command{ 17 | Use: "download", 18 | Short: "Download remote images in markdown files", 19 | Long: `Download remote images in markdown files to local storage and update references. 
20 | Examples: 21 | mdctl download -f post.md 22 | mdctl download -d content/posts 23 | mdctl download -f post.md -o assets/images`, 24 | RunE: func(cmd *cobra.Command, args []string) error { 25 | if sourceFile == "" && sourceDir == "" { 26 | return fmt.Errorf("either source file (-f) or source directory (-d) must be specified") 27 | } 28 | if sourceFile != "" && sourceDir != "" { 29 | return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)") 30 | } 31 | 32 | p := processor.New(sourceFile, sourceDir, imageOutputDir) 33 | return p.Process() 34 | }, 35 | } 36 | ) 37 | 38 | func init() { 39 | downloadCmd.Flags().StringVarP(&sourceFile, "file", "f", "", "Source markdown file to process") 40 | downloadCmd.Flags().StringVarP(&sourceDir, "dir", "d", "", "Source directory containing markdown files to process") 41 | downloadCmd.Flags().StringVarP(&imageOutputDir, "output", "o", "", "Output directory for downloaded images (optional)") 42 | } 43 | -------------------------------------------------------------------------------- /cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/spf13/cobra" 8 | ) 9 | 10 | var ( 11 | Version = "dev" 12 | BuildTime = "unknown" 13 | verbose bool 14 | veryVerbose bool 15 | 16 | rootCmd = &cobra.Command{ 17 | Use: "mdctl", 18 | Short: "A CLI tool for markdown file operations", 19 | Long: `mdctl is a CLI tool that helps you manage and process markdown files. 
20 | Currently supports downloading remote images and more features to come.`, 21 | Version: fmt.Sprintf("%s (built at %s)", Version, BuildTime), 22 | } 23 | ) 24 | 25 | func Execute() { 26 | if err := rootCmd.Execute(); err != nil { 27 | fmt.Println(err) 28 | os.Exit(1) 29 | } 30 | } 31 | 32 | func init() { 33 | // Add commands first 34 | rootCmd.AddCommand(translateCmd) 35 | rootCmd.AddCommand(downloadCmd) 36 | rootCmd.AddCommand(configCmd) 37 | rootCmd.AddCommand(uploadCmd) 38 | rootCmd.AddCommand(exportCmd) 39 | rootCmd.AddCommand(llmstxtCmd) 40 | rootCmd.AddCommand(lintCmd) 41 | 42 | // Add global flags 43 | rootCmd.PersistentFlags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output") 44 | rootCmd.PersistentFlags().BoolVar(&veryVerbose, "vv", false, "Enable very verbose output with detailed information") 45 | 46 | // Then add groups and set group IDs 47 | rootCmd.AddGroup(&cobra.Group{ 48 | ID: "core", 49 | Title: "Core Commands:", 50 | }) 51 | rootCmd.AddGroup(&cobra.Group{ 52 | ID: "config", 53 | Title: "Configuration Commands:", 54 | }) 55 | 56 | // Set group for each command 57 | translateCmd.GroupID = "core" 58 | downloadCmd.GroupID = "core" 59 | uploadCmd.GroupID = "core" 60 | exportCmd.GroupID = "core" 61 | llmstxtCmd.GroupID = "core" 62 | lintCmd.GroupID = "core" 63 | configCmd.GroupID = "config" 64 | } 65 | -------------------------------------------------------------------------------- /internal/storage/provider.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "github.com/samzong/mdctl/internal/config" 5 | ) 6 | 7 | // Provider defines the interface for storage providers 8 | type Provider interface { 9 | // Upload uploads a file to cloud storage 10 | Upload(localPath, remotePath string, metadata map[string]string) (string, error) 11 | 12 | // Configure sets up the provider with the given configuration 13 | Configure(config config.CloudConfig) error 14 | 15 | 
// GetPublicURL returns the public URL for a remote path 16 | GetPublicURL(remotePath string) string 17 | 18 | // ObjectExists checks if an object exists in the storage 19 | ObjectExists(remotePath string) (bool, error) 20 | 21 | // CompareHash compares a local hash with a remote object's hash 22 | CompareHash(remotePath, localHash string) (bool, error) 23 | 24 | // SetObjectMetadata sets metadata for an object 25 | SetObjectMetadata(remotePath string, metadata map[string]string) error 26 | 27 | // GetObjectMetadata retrieves metadata for an object 28 | GetObjectMetadata(remotePath string) (map[string]string, error) 29 | } 30 | 31 | // ProviderFactory is a function that creates a new storage provider 32 | type ProviderFactory func() Provider 33 | 34 | var providers = make(map[string]ProviderFactory) 35 | 36 | // RegisterProvider registers a storage provider factory 37 | func RegisterProvider(name string, factory ProviderFactory) { 38 | providers[name] = factory 39 | } 40 | 41 | // GetProvider returns a storage provider by name 42 | func GetProvider(name string) (Provider, bool) { 43 | factory, exists := providers[name] 44 | if !exists { 45 | return nil, false 46 | } 47 | return factory(), true 48 | } 49 | 50 | // ListProviders returns a list of available provider names 51 | func ListProviders() []string { 52 | var names []string 53 | for name := range providers { 54 | names = append(names, name) 55 | } 56 | return names 57 | } 58 | -------------------------------------------------------------------------------- /internal/llmstxt/formatter.go: -------------------------------------------------------------------------------- 1 | package llmstxt 2 | 3 | import ( 4 | "strings" 5 | "unicode" 6 | ) 7 | 8 | // Format to Markdown content 9 | func (g *Generator) formatContent(sections map[string][]PageInfo) string { 10 | var buf strings.Builder 11 | 12 | // Get sorted section list 13 | sectionNames := g.getSortedSections(sections) 14 | 15 | // Find root page info 16 | var 
rootPage PageInfo 17 | if rootPages, ok := sections["ROOT"]; ok && len(rootPages) > 0 { 18 | rootPage = rootPages[0] 19 | } 20 | 21 | // Add document title 22 | buf.WriteString("# ") 23 | buf.WriteString(rootPage.Title) 24 | buf.WriteString("\n\n") 25 | 26 | // Add document description 27 | buf.WriteString("> ") 28 | buf.WriteString(rootPage.Description) 29 | buf.WriteString("\n\n") 30 | 31 | // Handle each section 32 | for _, section := range sectionNames { 33 | // Skip ROOT section, because it's already used for title and description 34 | if section == "ROOT" { 35 | continue 36 | } 37 | 38 | // Add section title 39 | buf.WriteString("## ") 40 | buf.WriteString(capitalizeString(section)) 41 | buf.WriteString("\n\n") 42 | 43 | // Add page info for each page in section 44 | for _, page := range sections[section] { 45 | buf.WriteString("- [") 46 | buf.WriteString(page.Title) 47 | buf.WriteString("](") 48 | buf.WriteString(page.URL) 49 | buf.WriteString("): ") 50 | buf.WriteString(page.Description) 51 | buf.WriteString("\n") 52 | 53 | // Add page content in full mode 54 | if g.config.FullMode && page.Content != "" { 55 | buf.WriteString("\n") 56 | buf.WriteString(page.Content) 57 | buf.WriteString("\n") 58 | } 59 | 60 | buf.WriteString("\n") 61 | } 62 | } 63 | 64 | return buf.String() 65 | } 66 | 67 | // Capitalize first letter, lowercase the rest 68 | func capitalizeString(str string) string { 69 | if str == "" { 70 | return "" 71 | } 72 | 73 | runes := []rune(str) 74 | return string(unicode.ToUpper(runes[0])) + strings.ToLower(string(runes[1:])) 75 | } 76 | -------------------------------------------------------------------------------- /.github/workflows/idoc.yml: -------------------------------------------------------------------------------- 1 | # 📖 Simple document generation tool! Dependence Node.js run. 
2 | # https://github.com/jaywcjlove/idoc 3 | 4 | name: idoc 5 | on: 6 | push: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | build-deploy: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: actions/setup-node@v4 16 | with: 17 | node-version: 20 18 | registry-url: "https://registry.npmjs.org" 19 | 20 | - name: Create idoc config. 21 | run: | 22 | cat > idoc.yml << EOF 23 | site: mdctl 24 | description: A command-line tool for processing Markdown files. Currently, it supports automatically downloading remote images to local storage and updating the image references in Markdown files, as well as translating markdown files using AI models. 25 | keywords: Markdown processor,CLI tool,Image downloader,Markdown translator,AI translation,Markdown automation,Remote image handling,Markdown utilities,AI-powered Markdown,Markdown enhancement,Markdown file management 26 | favicon: assets/favicon.ico 27 | logo: assets/icon.png 28 | 29 | openSource: https://github.com/samzong/mdctl 30 | 31 | tocs: false 32 | 33 | element: 34 | wrapper: style=max-width:720px; 35 | 36 | menus: 37 | Home: index.html 38 | About: 39 | url: https://github.com/samzong 40 | target: __blank 41 | sideEffectFiles: 42 | - README_zh.md 43 | 44 | cacheFileStat: true 45 | 46 | footer: | 47 | Copyright © {{idocYear}} samzong
48 | EOF 49 | 50 | - run: npm install idoc@1 -g 51 | - run: idoc 52 | 53 | - name: Deploy 54 | uses: peaceiris/actions-gh-pages@v4 55 | if: github.ref == 'refs/heads/main' 56 | with: 57 | github_token: ${{ secrets.GITHUB_TOKEN }} 58 | publish_dir: ./dist 59 | -------------------------------------------------------------------------------- /internal/exporter/sitereader/reader.go: -------------------------------------------------------------------------------- 1 | package sitereader 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "path/filepath" 9 | ) 10 | 11 | // SiteReader Define Site Reader Interface 12 | type SiteReader interface { 13 | // Detect if given directory is this type of site 14 | Detect(dir string) bool 15 | 16 | // Read site structure, return sorted list of files 17 | // navPath parameter is used to specify the navigation path to export, empty to export all 18 | ReadStructure(dir string, configPath string, navPath string) ([]string, error) 19 | } 20 | 21 | // GetSiteReader Return the appropriate reader based on site type 22 | func GetSiteReader(siteType string, verbose bool, logger *log.Logger) (SiteReader, error) { 23 | // If no logger is provided, create a default one 24 | if logger == nil { 25 | if verbose { 26 | logger = log.New(os.Stdout, "[SITE-READER] ", log.LstdFlags) 27 | } else { 28 | logger = log.New(io.Discard, "", 0) 29 | } 30 | } 31 | 32 | logger.Printf("Creating site reader for type: %s", siteType) 33 | 34 | switch siteType { 35 | case "mkdocs": 36 | logger.Println("Using MkDocs site reader") 37 | return &MkDocsReader{Logger: logger}, nil 38 | case "hugo": 39 | logger.Println("Hugo site type is not yet implemented") 40 | return nil, fmt.Errorf("hugo site type is not yet implemented") 41 | case "docusaurus": 42 | logger.Println("Docusaurus site type is not yet implemented") 43 | return nil, fmt.Errorf("docusaurus site type is not yet implemented") 44 | default: 45 | logger.Printf("Unsupported site type: %s", siteType) 46 | 
return nil, fmt.Errorf("unsupported site type: %s", siteType) 47 | } 48 | } 49 | 50 | // FindConfigFile Find config file in given directory 51 | func FindConfigFile(dir string, configNames []string) (string, error) { 52 | // If no config file name is provided, use default values 53 | if len(configNames) == 0 { 54 | configNames = []string{"config.yml", "config.yaml"} 55 | } 56 | 57 | // Find config file 58 | for _, name := range configNames { 59 | configPath := filepath.Join(dir, name) 60 | if _, err := os.Stat(configPath); err == nil { 61 | return configPath, nil 62 | } 63 | } 64 | 65 | return "", fmt.Errorf("no config file found in %s", dir) 66 | } 67 | -------------------------------------------------------------------------------- /cmd/llmstxt.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/samzong/mdctl/internal/llmstxt" 8 | "github.com/spf13/cobra" 9 | ) 10 | 11 | var ( 12 | includePaths []string 13 | excludePaths []string 14 | outputPath string 15 | fullMode bool 16 | concurrency int 17 | timeout int 18 | maxPages int 19 | 20 | llmstxtCmd = &cobra.Command{ 21 | Use: "llmstxt [url]", 22 | Short: "Generate llms.txt from sitemap.xml", 23 | Long: `Generate a llms.txt file from a website's sitemap.xml. This file is a curated 24 | list of the website's pages in markdown format, perfect for training or fine-tuning 25 | language models. 26 | 27 | In standard mode, only title and description are extracted. In full mode (-f flag), 28 | the content of each page is also extracted. 
29 | 30 | Examples: 31 | # Standard mode 32 | mdctl llmstxt https://example.com/sitemap.xml > llms.txt 33 | 34 | # Full-content mode 35 | mdctl llmstxt -f https://example.com/sitemap.xml > llms-full.txt`, 36 | Args: cobra.ExactArgs(1), 37 | RunE: func(cmd *cobra.Command, args []string) error { 38 | sitemapURL := args[0] 39 | 40 | // Create a generator and configure options 41 | config := llmstxt.GeneratorConfig{ 42 | SitemapURL: sitemapURL, 43 | IncludePaths: includePaths, 44 | ExcludePaths: excludePaths, 45 | FullMode: fullMode, 46 | Concurrency: concurrency, 47 | Timeout: timeout, 48 | UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36", 49 | Verbose: verbose, 50 | VeryVerbose: veryVerbose, 51 | MaxPages: maxPages, 52 | } 53 | 54 | generator := llmstxt.NewGenerator(config) 55 | 56 | // Execute generation 57 | content, err := generator.Generate() 58 | if err != nil { 59 | return err 60 | } 61 | 62 | // Output content 63 | if outputPath == "" { 64 | // Output to standard output 65 | fmt.Println(content) 66 | } else { 67 | // Output to file 68 | return os.WriteFile(outputPath, []byte(content), 0644) 69 | } 70 | 71 | return nil 72 | }, 73 | } 74 | ) 75 | 76 | func init() { 77 | llmstxtCmd.Flags().StringVarP(&outputPath, "output", "o", "", "Output file path (default: stdout)") 78 | llmstxtCmd.Flags().StringSliceVarP(&includePaths, "include-path", "i", []string{}, "Glob patterns for paths to include (can be specified multiple times)") 79 | llmstxtCmd.Flags().StringSliceVarP(&excludePaths, "exclude-path", "e", []string{}, "Glob patterns for paths to exclude (can be specified multiple times)") 80 | llmstxtCmd.Flags().BoolVarP(&fullMode, "full", "f", false, "Enable full-content mode (extract page content)") 81 | llmstxtCmd.Flags().IntVarP(&concurrency, "concurrency", "c", 5, "Number of concurrent requests") 82 | llmstxtCmd.Flags().IntVar(&timeout, "timeout", 30, "Request timeout in seconds") 83 | 
llmstxtCmd.Flags().IntVar(&maxPages, "max-pages", 0, "Maximum number of pages to process (0 for unlimited)") 84 | 85 | // Add command to core group 86 | llmstxtCmd.GroupID = "core" 87 | 88 | rootCmd.AddCommand(llmstxtCmd) 89 | } 90 | -------------------------------------------------------------------------------- /internal/llmstxt/fetcher.go: -------------------------------------------------------------------------------- 1 | package llmstxt 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | // Fetch pages concurrently using a worker pool 11 | func (g *Generator) fetchPages(urls []string) ([]PageInfo, error) { 12 | g.logger.Printf("Starting to fetch %d pages with concurrency %d", len(urls), g.config.Concurrency) 13 | 14 | // Create result and error channels 15 | resultChan := make(chan PageInfo, len(urls)) 16 | errorChan := make(chan error, len(urls)) 17 | 18 | // Create work channel, controlling concurrency 19 | workChan := make(chan string, len(urls)) 20 | 21 | // Start worker pool 22 | var wg sync.WaitGroup 23 | for i := 0; i < g.config.Concurrency; i++ { 24 | wg.Add(1) 25 | go func() { 26 | defer wg.Done() 27 | for urlStr := range workChan { 28 | pageInfo, err := g.fetchPageContent(urlStr) 29 | if err != nil { 30 | g.logger.Printf("Warning: failed to fetch page %s: %v", urlStr, err) 31 | errorChan <- fmt.Errorf("failed to fetch page %s: %w", urlStr, err) 32 | continue 33 | } 34 | resultChan <- pageInfo 35 | } 36 | }() 37 | } 38 | 39 | // Send all URLs to work channel 40 | for _, urlStr := range urls { 41 | workChan <- urlStr 42 | } 43 | close(workChan) 44 | 45 | // Wait for all work to finish 46 | wg.Wait() 47 | close(resultChan) 48 | close(errorChan) 49 | 50 | // Collect results 51 | var results []PageInfo 52 | for result := range resultChan { 53 | results = append(results, result) 54 | g.logger.Printf("Fetched page: %s", result.URL) 55 | } 56 | 57 | // Check for errors (don't interrupt processing, just log warnings) 58 | for err := 
range errorChan { 59 | g.logger.Printf("Warning: %v", err) 60 | } 61 | 62 | g.logger.Printf("Successfully fetched %d/%d pages", len(results), len(urls)) 63 | 64 | return results, nil 65 | } 66 | 67 | // Get the content of a single page 68 | func (g *Generator) fetchPageContent(urlStr string) (PageInfo, error) { 69 | // Set HTTP client 70 | client := &http.Client{ 71 | Timeout: time.Duration(g.config.Timeout) * time.Second, 72 | } 73 | 74 | // Build request 75 | req, err := http.NewRequest("GET", urlStr, nil) 76 | if err != nil { 77 | return PageInfo{}, fmt.Errorf("failed to create request: %w", err) 78 | } 79 | 80 | // Set User-Agent 81 | req.Header.Set("User-Agent", g.config.UserAgent) 82 | 83 | // Send request 84 | start := time.Now() 85 | resp, err := client.Do(req) 86 | if err != nil { 87 | return PageInfo{}, fmt.Errorf("failed to fetch page: %w", err) 88 | } 89 | defer resp.Body.Close() 90 | 91 | if resp.StatusCode != http.StatusOK { 92 | return PageInfo{}, fmt.Errorf("failed to fetch page, status code: %d", resp.StatusCode) 93 | } 94 | 95 | // Extract page information 96 | pageInfo, err := g.extractPageInfo(urlStr, resp) 97 | if err != nil { 98 | return PageInfo{}, fmt.Errorf("failed to extract page info: %w", err) 99 | } 100 | 101 | // Record timing information 102 | elapsed := time.Since(start).Round(time.Millisecond) 103 | g.logger.Printf("Fetched %s in %v", urlStr, elapsed) 104 | 105 | return pageInfo, nil 106 | } 107 | -------------------------------------------------------------------------------- /docs/DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # mdctl 开发者指南 2 | 3 | ## 项目介绍 4 | 5 | mdctl 是一个用于处理 Markdown 文件的命令行工具,主要功能包括: 6 | 7 | 1. **下载功能**:自动下载 Markdown 文件中的远程图片到本地,并更新引用路径 8 | 2. **翻译功能**:使用 AI 模型将 Markdown 文件翻译成多种语言 9 | 3. **上传功能**:将本地图片上传到云存储,并更新 Markdown 文件中的引用 10 | 4. **配置管理**:管理工具的配置信息 11 | 5. 
**其他功能**:如导出为其他格式、生成 llms.txt 文件等 12 | 13 | ## 项目结构 14 | 15 | ```bash 16 | ../mdctl 17 | ├── cmd 18 | │ ├── config.go 19 | │ ├── download.go 20 | │ ├── export.go 21 | │ ├── llmstxt.go 22 | │ ├── root.go 23 | │ ├── translate.go 24 | │ └── upload.go 25 | ├── internal 26 | │ ├── cache 27 | │ ├── config 28 | │ ├── exporter 29 | │ ├── llmstxt 30 | │ ├── markdownfmt 31 | │ ├── processor 32 | │ ├── storage 33 | │ ├── translator 34 | │ └── uploader 35 | ├── main.go 36 | ├── go.mod 37 | ├── go.sum 38 | ``` 39 | 40 | ## 核心模块说明 41 | 42 | ### 命令行模块 (cmd/) 43 | 44 | 使用 [Cobra](https://github.com/spf13/cobra) 库实现命令行界面,主要命令包括: 45 | 46 | - **root**: 根命令,定义基本信息和版本 47 | - **download**: 下载远程图片到本地 48 | - **translate**: 翻译 Markdown 文件 49 | - **upload**: 上传本地图片到云存储 50 | - **config**: 管理配置信息 51 | 52 | ### 处理器模块 (internal/processor/) 53 | 54 | 负责处理 Markdown 文件中的远程图片下载,主要功能: 55 | 56 | - 解析 Markdown 文件中的图片链接 57 | - 下载远程图片到本地 58 | - 更新 Markdown 文件中的图片引用路径 59 | 60 | ### 翻译模块 (internal/translator/) 61 | 62 | 负责翻译 Markdown 文件,主要功能: 63 | 64 | - 支持多种语言翻译 65 | - 保持 Markdown 格式和 front matter 不变 66 | - 使用 AI 模型进行翻译 67 | - 支持目录结构的翻译 68 | 69 | ### 上传模块 (internal/uploader/) 70 | 71 | 负责上传本地图片到云存储,主要功能: 72 | 73 | - 解析 Markdown 文件中的本地图片链接 74 | - 上传图片到云存储 75 | - 更新 Markdown 文件中的图片引用路径 76 | - 支持多种冲突处理策略 77 | 78 | ### 存储模块 (internal/storage/) 79 | 80 | 定义存储提供者接口和实现,主要功能: 81 | 82 | - 提供统一的存储接口 83 | - 支持 S3 兼容的存储服务 84 | - 处理文件上传和元数据管理 85 | 86 | ### llms.txt 生成模块 (internal/llmstxt/) 87 | 88 | 负责从网站的 sitemap.xml 生成 llms.txt 文件,主要功能: 89 | 90 | - 解析 sitemap.xml 文件 91 | - 访问每个 URL 并提取页面内容 92 | - 生成格式化的 llms.txt 文档 93 | 94 | ### 配置模块 (internal/config/) 95 | 96 | 负责管理配置信息,主要功能: 97 | 98 | - 加载和保存配置文件 99 | - 管理 AI 模型配置 100 | - 管理云存储配置 101 | 102 | ## 开发风格和约定 103 | 104 | ### 代码组织 105 | 106 | 1. **命令与实现分离**:命令行接口在 `cmd/` 目录,具体实现在 `internal/` 目录 107 | 2. **模块化设计**:每个功能都有独立的模块,如处理器、翻译器、上传器等 108 | 3. 
**接口定义**:使用接口定义模块间交互,如存储提供者接口 109 | 110 | ### 错误处理 111 | 112 | 错误处理采用 Go 语言的标准方式,通过返回错误值进行传递和处理。 113 | 114 | ### 配置管理 115 | 116 | 配置文件存储在 `~/.config/mdctl/config.json`,包含: 117 | 118 | - AI 模型配置(端点、API 密钥、模型名称等) 119 | - 云存储配置(提供者、区域、访问密钥等) 120 | 121 | ### 日志输出 122 | 123 | 使用标准输出进行日志记录,提供详细的处理信息和错误信息。 124 | 125 | ## 添加新功能的步骤 126 | 127 | 1. **定义命令**:在 `cmd/` 目录下创建新的命令文件,定义命令行接口 128 | 2. **实现功能**:在 `internal/` 目录下创建相应的实现模块 129 | 3. **注册命令**:在 `cmd/root.go` 的 `init()` 函数中注册新命令 130 | 4. **更新文档**:更新 README 文件,添加新功能的说明 131 | 132 | ## 构建和发布 133 | 134 | 项目使用 Makefile 和 GoReleaser 进行构建和发布: 135 | 136 | - **构建**:使用 `make build` 命令构建项目 137 | - **发布**:使用 `make release` 命令发布新版本 138 | 139 | ## 扩展点 140 | 141 | ### 添加新的存储提供者 142 | 143 | 1. 在 `internal/storage/` 目录下创建新的提供者实现 144 | 2. 实现 `Provider` 接口 145 | 3. 在初始化时注册提供者 146 | 147 | ### 添加新的 AI 模型支持 148 | 149 | 1. 在 `internal/translator/` 目录下扩展翻译器实现 150 | 2. 添加新模型的 API 调用 151 | 3. 更新配置模块以支持新模型的配置 152 | 153 | ### 添加新的 Markdown 处理功能 154 | 155 | 1. 创建新的处理器模块 156 | 2. 实现 Markdown 解析和处理逻辑 157 | 3. 
var (
	// Match ATX-style headings (those starting with #)
	atxHeadingRegex = regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
	// Match Setext-style level 1 headings (a line of '=' under the title)
	setextHeading1Regex = regexp.MustCompile(`^=+\s*$`)
	// Match Setext-style level 2 headings (a line of '-' under the title).
	// NOTE(review): this pattern also matches thematic breaks ("---"); a
	// non-empty previous line is required before treating it as an underline.
	setextHeading2Regex = regexp.MustCompile(`^-+\s*$`)
)

// ShiftHeadings adjusts every Markdown heading in content by shiftBy levels.
// Both ATX headings ("## Title") and Setext headings (a title underlined
// with '=' or '-') are handled; Setext headings are rewritten as ATX.
// A heading shifted above level 6 degrades to bold text; a heading shifted
// below level 1 is clamped to level 1 (the previous implementation passed a
// negative count to strings.Repeat, which panics, e.g. for shiftBy=-2 on
// "# Title"). Lines are rejoined with "\n"; a trailing newline in the input
// is not preserved, matching the original behavior.
func ShiftHeadings(content string, shiftBy int) string {
	if shiftBy == 0 {
		return content
	}

	scanner := bufio.NewScanner(strings.NewReader(content))
	var result []string
	// prevLine holds the most recent line that could serve as the text of a
	// Setext heading. It is cleared once consumed, so a second underline in
	// a row can no longer re-convert the same (stale) text — a bug in the
	// previous version, which kept prevLine across consecutive underlines.
	var prevLine string

	for scanner.Scan() {
		line := scanner.Text()

		switch {
		case atxHeadingRegex.MatchString(line):
			m := atxHeadingRegex.FindStringSubmatch(line)
			result = append(result, renderHeading(len(m[1])+shiftBy, m[2]))
			prevLine = line
		case setextHeading1Regex.MatchString(line) && prevLine != "":
			// The title line was already appended; replace it in place.
			// prevLine != "" guarantees result is non-empty here.
			result[len(result)-1] = renderHeading(1+shiftBy, prevLine)
			prevLine = ""
		case setextHeading2Regex.MatchString(line) && prevLine != "":
			result[len(result)-1] = renderHeading(2+shiftBy, prevLine)
			prevLine = ""
		default:
			result = append(result, line)
			prevLine = line
		}
	}

	return strings.Join(result, "\n")
}

// renderHeading renders heading text at the given level, clamping to the
// valid ATX range [1, 6]; levels above 6 fall back to bold text.
func renderHeading(level int, text string) string {
	if level < 1 {
		level = 1
	}
	if level > 6 {
		return fmt.Sprintf("**%s**", text)
	}
	return fmt.Sprintf("%s %s", strings.Repeat("#", level), text)
}

// AddTitleFromFilename prepends a heading derived from filename to content.
// The .md/.markdown extension is stripped, '_' and '-' become spaces, and
// each word is capitalized. strings.Title is deprecated (it mishandles
// Unicode word boundaries), so capitalization is done explicitly. Levels
// above 6 fall back to bold text; levels below 1 are clamped to 1 (the
// previous code produced a malformed " Title" line for level 0 and panicked
// in strings.Repeat for negative levels).
func AddTitleFromFilename(content, filename string, level int) string {
	title := strings.TrimSuffix(filename, ".md")
	title = strings.TrimSuffix(title, ".markdown")

	// Replace underscores and hyphens with spaces for readability.
	title = strings.ReplaceAll(title, "_", " ")
	title = strings.ReplaceAll(title, "-", " ")
	title = capitalizeWords(title)

	if level < 1 {
		level = 1
	}
	if level > 6 {
		return fmt.Sprintf("**%s**\n\n", title) + content
	}
	return fmt.Sprintf("%s %s\n\n", strings.Repeat("#", level), title) + content
}

// capitalizeWords upper-cases the first letter of every word, where a word
// starts after any character that is neither a letter nor a digit — the
// same boundary rule the deprecated strings.Title applied to ASCII input.
func capitalizeWords(s string) string {
	runes := []rune(s)
	startWord := true
	for i, r := range runes {
		if startWord {
			runes[i] = unicode.ToUpper(r)
		}
		startWord = !unicode.IsLetter(r) && !unicode.IsDigit(r)
	}
	return string(runes)
}
fetch-depth: 0 27 | 28 | - name: Set up QEMU 29 | uses: docker/setup-qemu-action@v3 30 | 31 | - name: Setup Docker Buildx 32 | uses: docker/setup-buildx-action@v3 33 | 34 | - name: Login to GitHub Container Registry 35 | uses: docker/login-action@v3 36 | with: 37 | registry: ghcr.io 38 | username: ${{ github.actor }} 39 | password: ${{ secrets.GITHUB_TOKEN }} 40 | 41 | - name: Extract Metadata 42 | id: meta 43 | uses: docker/metadata-action@v5 44 | with: 45 | images: ghcr.io/${{ github.repository_owner }}/mdctl 46 | tags: | 47 | type=ref,event=branch 48 | type=ref,event=pr 49 | type=semver,pattern={{version}} 50 | type=semver,pattern={{major}}.{{minor}} 51 | type=semver,pattern={{major}} 52 | type=sha,format=short 53 | type=raw,value=latest,enable=${{ github.ref_type == 'tag' }} 54 | 55 | - name: Display tags 56 | run: | 57 | echo "Generated tags: ${{ steps.meta.outputs.tags }}" 58 | echo "Ref type: ${{ github.ref_type }}" 59 | echo "Ref: ${{ github.ref }}" 60 | 61 | # Set explicit latest tag for tag events 62 | - name: Set explicit latest tag 63 | if: startsWith(github.ref, 'refs/tags/') 64 | run: echo "EXTRA_TAGS=ghcr.io/${{ github.repository_owner }}/mdctl:latest" >> $GITHUB_ENV 65 | 66 | # Get version information 67 | - name: Get version info 68 | id: version_info 69 | run: | 70 | # Get version from tag or git describe 71 | if [[ "$GITHUB_REF_TYPE" == "tag" ]]; then 72 | VERSION="${GITHUB_REF_NAME}" 73 | else 74 | VERSION="$(git describe --tags --always || echo 'dev')" 75 | fi 76 | 77 | # Get build time 78 | BUILD_TIME="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" 79 | 80 | # Set outputs 81 | echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT 82 | echo "BUILD_TIME=${BUILD_TIME}" >> $GITHUB_OUTPUT 83 | 84 | # Display for debugging 85 | echo "Version: ${VERSION}" 86 | echo "Build time: ${BUILD_TIME}" 87 | 88 | - name: Build and Push Multi-arch Image 89 | uses: docker/build-push-action@v5 90 | with: 91 | context: . 
92 | platforms: linux/amd64,linux/arm64 93 | push: true 94 | tags: ${{ steps.meta.outputs.tags }}${{ env.EXTRA_TAGS != '' && format(',{0}', env.EXTRA_TAGS) || '' }} 95 | labels: ${{ steps.meta.outputs.labels }} 96 | build-args: | 97 | VERSION=${{ steps.version_info.outputs.VERSION }} 98 | BUILD_TIME=${{ steps.version_info.outputs.BUILD_TIME }} 99 | cache-from: type=gha 100 | cache-to: type=gha,mode=max 101 | -------------------------------------------------------------------------------- /internal/linter/config.go: -------------------------------------------------------------------------------- 1 | package linter 2 | 3 | import ( 4 | "encoding/json" 5 | "os" 6 | "path/filepath" 7 | ) 8 | 9 | // ConfigFile represents a markdownlint configuration file 10 | type ConfigFile struct { 11 | // Default configuration 12 | Default bool `json:"default,omitempty"` 13 | 14 | // Extends other configuration files 15 | Extends string `json:"extends,omitempty"` 16 | 17 | // Rule-specific configuration 18 | MD001 *RuleConfig `json:"MD001,omitempty"` 19 | MD003 *RuleConfig `json:"MD003,omitempty"` 20 | MD009 *RuleConfig `json:"MD009,omitempty"` 21 | MD010 *RuleConfig `json:"MD010,omitempty"` 22 | MD012 *RuleConfig `json:"MD012,omitempty"` 23 | MD013 *RuleConfig `json:"MD013,omitempty"` 24 | MD018 *RuleConfig `json:"MD018,omitempty"` 25 | MD019 *RuleConfig `json:"MD019,omitempty"` 26 | MD023 *RuleConfig `json:"MD023,omitempty"` 27 | MD032 *RuleConfig `json:"MD032,omitempty"` 28 | MD047 *RuleConfig `json:"MD047,omitempty"` 29 | } 30 | 31 | // RuleConfig represents configuration for a specific rule 32 | type RuleConfig struct { 33 | // Whether the rule is enabled 34 | Enabled *bool `json:"enabled,omitempty"` 35 | 36 | // Rule-specific options 37 | Options map[string]interface{} `json:"options,omitempty"` 38 | } 39 | 40 | // LoadConfigFile loads configuration from a file 41 | func LoadConfigFile(filename string) (*ConfigFile, error) { 42 | // Try to find config file if not specified 43 | 
if filename == "" { 44 | filename = findConfigFile() 45 | } 46 | 47 | if filename == "" { 48 | return &ConfigFile{Default: true}, nil 49 | } 50 | 51 | data, err := os.ReadFile(filename) 52 | if err != nil { 53 | return nil, err 54 | } 55 | 56 | var config ConfigFile 57 | if err := json.Unmarshal(data, &config); err != nil { 58 | return nil, err 59 | } 60 | 61 | return &config, nil 62 | } 63 | 64 | // ApplyToRuleSet applies the configuration to a rule set 65 | func (c *ConfigFile) ApplyToRuleSet(rs *RuleSet) { 66 | ruleConfigs := map[string]*RuleConfig{ 67 | "MD001": c.MD001, 68 | "MD003": c.MD003, 69 | "MD009": c.MD009, 70 | "MD010": c.MD010, 71 | "MD012": c.MD012, 72 | "MD013": c.MD013, 73 | "MD018": c.MD018, 74 | "MD019": c.MD019, 75 | "MD023": c.MD023, 76 | "MD032": c.MD032, 77 | "MD047": c.MD047, 78 | } 79 | 80 | for ruleID, ruleConfig := range ruleConfigs { 81 | if ruleConfig != nil && ruleConfig.Enabled != nil { 82 | if rule, exists := rs.rules[ruleID]; exists { 83 | rule.SetEnabled(*ruleConfig.Enabled) 84 | } 85 | } 86 | } 87 | } 88 | 89 | // findConfigFile looks for common markdownlint config files 90 | func findConfigFile() string { 91 | configFiles := []string{ 92 | ".markdownlint.json", 93 | ".markdownlint.jsonc", 94 | ".markdownlintrc", 95 | ".markdownlintrc.json", 96 | ".markdownlintrc.jsonc", 97 | } 98 | 99 | for _, filename := range configFiles { 100 | if _, err := os.Stat(filename); err == nil { 101 | return filename 102 | } 103 | } 104 | 105 | // Also check in home directory 106 | if home, err := os.UserHomeDir(); err == nil { 107 | for _, filename := range configFiles { 108 | fullPath := filepath.Join(home, filename) 109 | if _, err := os.Stat(fullPath); err == nil { 110 | return fullPath 111 | } 112 | } 113 | } 114 | 115 | return "" 116 | } 117 | 118 | // CreateDefaultConfig creates a default configuration file 119 | func CreateDefaultConfig(filename string) error { 120 | config := ConfigFile{ 121 | Default: true, 122 | MD001: 
&RuleConfig{Enabled: boolPtr(true)}, 123 | MD003: &RuleConfig{Enabled: boolPtr(true)}, 124 | MD009: &RuleConfig{Enabled: boolPtr(true)}, 125 | MD010: &RuleConfig{Enabled: boolPtr(true)}, 126 | MD012: &RuleConfig{Enabled: boolPtr(true)}, 127 | MD013: &RuleConfig{Enabled: boolPtr(true)}, 128 | MD018: &RuleConfig{Enabled: boolPtr(true)}, 129 | MD019: &RuleConfig{Enabled: boolPtr(true)}, 130 | MD023: &RuleConfig{Enabled: boolPtr(true)}, 131 | MD032: &RuleConfig{Enabled: boolPtr(true)}, 132 | MD047: &RuleConfig{Enabled: boolPtr(true)}, 133 | } 134 | 135 | data, err := json.MarshalIndent(config, "", " ") 136 | if err != nil { 137 | return err 138 | } 139 | 140 | return os.WriteFile(filename, data, 0644) 141 | } 142 | 143 | // boolPtr returns a pointer to a bool value 144 | func boolPtr(b bool) *bool { 145 | return &b 146 | } 147 | -------------------------------------------------------------------------------- /internal/cache/cache.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | "sync" 9 | "time" 10 | ) 11 | 12 | // CacheItem represents a single cached file information 13 | type CacheItem struct { 14 | LocalPath string `json:"local_path"` 15 | RemotePath string `json:"remote_path"` 16 | URL string `json:"url"` 17 | Hash string `json:"hash"` 18 | UploadTime time.Time `json:"upload_time"` 19 | } 20 | 21 | // Cache manages information about uploaded files 22 | type Cache struct { 23 | Items map[string]CacheItem `json:"items"` 24 | Version string `json:"version"` 25 | CacheDir string `json:"cache_dir,omitempty"` 26 | mutex sync.RWMutex 27 | } 28 | 29 | // New creates a new cache instance 30 | func New(cacheDir string) *Cache { 31 | if cacheDir == "" { 32 | homeDir, err := os.UserHomeDir() 33 | if err == nil { 34 | cacheDir = filepath.Join(homeDir, ".cache", "mdctl") 35 | } else { 36 | // Fallback to temp directory 37 | cacheDir = 
filepath.Join(os.TempDir(), "mdctl-cache") 38 | } 39 | } 40 | 41 | return &Cache{ 42 | Items: make(map[string]CacheItem), 43 | Version: "1.0", 44 | CacheDir: cacheDir, 45 | } 46 | } 47 | 48 | // saveWithoutLock writes cache to disk without acquiring the lock 49 | // This should only be called from methods that already hold a lock 50 | func (c *Cache) saveWithoutLock() error { 51 | // Ensure cache directory exists 52 | if err := os.MkdirAll(c.CacheDir, 0755); err != nil { 53 | return fmt.Errorf("failed to create cache directory: %v", err) 54 | } 55 | 56 | cacheFile := filepath.Join(c.CacheDir, "upload-cache.json") 57 | data, err := json.MarshalIndent(c, "", " ") 58 | if err != nil { 59 | return fmt.Errorf("failed to marshal cache: %v", err) 60 | } 61 | 62 | if err := os.WriteFile(cacheFile, data, 0644); err != nil { 63 | return fmt.Errorf("failed to write cache file: %v", err) 64 | } 65 | 66 | return nil 67 | } 68 | 69 | // Load reads cache from disk 70 | func (c *Cache) Load() error { 71 | c.mutex.Lock() 72 | defer c.mutex.Unlock() 73 | 74 | // Ensure cache directory exists 75 | if err := os.MkdirAll(c.CacheDir, 0755); err != nil { 76 | return fmt.Errorf("failed to create cache directory: %v", err) 77 | } 78 | 79 | cacheFile := filepath.Join(c.CacheDir, "upload-cache.json") 80 | if _, err := os.Stat(cacheFile); os.IsNotExist(err) { 81 | // Cache file doesn't exist yet, create a new one 82 | c.Items = make(map[string]CacheItem) 83 | return c.saveWithoutLock() 84 | } 85 | 86 | data, err := os.ReadFile(cacheFile) 87 | if err != nil { 88 | return fmt.Errorf("failed to read cache file: %v", err) 89 | } 90 | 91 | if err := json.Unmarshal(data, c); err != nil { 92 | // If cache is corrupt, start with a fresh one 93 | c.Items = make(map[string]CacheItem) 94 | return nil 95 | } 96 | 97 | return nil 98 | } 99 | 100 | // Save persists the cache to disk 101 | func (c *Cache) Save() error { 102 | c.mutex.Lock() 103 | defer c.mutex.Unlock() 104 | 105 | return c.saveWithoutLock() 
// Use the lockless version to avoid deadlock 106 | } 107 | 108 | // AddItem adds or updates a cache item 109 | func (c *Cache) AddItem(localPath, remotePath, url, hash string) { 110 | c.mutex.Lock() 111 | defer c.mutex.Unlock() 112 | 113 | c.Items[localPath] = CacheItem{ 114 | LocalPath: localPath, 115 | RemotePath: remotePath, 116 | URL: url, 117 | Hash: hash, 118 | UploadTime: time.Now(), 119 | } 120 | } 121 | 122 | // GetItem retrieves a cache item by local path 123 | func (c *Cache) GetItem(localPath string) (CacheItem, bool) { 124 | c.mutex.RLock() 125 | defer c.mutex.RUnlock() 126 | 127 | item, exists := c.Items[localPath] 128 | return item, exists 129 | } 130 | 131 | // HasItemWithHash checks if an item with the same hash exists 132 | func (c *Cache) HasItemWithHash(hash string) (CacheItem, bool) { 133 | c.mutex.RLock() 134 | defer c.mutex.RUnlock() 135 | 136 | for _, item := range c.Items { 137 | if item.Hash == hash { 138 | return item, true 139 | } 140 | } 141 | return CacheItem{}, false 142 | } 143 | 144 | // RemoveItem removes an item from the cache 145 | func (c *Cache) RemoveItem(localPath string) { 146 | c.mutex.Lock() 147 | defer c.mutex.Unlock() 148 | 149 | delete(c.Items, localPath) 150 | } 151 | -------------------------------------------------------------------------------- /internal/llmstxt/generator.go: -------------------------------------------------------------------------------- 1 | package llmstxt 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "sort" 9 | "time" 10 | ) 11 | 12 | // GeneratorConfig contains the configuration required to generate llms.txt 13 | type GeneratorConfig struct { 14 | SitemapURL string 15 | IncludePaths []string 16 | ExcludePaths []string 17 | FullMode bool 18 | Concurrency int 19 | Timeout int 20 | UserAgent string 21 | Verbose bool 22 | VeryVerbose bool // More detailed log output 23 | MaxPages int // Maximum number of pages to process, 0 means no limit 24 | } 25 | 26 | // PageInfo stores page 
information 27 | type PageInfo struct { 28 | Title string 29 | URL string 30 | Description string 31 | Content string // Page content, only filled in full mode 32 | Section string // First segment of URL path as section 33 | } 34 | 35 | // Generator is the llms.txt generator 36 | type Generator struct { 37 | config GeneratorConfig 38 | logger *log.Logger 39 | } 40 | 41 | // NewGenerator creates a new generator instance 42 | func NewGenerator(config GeneratorConfig) *Generator { 43 | var logger *log.Logger 44 | if config.Verbose || config.VeryVerbose { 45 | logger = log.New(os.Stdout, "[LLMSTXT] ", log.LstdFlags) 46 | } else { 47 | logger = log.New(io.Discard, "", 0) 48 | } 49 | 50 | return &Generator{ 51 | config: config, 52 | logger: logger, 53 | } 54 | } 55 | 56 | // Generate performs the generation process and returns the generated content 57 | func (g *Generator) Generate() (string, error) { 58 | startTime := time.Now() 59 | g.logger.Printf("Starting generation for sitemap: %s", g.config.SitemapURL) 60 | if g.config.FullMode { 61 | g.logger.Println("Full-content mode enabled") 62 | } 63 | 64 | // 1. Parse sitemap.xml to get URL list 65 | urls, err := g.parseSitemap() 66 | if err != nil { 67 | return "", fmt.Errorf("failed to parse sitemap: %w", err) 68 | } 69 | g.logger.Printf("Found %d URLs in sitemap", len(urls)) 70 | 71 | // 2. Filter URLs (based on include/exclude mode) 72 | urls = g.filterURLs(urls) 73 | g.logger.Printf("%d URLs after filtering", len(urls)) 74 | 75 | // 2.1. Apply max page limit 76 | if g.config.MaxPages > 0 && len(urls) > g.config.MaxPages { 77 | g.logger.Printf("Limiting to %d pages as requested (--max-pages)", g.config.MaxPages) 78 | urls = urls[:g.config.MaxPages] 79 | } 80 | 81 | // 3. Create worker pool and get page info 82 | pages, err := g.fetchPages(urls) 83 | if err != nil { 84 | return "", fmt.Errorf("failed to fetch pages: %w", err) 85 | } 86 | 87 | // 4. 
Group pages by section 88 | sections := g.groupBySections(pages) 89 | 90 | // 5. Format to Markdown content 91 | content := g.formatContent(sections) 92 | 93 | elapsedTime := time.Since(startTime).Round(time.Millisecond) 94 | g.logger.Printf("Generation completed successfully in %v", elapsedTime) 95 | return content, nil 96 | } 97 | 98 | // Group pages by section 99 | func (g *Generator) groupBySections(pages []PageInfo) map[string][]PageInfo { 100 | sections := make(map[string][]PageInfo) 101 | 102 | for _, page := range pages { 103 | sections[page.Section] = append(sections[page.Section], page) 104 | } 105 | 106 | // Sort pages within each section by URL path length 107 | for section, sectionPages := range sections { 108 | sort.Slice(sectionPages, func(i, j int) bool { 109 | return len(sectionPages[i].URL) < len(sectionPages[j].URL) 110 | }) 111 | sections[section] = sectionPages 112 | } 113 | 114 | return sections 115 | } 116 | 117 | // Get sorted section name list, ensuring ROOT section is always first 118 | func (g *Generator) getSortedSections(sections map[string][]PageInfo) []string { 119 | sectionNames := make([]string, 0, len(sections)) 120 | 121 | // Add ROOT section first (if exists) 122 | if _, hasRoot := sections["ROOT"]; hasRoot { 123 | sectionNames = append(sectionNames, "ROOT") 124 | } 125 | 126 | // Add other sections and sort alphabetically 127 | for section := range sections { 128 | if section != "ROOT" { 129 | sectionNames = append(sectionNames, section) 130 | } 131 | } 132 | 133 | // Only sort if there are non-ROOT sections 134 | if len(sectionNames) > 1 { 135 | // Only sort non-ROOT sections 136 | nonRootSections := sectionNames[1:] 137 | sort.Strings(nonRootSections) 138 | } 139 | 140 | return sectionNames 141 | } 142 | -------------------------------------------------------------------------------- /cmd/translate.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 
| "os" 6 | "path/filepath" 7 | "strings" 8 | 9 | "github.com/samzong/mdctl/internal/config" 10 | "github.com/samzong/mdctl/internal/translator" 11 | "github.com/spf13/cobra" 12 | ) 13 | 14 | var ( 15 | fromPath string 16 | toPath string 17 | locale string 18 | force bool 19 | format bool 20 | ) 21 | 22 | // Generate target file path 23 | func generateTargetPath(sourcePath, lang string) string { 24 | dir := filepath.Dir(sourcePath) 25 | base := filepath.Base(sourcePath) 26 | ext := filepath.Ext(base) 27 | nameWithoutExt := strings.TrimSuffix(base, ext) 28 | return filepath.Join(dir, nameWithoutExt+"_"+lang+ext) 29 | } 30 | 31 | var translateCmd = &cobra.Command{ 32 | Use: "translate", 33 | Short: "Translate markdown files using AI models", 34 | Long: `Translate markdown files or directories to specified language using AI models. 35 | 36 | Supported AI Models: 37 | - OpenAI (Current) 38 | - DeepSeek R1 (Current) 39 | - Llama (Current) 40 | 41 | Supported Languages: 42 | ar (العربية), de (Deutsch), en (English), es (Español), fr (Français), 43 | hi (हिन्दी), it (Italiano), ja (日本語), ko (한국어), pt (Português), 44 | ru (Русский), th (ไทย), vi (Tiếng Việt), zh (中文) 45 | 46 | Examples: 47 | # Translate a single file to Chinese 48 | mdctl translate -f README.md -l zh 49 | 50 | # Translate a directory to Japanese 51 | mdctl translate -f docs -l ja 52 | 53 | # Force translate an already translated file 54 | mdctl translate -f README.md -l ko -F 55 | 56 | # Format markdown content after translation 57 | mdctl translate -f README.md -l zh -m 58 | 59 | # Translate to a specific output path 60 | mdctl translate -f docs -l fr -t translated_docs`, 61 | RunE: func(cmd *cobra.Command, args []string) error { 62 | cfg, err := config.LoadConfig() 63 | if err != nil { 64 | return fmt.Errorf("failed to load config: %v", err) 65 | } 66 | 67 | // Validate language option 68 | if !translator.IsLanguageSupported(locale) { 69 | return fmt.Errorf("unsupported locale: %s\nSupported languages: 
%s", 70 | locale, 71 | translator.GetSupportedLanguages()) 72 | } 73 | 74 | // Check if source path exists 75 | if _, err := os.Stat(fromPath); os.IsNotExist(err) { 76 | return fmt.Errorf("source path does not exist: %s", fromPath) 77 | } 78 | 79 | // Get absolute path of source path 80 | srcAbs, err := filepath.Abs(fromPath) 81 | if err != nil { 82 | return fmt.Errorf("failed to get absolute path: %v", err) 83 | } 84 | 85 | // Check if it's a file or directory 86 | fi, err := os.Stat(srcAbs) 87 | if err != nil { 88 | return fmt.Errorf("failed to get file info: %v", err) 89 | } 90 | 91 | if fi.IsDir() { 92 | // If it's a directory and no target path specified, use the same directory structure 93 | if toPath == "" { 94 | return translator.ProcessDirectory(srcAbs, srcAbs, locale, cfg, force, format) 95 | } 96 | // If target path is specified, use the specified path 97 | dstAbs, err := filepath.Abs(toPath) 98 | if err != nil { 99 | return fmt.Errorf("failed to get absolute path: %v", err) 100 | } 101 | return translator.ProcessDirectory(srcAbs, dstAbs, locale, cfg, force, format) 102 | } 103 | 104 | // Process single file 105 | var dstAbs string 106 | if toPath == "" { 107 | // If no target path specified, generate name_lang.md in the same directory as source 108 | dstAbs = generateTargetPath(srcAbs, locale) 109 | } else { 110 | // If target path specified, use the specified path 111 | dstAbs, err = filepath.Abs(toPath) 112 | if err != nil { 113 | return fmt.Errorf("failed to get absolute path: %v", err) 114 | } 115 | } 116 | 117 | return translator.ProcessFile(srcAbs, dstAbs, locale, cfg, format, force) 118 | }, 119 | } 120 | 121 | func init() { 122 | translateCmd.Flags().StringVarP(&fromPath, "from", "f", "", "Source file or directory path") 123 | translateCmd.Flags().StringVarP(&toPath, "to", "t", "", "Target file or directory path (optional, default: generate in same directory as source)") 124 | translateCmd.Flags().StringVarP(&locale, "locales", "l", "", "Target 
language code (e.g., zh, en, ja, ko, fr, de, es, etc.)") 125 | translateCmd.Flags().BoolVarP(&force, "force", "F", false, "Force translate even if already translated") 126 | translateCmd.Flags().BoolVarP(&format, "format", "m", false, "Format markdown content after translation") 127 | 128 | translateCmd.MarkFlagRequired("from") 129 | translateCmd.MarkFlagRequired("locales") 130 | } 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mdctl - A CLI Tool for Markdown File Operations 2 | 3 |
4 | mdctl logo 5 |
6 |

An AI-powered CLI tool to enhance your Markdown workflow, with auto-image downloading, translation, and more features coming soon!

7 |

8 | Build Status 9 | Release Version 10 | go report 11 | MIT License 12 | Ask DeepWiki 13 |

14 |
15 | 16 | ## Key Features 17 | 18 | - Automatically downloads remote images to a specified local directory. 19 | - Translates markdown files using AI models with support for multiple languages. 20 | - Uploads local images in markdown files to cloud storage services and updates references. 21 | - Exports markdown files to various document formats (DOCX, PDF, EPUB) with customization options. 22 | - Generates llms.txt files from website sitemaps for training language models. 23 | 24 | ## Installation 25 | 26 | Use Homebrew to install mdctl. Follow the [Homebrew Installation Guide](https://brew.sh/) to install Homebrew. 27 | 28 | ```bash 29 | brew tap samzong/tap 30 | brew install samzong/tap/mdctl 31 | ``` 32 | 33 | Or use go to install mdctl. 34 | 35 | ```bash 36 | go install github.com/samzong/mdctl@latest 37 | ``` 38 | 39 | ## Usage 40 | 41 | Quick examples for common tasks: 42 | 43 | ### Downloading Images 44 | 45 | ```bash 46 | # Process a single file 47 | mdctl download -f path/to/your/file.md 48 | 49 | # Process a directory 50 | mdctl download -d path/to/your/directory 51 | ``` 52 | 53 | ### Translating I18n 54 | 55 | ```bash 56 | # Translate to Chinese 57 | mdctl translate -f README.md -l zh 58 | 59 | # Translate a directory to Japanese 60 | mdctl translate -d docs/ -l ja 61 | ``` 62 | 63 | ### Uploading Images to Cloud Storage 64 | 65 | ```bash 66 | # Upload images from a file 67 | mdctl upload -f post.md 68 | 69 | # Upload images from a directory 70 | mdctl upload -d docs/ 71 | ``` 72 | 73 | ### Exporting Documents to `.docx` 74 | 75 | ```bash 76 | # Export to DOCX 77 | mdctl export -f README.md -o output.docx 78 | 79 | # Export to PDF with table of contents 80 | mdctl export -d docs/ -o documentation.pdf -F pdf --toc 81 | ``` 82 | 83 | ### Generating `llms.txt` from `sitemap.xml` 84 | 85 | ```bash 86 | # Standard mode (titles and descriptions) 87 | mdctl llmstxt https://example.com/sitemap.xml > llms.txt 88 | 89 | # Full-content mode 90 | mdctl llmstxt -f 
https://example.com/sitemap.xml > llms-full.txt 91 | ``` 92 | 93 | ### GitHub Action 94 | 95 | Use mdctl in your CI with the Docker-based Action in this repo. Example workflow step: 96 | 97 | ```yaml 98 | jobs: 99 | docs: 100 | runs-on: ubuntu-latest 101 | steps: 102 | - uses: actions/checkout@v4 103 | - name: Export docs to DOCX 104 | uses: samzong/mdctl@v1 105 | with: 106 | args: "export -f README.md -o output.docx" 107 | ``` 108 | 109 | Notes: 110 | - Set `with.args` to any mdctl command and flags (e.g., `download`, `translate`, `upload`, `export`, `llmstxt`). 111 | - Provide necessary credentials via `env` when using cloud features (e.g., S3 for `upload`). 112 | - You can set `working-directory` on the step if needed. 113 | 114 | ## Developer's Guide 115 | 116 | If you are interested in contributing, please refer to the [DEVELOPMENT.md](docs/DEVELOPMENT.md) file for a complete technical architecture, component design, and development guide. 117 | 118 | ## Contributing 119 | 120 | Welcome to contribute code, report issues, or suggest features! Please follow these steps: 121 | 122 | 1. Fork this repository 123 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`) 124 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`) 125 | 4. Push to the branch (`git push origin feature/amazing-feature`) 126 | 5. Open a Pull Request 127 | 128 | ## License 129 | 130 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
131 | -------------------------------------------------------------------------------- /cmd/export.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | 9 | "github.com/samzong/mdctl/internal/exporter" 10 | "github.com/spf13/cobra" 11 | ) 12 | 13 | var ( 14 | exportFile string 15 | exportDir string 16 | siteType string 17 | exportOutput string 18 | exportTemplate string 19 | exportFormat string 20 | generateToc bool 21 | shiftHeadingLevelBy int 22 | fileAsTitle bool 23 | tocDepth int 24 | navPath string 25 | logger *log.Logger 26 | 27 | exportCmd = &cobra.Command{ 28 | Use: "export", 29 | Short: "Export markdown files to other formats", 30 | Long: `Export markdown files to other formats like DOCX, PDF, EPUB. 31 | Uses Pandoc as the underlying conversion tool. 32 | 33 | Examples: 34 | mdctl export -f README.md -o output.docx 35 | mdctl export -d docs/ -o documentation.docx 36 | mdctl export -d docs/ -s mkdocs -o site_docs.docx 37 | mdctl export -d docs/ -o report.docx -t templates/corporate.docx 38 | mdctl export -d docs/ -o documentation.docx --shift-heading-level-by 2 39 | mdctl export -d docs/ -o documentation.docx --toc --toc-depth 4 40 | mdctl export -d docs/ -o documentation.pdf -F pdf`, 41 | RunE: func(cmd *cobra.Command, args []string) error { 42 | // Initialize logger 43 | if verbose { 44 | logger = log.New(os.Stdout, "[EXPORT] ", log.LstdFlags) 45 | } else { 46 | logger = log.New(io.Discard, "", 0) 47 | } 48 | 49 | logger.Println("Starting export process...") 50 | 51 | // Parameter validation 52 | if exportFile == "" && exportDir == "" { 53 | return fmt.Errorf("either source file (-f) or source directory (-d) must be specified") 54 | } 55 | if exportFile != "" && exportDir != "" { 56 | return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)") 57 | } 58 | if exportOutput == "" { 59 | return fmt.Errorf("output file (-o) must be 
specified") 60 | } 61 | 62 | logger.Printf("Validating parameters: file=%s, dir=%s, output=%s, format=%s, site-type=%s", 63 | exportFile, exportDir, exportOutput, exportFormat, siteType) 64 | 65 | // Check if Pandoc is available 66 | logger.Println("Checking Pandoc availability...") 67 | if err := exporter.CheckPandocAvailability(); err != nil { 68 | return err 69 | } 70 | logger.Println("Pandoc is available.") 71 | 72 | // Create export options 73 | options := exporter.ExportOptions{ 74 | Template: exportTemplate, 75 | GenerateToc: generateToc, 76 | ShiftHeadingLevelBy: shiftHeadingLevelBy, 77 | FileAsTitle: fileAsTitle, 78 | Format: exportFormat, 79 | SiteType: siteType, 80 | Verbose: verbose, 81 | Logger: logger, 82 | TocDepth: tocDepth, 83 | NavPath: navPath, 84 | } 85 | 86 | logger.Printf("Export options: template=%s, toc=%v, toc-depth=%d, shift-heading=%d, file-as-title=%v", 87 | exportTemplate, generateToc, tocDepth, shiftHeadingLevelBy, fileAsTitle) 88 | 89 | // Execute export 90 | exp := exporter.NewExporter() 91 | var err error 92 | 93 | if exportFile != "" { 94 | logger.Printf("Exporting single file: %s -> %s", exportFile, exportOutput) 95 | err = exp.ExportFile(exportFile, exportOutput, options) 96 | } else { 97 | logger.Printf("Exporting directory: %s -> %s", exportDir, exportOutput) 98 | err = exp.ExportDirectory(exportDir, exportOutput, options) 99 | } 100 | 101 | if err != nil { 102 | logger.Printf("Export failed: %s", err) 103 | return err 104 | } 105 | 106 | logger.Println("Export completed successfully.") 107 | return nil 108 | }, 109 | } 110 | ) 111 | 112 | func init() { 113 | exportCmd.Flags().StringVarP(&exportFile, "file", "f", "", "Source markdown file to export") 114 | exportCmd.Flags().StringVarP(&exportDir, "dir", "d", "", "Source directory containing markdown files to export") 115 | exportCmd.Flags().StringVarP(&siteType, "site-type", "s", "basic", "Site type (basic, mkdocs, hugo, docusaurus)") 116 | 
exportCmd.Flags().StringVarP(&exportOutput, "output", "o", "", "Output file path") 117 | exportCmd.Flags().StringVarP(&exportTemplate, "template", "t", "", "Word template file path") 118 | exportCmd.Flags().StringVarP(&exportFormat, "format", "F", "docx", "Output format (docx, pdf, epub)") 119 | exportCmd.Flags().BoolVar(&generateToc, "toc", false, "Generate table of contents") 120 | exportCmd.Flags().IntVar(&shiftHeadingLevelBy, "shift-heading-level-by", 0, "Shift heading level by N") 121 | exportCmd.Flags().BoolVar(&fileAsTitle, "file-as-title", false, "Use filename as section title") 122 | exportCmd.Flags().IntVar(&tocDepth, "toc-depth", 3, "Depth of table of contents (default 3)") 123 | exportCmd.Flags().StringVarP(&navPath, "nav-path", "n", "", "Specify the navigation path to export (e.g. 'Section1/Subsection2')") 124 | } 125 | -------------------------------------------------------------------------------- /internal/linter/fixer.go: -------------------------------------------------------------------------------- 1 | package linter 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | ) 7 | 8 | // Fixer provides auto-fix functionality for markdown issues 9 | type Fixer struct { 10 | rules map[string]func([]string) ([]string, int) 11 | } 12 | 13 | // NewFixer creates a new fixer instance 14 | func NewFixer() *Fixer { 15 | f := &Fixer{ 16 | rules: make(map[string]func([]string) ([]string, int)), 17 | } 18 | 19 | // Register fix functions for each rule 20 | f.rules["MD009"] = f.fixTrailingSpaces 21 | f.rules["MD010"] = f.fixHardTabs 22 | f.rules["MD012"] = f.fixMultipleBlankLines 23 | f.rules["MD018"] = f.fixNoSpaceAfterHash 24 | f.rules["MD019"] = f.fixMultipleSpacesAfterHash 25 | f.rules["MD023"] = f.fixHeadingIndentation 26 | f.rules["MD032"] = f.fixListSpacing 27 | f.rules["MD047"] = f.fixFileEndNewline 28 | 29 | return f 30 | } 31 | 32 | // ApplyFixes applies fixes for the given issues 33 | func (f *Fixer) ApplyFixes(content string, issues []*Issue) (string, int) 
{ 34 | lines := strings.Split(content, "\n") 35 | totalFixed := 0 36 | 37 | // Group issues by rule for efficient processing 38 | ruleIssues := make(map[string][]*Issue) 39 | for _, issue := range issues { 40 | ruleIssues[issue.Rule] = append(ruleIssues[issue.Rule], issue) 41 | } 42 | 43 | // Apply fixes for each rule 44 | for rule, ruleSpecificIssues := range ruleIssues { 45 | if fixFunc, exists := f.rules[rule]; exists { 46 | var fixed int 47 | lines, fixed = fixFunc(lines) 48 | totalFixed += fixed 49 | 50 | // Mark issues as fixed 51 | for _, issue := range ruleSpecificIssues { 52 | issue.Fixed = true 53 | } 54 | } 55 | } 56 | 57 | return strings.Join(lines, "\n"), totalFixed 58 | } 59 | 60 | // fixTrailingSpaces removes trailing spaces from lines 61 | func (f *Fixer) fixTrailingSpaces(lines []string) ([]string, int) { 62 | fixed := 0 63 | for i, line := range lines { 64 | trimmed := strings.TrimRight(line, " \t") 65 | if trimmed != line { 66 | lines[i] = trimmed 67 | fixed++ 68 | } 69 | } 70 | return lines, fixed 71 | } 72 | 73 | // fixHardTabs replaces hard tabs with spaces 74 | func (f *Fixer) fixHardTabs(lines []string) ([]string, int) { 75 | fixed := 0 76 | for i, line := range lines { 77 | if strings.Contains(line, "\t") { 78 | lines[i] = strings.ReplaceAll(line, "\t", " ") 79 | fixed++ 80 | } 81 | } 82 | return lines, fixed 83 | } 84 | 85 | // fixMultipleBlankLines removes consecutive blank lines 86 | func (f *Fixer) fixMultipleBlankLines(lines []string) ([]string, int) { 87 | var result []string 88 | fixed := 0 89 | prevBlank := false 90 | 91 | for _, line := range lines { 92 | isBlank := strings.TrimSpace(line) == "" 93 | 94 | if isBlank && prevBlank { 95 | fixed++ // Count removed blank lines 96 | continue 97 | } 98 | 99 | result = append(result, line) 100 | prevBlank = isBlank 101 | } 102 | 103 | return result, fixed 104 | } 105 | 106 | // fixNoSpaceAfterHash adds space after hash in headings 107 | func (f *Fixer) fixNoSpaceAfterHash(lines []string) 
([]string, int) { 108 | fixed := 0 109 | re := regexp.MustCompile(`^(#+)([^# ])`) 110 | 111 | for i, line := range lines { 112 | trimmed := strings.TrimSpace(line) 113 | if re.MatchString(trimmed) { 114 | lines[i] = re.ReplaceAllString(trimmed, "$1 $2") 115 | fixed++ 116 | } 117 | } 118 | 119 | return lines, fixed 120 | } 121 | 122 | // fixMultipleSpacesAfterHash removes extra spaces after hash in headings 123 | func (f *Fixer) fixMultipleSpacesAfterHash(lines []string) ([]string, int) { 124 | fixed := 0 125 | re := regexp.MustCompile(`^(#+)\s{2,}`) 126 | 127 | for i, line := range lines { 128 | trimmed := strings.TrimSpace(line) 129 | if re.MatchString(trimmed) { 130 | lines[i] = re.ReplaceAllString(trimmed, "$1 ") 131 | fixed++ 132 | } 133 | } 134 | 135 | return lines, fixed 136 | } 137 | 138 | // fixHeadingIndentation removes leading spaces from headings 139 | func (f *Fixer) fixHeadingIndentation(lines []string) ([]string, int) { 140 | fixed := 0 141 | re := regexp.MustCompile(`^ +(#.*)`) 142 | 143 | for i, line := range lines { 144 | if re.MatchString(line) { 145 | lines[i] = re.ReplaceAllString(line, "$1") 146 | fixed++ 147 | } 148 | } 149 | 150 | return lines, fixed 151 | } 152 | 153 | // fixListSpacing adds blank lines around lists 154 | func (f *Fixer) fixListSpacing(lines []string) ([]string, int) { 155 | fixed := 0 156 | var result []string 157 | listRe := regexp.MustCompile(`^(\s*[*+-] )`) 158 | 159 | for i, line := range lines { 160 | if listRe.MatchString(line) { 161 | // Check if previous line needs a blank line 162 | if i > 0 && strings.TrimSpace(lines[i-1]) != "" && len(result) > 0 { 163 | result = append(result, "") 164 | fixed++ 165 | } 166 | } 167 | result = append(result, line) 168 | } 169 | 170 | return result, fixed 171 | } 172 | 173 | // fixFileEndNewline ensures file ends with single newline 174 | func (f *Fixer) fixFileEndNewline(lines []string) ([]string, int) { 175 | if len(lines) == 0 { 176 | return lines, 0 177 | } 178 | 179 | // 
Remove trailing empty lines 180 | for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" { 181 | lines = lines[:len(lines)-1] 182 | } 183 | 184 | // Add single empty line at the end 185 | lines = append(lines, "") 186 | 187 | return lines, 1 188 | } 189 | -------------------------------------------------------------------------------- /internal/markdownfmt/formatter.go: -------------------------------------------------------------------------------- 1 | package markdownfmt 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "strings" 7 | ) 8 | 9 | // Formatter for formatting markdown content 10 | type Formatter struct { 11 | // Whether formatting is enabled 12 | enabled bool 13 | } 14 | 15 | // New creates a new formatter 16 | func New(enabled bool) *Formatter { 17 | return &Formatter{ 18 | enabled: enabled, 19 | } 20 | } 21 | 22 | // Format formats markdown content 23 | func (f *Formatter) Format(content string) string { 24 | if !f.enabled { 25 | return content 26 | } 27 | 28 | // 1. Split content into lines 29 | lines := strings.Split(content, "\n") 30 | 31 | // 2. 
Process each line 32 | var formatted []string 33 | for i := 0; i < len(lines); i++ { 34 | line := lines[i] 35 | 36 | // Process headings: ensure there are blank lines before and after 37 | if isHeading(line) { 38 | // If not the first line and previous line is not blank, add a blank line 39 | if i > 0 && len(strings.TrimSpace(lines[i-1])) > 0 { 40 | formatted = append(formatted, "") 41 | } 42 | // Normalize heading format (one space after #) 43 | line = formatHeading(line) 44 | formatted = append(formatted, line) 45 | // If not the last line, add a blank line 46 | if i < len(lines)-1 { 47 | formatted = append(formatted, "") 48 | } 49 | continue 50 | } 51 | 52 | // Process spaces in links 53 | line = formatMarkdownLinks(line) 54 | 55 | // Process content in parentheses 56 | line = formatParentheses(line) 57 | 58 | // Process spaces between Chinese and English text 59 | line = formatChineseEnglishSpace(line) 60 | 61 | formatted = append(formatted, line) 62 | } 63 | 64 | // 3. Handle consecutive blank lines 65 | formatted = removeConsecutiveBlankLines(formatted) 66 | 67 | // 4. 
Join lines 68 | result := strings.Join(formatted, "\n") 69 | 70 | return result 71 | } 72 | 73 | // isHeading checks if the line is a heading 74 | func isHeading(line string) bool { 75 | return strings.HasPrefix(strings.TrimSpace(line), "#") 76 | } 77 | 78 | // formatHeading formats the heading line 79 | func formatHeading(line string) string { 80 | // Remove leading spaces 81 | line = strings.TrimSpace(line) 82 | // Ensure only one space between # and text 83 | re := regexp.MustCompile(`^(#+)\s*`) 84 | return re.ReplaceAllString(line, "$1 ") 85 | } 86 | 87 | // formatParentheses processes the format within parentheses 88 | func formatParentheses(line string) string { 89 | // First handle http/https links by temporarily replacing them 90 | linkPattern := regexp.MustCompile(`\([^)]*https?://[^)]+\)`) 91 | links := linkPattern.FindAllString(line, -1) 92 | for i, link := range links { 93 | line = strings.Replace(line, link, fmt.Sprintf("__LINK_PLACEHOLDER_%d__", i), 1) 94 | } 95 | 96 | // Process regular parentheses content 97 | re := regexp.MustCompile(`\(([^)]+)\)`) 98 | line = re.ReplaceAllStringFunc(line, func(match string) string { 99 | // Extract content within parentheses 100 | content := match[1 : len(match)-1] 101 | // Clean leading and trailing spaces 102 | content = strings.TrimSpace(content) 103 | // Replace consecutive spaces with a single space 104 | content = regexp.MustCompile(`\s+`).ReplaceAllString(content, " ") 105 | return fmt.Sprintf("(%s)", content) 106 | }) 107 | 108 | // Restore links 109 | for i, link := range links { 110 | line = strings.Replace(line, fmt.Sprintf("__LINK_PLACEHOLDER_%d__", i), link, 1) 111 | } 112 | 113 | return line 114 | } 115 | 116 | // formatMarkdownLinks processes spaces in markdown links 117 | func formatMarkdownLinks(line string) string { 118 | // Match markdown link format [text](url), including possible spaces 119 | linkPattern := regexp.MustCompile(`\[(.*?)\]\(\s*(.*?)\s*\)`) 120 | 121 | // Process spaces in link 
text and URL 122 | line = linkPattern.ReplaceAllStringFunc(line, func(match string) string { 123 | // Extract link text and URL 124 | parts := linkPattern.FindStringSubmatch(match) 125 | if len(parts) != 3 { 126 | return match 127 | } 128 | 129 | text := parts[1] 130 | url := parts[2] 131 | 132 | // Clean spaces in URL 133 | url = strings.TrimSpace(url) 134 | // Remove all spaces and invisible characters in URL 135 | url = regexp.MustCompile(`[\s\p{Zs}\p{C}]+`).ReplaceAllString(url, "") 136 | 137 | // Keep spaces in link text, but clean leading/trailing spaces and consecutive spaces 138 | text = strings.TrimSpace(text) 139 | text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ") 140 | 141 | // Reassemble link 142 | return fmt.Sprintf("[%s](%s)", text, url) 143 | }) 144 | 145 | // Process spaces in heading links 146 | headingLinkPattern := regexp.MustCompile(`\]\(#(.*?)\)`) 147 | line = headingLinkPattern.ReplaceAllStringFunc(line, func(match string) string { 148 | parts := headingLinkPattern.FindStringSubmatch(match) 149 | if len(parts) != 2 { 150 | return match 151 | } 152 | 153 | anchor := parts[1] 154 | // Remove all spaces 155 | anchor = regexp.MustCompile(`\s+`).ReplaceAllString(anchor, "") 156 | return fmt.Sprintf("](#%s)", anchor) 157 | }) 158 | 159 | return line 160 | } 161 | 162 | // formatChineseEnglishSpace adds spaces between Chinese and English text 163 | func formatChineseEnglishSpace(line string) string { 164 | // Match boundaries between Chinese and English/numbers 165 | re := regexp.MustCompile(`([\p{Han}])([A-Za-z0-9])`) 166 | line = re.ReplaceAllString(line, "$1 $2") 167 | 168 | re = regexp.MustCompile(`([A-Za-z0-9])([\p{Han}])`) 169 | line = re.ReplaceAllString(line, "$1 $2") 170 | 171 | return line 172 | } 173 | 174 | // removeConsecutiveBlankLines removes consecutive blank lines 175 | func removeConsecutiveBlankLines(lines []string) []string { 176 | var result []string 177 | isPrevLineBlank := false 178 | 179 | for _, line := range lines 
{ 180 | isCurrentLineBlank := len(strings.TrimSpace(line)) == 0 181 | 182 | if !isCurrentLineBlank || !isPrevLineBlank { 183 | result = append(result, line) 184 | } 185 | 186 | isPrevLineBlank = isCurrentLineBlank 187 | } 188 | 189 | return result 190 | } 191 | -------------------------------------------------------------------------------- /docs/features/export.md: -------------------------------------------------------------------------------- 1 | # Export 功能设计文档 2 | 3 | ## 功能概述 4 | 5 | 为 mdctl 工具增加 `export` 子命令,用于将 Markdown 文件导出为其他格式。第一版将优先支持导出为 Word 文档格式(docx),后续可扩展支持更多格式(如 PDF、EPUB 等)。 6 | 7 | 该功能将利用 Pandoc 作为底层导出工具,支持 Pandoc 的模板系统,允许用户配置自定义的导出模板。 8 | 9 | ## 用户需求 10 | 11 | 1. 支持将单个 Markdown 文件导出为 Word 格式 12 | 2. 支持将多个 Markdown 文件合并后导出为单个 Word 文档 13 | 3. 支持按照文件夹中的文件名顺序合并文件 14 | 4. 支持多种文档系统(MkDocs 第一期、Hugo、Docusaurus coming soon)的文件读取方式 15 | 5. 在合并过程中智能调整标题层级,保持文档结构的清晰性 16 | 6. 支持自定义 Word 模板,使最终文档具有一致的样式 17 | 18 | ## 命令设计 19 | 20 | ``` 21 | mdctl export [flags] 22 | ``` 23 | 24 | ### 参数设计 25 | 26 | - `-f, --file`: 指定单个 Markdown 文件进行导出 27 | - `-d, --dir`: 指定包含多个 Markdown 文件的目录 28 | - `-s, --site-type`: 指定文档站点类型,可选值:basic, mkdocs, hugo, docusaurus(默认:basic) 29 | - `-o, --output`: 指定输出文件路径 30 | - `-t, --template`: 指定 Word 模板文件路径 31 | - `-F, --format`: 指定输出格式,可选值:docx, pdf, epub(默认:docx) 32 | - `--toc`: 是否生成目录(默认:false) 33 | - `--shift-heading-level-by`: 标题层级偏移量(默认:0) 34 | - `--file-as-title`: 是否使用文件名作为章节标题(默认:false) 35 | 36 | ### 使用示例 37 | 38 | ```bash 39 | # 导出单个文件 40 | mdctl export -f README.md -o output.docx 41 | 42 | # 导出整个目录 43 | mdctl export -d docs/ -o documentation.docx 44 | 45 | # 导出 MkDocs 站点 46 | mdctl export -d docs/ -s mkdocs -o site_docs.docx 47 | 48 | # 导出 Hugo 站点 49 | mdctl export -d content/ -s hugo -o hugo_docs.docx 50 | 51 | # 使用自定义模板 52 | mdctl export -d docs/ -o report.docx -t templates/corporate.docx 53 | 54 | # 指定标题层级偏移量 55 | mdctl export -d docs/ -o documentation.docx --shift-heading-level-by 2 56 | 57 | # 导出为 PDF 格式 58 | mdctl export -d docs/ -o
documentation.pdf -F pdf 59 | ``` 60 | 61 | ## 实现设计 62 | 63 | ### 整体架构 64 | 65 | 按照项目的现有结构,我们将在 `cmd/` 目录下创建 `export.go` 文件定义命令接口,在 `internal/` 目录下创建 `exporter/` 模块实现具体功能。 66 | 67 | ``` 68 | mdctl/ 69 | ├── cmd/ 70 | │ └── export.go # 新增:export 命令定义 71 | ├── internal/ 72 | │ └── exporter/ # 新增:导出功能实现 73 | │ ├── exporter.go # 导出器接口定义 74 | │ ├── pandoc.go # Pandoc 导出实现 75 | │ ├── merger.go # Markdown 合并实现 76 | │ ├── sitereader/ # 新增:不同文档系统的站点结构读取 77 | │ │ ├── reader.go # 站点读取器接口 78 | │ │ ├── mkdocs.go # MkDocs 站点读取 79 | │ │ ├── hugo.go # Hugo 站点读取 80 | │ │ └── docusaurus.go # Docusaurus 站点读取 81 | │ └── heading.go # 标题处理实现 82 | ``` 83 | 84 | ### 核心组件 85 | 86 | #### 1. 命令处理器 (cmd/export.go) 87 | 88 | 负责解析命令行参数并调用导出功能。 89 | 90 | ```go 91 | var ( 92 | exportFile string 93 | exportDir string 94 | siteType string 95 | configFile string 96 | exportOutput string 97 | exportTemplate string 98 | exportFormat string 99 | pandocPath string 100 | generateToc bool 101 | shiftHeadingLevelBy int 102 | fileAsTitle bool 103 | 104 | exportCmd = &cobra.Command{ 105 | Use: "export", 106 | Short: "Export markdown files to other formats", 107 | Long: `...`, 108 | RunE: func(cmd *cobra.Command, args []string) error { 109 | // 参数验证和处理逻辑 110 | // 调用 internal/exporter 的功能 111 | }, 112 | } 113 | ) 114 | ``` 115 | 116 | #### 2. 导出器接口 (internal/exporter/exporter.go) 117 | 118 | 定义导出功能的通用接口,支持扩展其他格式。 119 | 120 | ```go 121 | type Exporter interface { 122 | Export(input string, output string, options ExportOptions) error 123 | } 124 | 125 | type ExportOptions struct { 126 | Template string 127 | GenerateToc bool 128 | ShiftHeadingLevelBy int 129 | FileAsTitle bool 130 | Format string 131 | // 其他选项 132 | } 133 | ``` 134 | 135 | #### 3. 
Pandoc 导出实现 (internal/exporter/pandoc.go) 136 | 137 | 使用 Pandoc 工具实现导出功能。 138 | 139 | ```go 140 | type PandocExporter struct { 141 | PandocPath string 142 | } 143 | 144 | func (e *PandocExporter) Export(input, output string, options ExportOptions) error { 145 | // 构建并执行 Pandoc 命令 146 | // 如果 pandoc 不可用,返回明确的错误提示 147 | } 148 | ``` 149 | 150 | #### 4. 站点结构读取器 (internal/exporter/sitereader/) 151 | 152 | 负责识别和解析不同文档系统的站点结构。 153 | 154 | ```go 155 | // 站点读取器接口 156 | type SiteReader interface { 157 | // 检测给定目录是否为此类型的站点 158 | Detect(dir string) bool 159 | 160 | // 读取站点结构,返回按顺序排列的文件列表 161 | ReadStructure(dir string, configPath string) ([]string, error) 162 | } 163 | 164 | // 工厂函数,根据站点类型返回相应的读取器 165 | func GetSiteReader(siteType string) (SiteReader, error) { 166 | // 返回对应类型的读取器实现 167 | } 168 | ``` 169 | 170 | #### 5. Markdown 合并器 (internal/exporter/merger.go) 171 | 172 | 负责合并多个 Markdown 文件。 173 | 174 | ```go 175 | type Merger struct { 176 | ShiftHeadingLevelBy int 177 | FileAsTitle bool 178 | } 179 | 180 | func (m *Merger) Merge(sources []string, target string) error { 181 | // 合并多个 Markdown 文件的逻辑 182 | // 自动处理标题层级 183 | } 184 | ``` 185 | 186 | #### 6. 标题处理器 (internal/exporter/heading.go) 187 | 188 | 处理 Markdown 文件中的标题层级。 189 | 190 | ```go 191 | func ShiftHeadings(content string, levels int) string { 192 | // 调整标题层级的逻辑 193 | } 194 | ``` 195 | 196 | ### 工作流程 197 | 198 | 1. **命令解析**:解析用户提供的命令行参数 199 | 2. **文件收集**:根据参数收集需要处理的 Markdown 文件 200 | - 单文件模式:直接使用指定文件 201 | - 目录模式:收集目录中的所有 Markdown 文件并按文件名排序 202 | - 站点模式:使用相应的站点读取器解析站点结构 203 | 3. **文件合并**:如果有多个文件,将它们合并为一个临时 Markdown 文件 204 | - 自动调整每个文件的标题层级 205 | - 可选添加文件名作为章节标题 206 | 4. **格式转换**:使用 Pandoc 将 Markdown 转换为目标格式 207 | - 应用用户指定的模板(如果有) 208 | - 生成目录(如果启用) 209 | 5. **输出处理**:将最终结果输出到用户指定的路径 210 | 211 | ## 标题层级处理策略 212 | 213 | 为了解决多文件合并时标题层级的问题,系统将自动处理标题层级: 214 | 215 | 1. 每个文件的标题层级将按照指定的偏移量调整: 216 | - H1 -> H(1+偏移量) 217 | - H2 -> H(2+偏移量) 218 | - ... 219 | - 如果调整后超过 H6,将转换为加粗文本 (**文本**) 220 | 221 | 2. 
如果启用了文件名作为标题功能,会自动在每个文件内容前添加对应层级的标题 222 | 223 | 3. 系统会自动处理标题的相对层级关系,确保文档结构的逻辑性 224 | 225 | ## 依赖条件 226 | 227 | **Pandoc**:需要系统中安装 Pandoc 工具 228 | - 在执行导出命令时检查 Pandoc 是否可用 229 | - 如果找不到 Pandoc,提供明确的错误信息和安装指导 230 | 231 | ## 错误处理 232 | 233 | 1. Pandoc 不可用时提供明确的错误信息和安装指导 234 | 2. 文件不存在或无法访问时的错误处理 235 | 3. 合并过程中可能出现的格式问题处理 236 | 4. 模板文件异常的处理 237 | 5. 不支持的站点类型或配置文件处理 238 | 239 | ## 未来扩展 240 | 241 | 1. 增强模板管理功能,支持模板下载和更新 242 | 2. 支持更多的文档站点系统 243 | 3. 支持更复杂的文档结构处理,如自动生成封面、页眉页脚 244 | 4. 集成图表和公式渲染功能 -------------------------------------------------------------------------------- /internal/processor/processor.go: -------------------------------------------------------------------------------- 1 | package processor 2 | 3 | import ( 4 | "crypto/md5" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "os" 9 | "path/filepath" 10 | "regexp" 11 | "strings" 12 | ) 13 | 14 | type Processor struct { 15 | SourceFile string 16 | SourceDir string 17 | ImageOutputDir string 18 | } 19 | 20 | func New(sourceFile, sourceDir, imageOutputDir string) *Processor { 21 | return &Processor{ 22 | SourceFile: sourceFile, 23 | SourceDir: sourceDir, 24 | ImageOutputDir: imageOutputDir, 25 | } 26 | } 27 | 28 | func (p *Processor) Process() error { 29 | if p.SourceFile != "" { 30 | return p.processFile(p.SourceFile) 31 | } 32 | return p.processDirectory(p.SourceDir) 33 | } 34 | 35 | func (p *Processor) processDirectory(dir string) error { 36 | fmt.Printf("Processing directory: %s\n", dir) 37 | return filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 38 | if err != nil { 39 | return err 40 | } 41 | if !info.IsDir() && (strings.HasSuffix(path, ".md") || strings.HasSuffix(path, ".markdown")) { 42 | return p.processFile(path) 43 | } 44 | return nil 45 | }) 46 | } 47 | 48 | func (p *Processor) processFile(filePath string) error { 49 | fmt.Printf("Processing file: %s\n", filePath) 50 | content, err := os.ReadFile(filePath) 51 | if err != nil { 52 | return fmt.Errorf("failed to read file %s: %v", 
filePath, err) 53 | } 54 | 55 | // Determine image output directory 56 | imgDir := p.determineImageDir(filePath) 57 | if err := os.MkdirAll(imgDir, 0755); err != nil { 58 | return fmt.Errorf("failed to create image directory %s: %v", imgDir, err) 59 | } 60 | 61 | // Find all image links 62 | imgRegex := regexp.MustCompile(`!\[([^\]]*)\]\(([^)]+)\)`) 63 | matches := imgRegex.FindAllStringSubmatch(string(content), -1) 64 | 65 | fmt.Printf("Found %d images in file %s\n", len(matches), filePath) 66 | 67 | newContent := string(content) 68 | for _, match := range matches { 69 | imgAlt := match[1] 70 | imgURL := match[2] 71 | 72 | // Replace image URL starting with "//" to "https://" 73 | if strings.HasPrefix(imgURL, "//") { 74 | imgURL = strings.Replace(imgURL, "//", "https://", 1) 75 | } 76 | // Skip local images 77 | if !strings.HasPrefix(imgURL, "http://") && !strings.HasPrefix(imgURL, "https://") { 78 | continue 79 | } 80 | 81 | // Download and save image 82 | localPath, err := p.downloadImage(imgURL, imgDir) 83 | if err != nil { 84 | fmt.Printf("Warning: Failed to download image %s: %v\n", imgURL, err) 85 | continue 86 | } 87 | 88 | // Calculate relative path 89 | relPath, err := filepath.Rel(filepath.Dir(filePath), localPath) 90 | if err != nil { 91 | fmt.Printf("Warning: Failed to calculate relative path: %v\n", err) 92 | continue 93 | } 94 | 95 | // Replace image link 96 | oldLink := fmt.Sprintf("![%s](%s)", match[1], match[2]) 97 | newLink := fmt.Sprintf("![%s](%s)", imgAlt, relPath) 98 | newContent = strings.Replace(newContent, oldLink, newLink, 1) 99 | } 100 | 101 | // Write back to file 102 | if err := os.WriteFile(filePath, []byte(newContent), 0644); err != nil { 103 | return fmt.Errorf("failed to write file %s: %v", filePath, err) 104 | } 105 | 106 | return nil 107 | } 108 | 109 | func (p *Processor) determineImageDir(filePath string) string { 110 | if p.ImageOutputDir != "" { 111 | return p.ImageOutputDir 112 | } 113 | if p.SourceDir != "" { 114 | return 
filepath.Join(p.SourceDir, "images") 115 | } 116 | return filepath.Join(filepath.Dir(filePath), "images") 117 | } 118 | 119 | func (p *Processor) downloadImage(url string, destDir string) (string, error) { 120 | resp, err := http.Get(url) 121 | if err != nil { 122 | return "", err 123 | } 124 | defer resp.Body.Close() 125 | 126 | // Get filename from URL or Content-Disposition 127 | filename := getFilenameFromURL(url, resp) 128 | 129 | // If no extension, try to get from Content-Type 130 | if filepath.Ext(filename) == "" { 131 | contentType := resp.Header.Get("Content-Type") 132 | ext := getExtensionFromContentType(contentType) 133 | if ext != "" { 134 | filename += ext 135 | } 136 | } 137 | 138 | // Ensure filename is unique 139 | hash := md5.New() 140 | io.WriteString(hash, url) 141 | urlHash := fmt.Sprintf("%x", hash.Sum(nil))[:8] 142 | 143 | ext := filepath.Ext(filename) 144 | basename := strings.TrimSuffix(filename, ext) 145 | filename = fmt.Sprintf("%s_%s%s", basename, urlHash, ext) 146 | 147 | localPath := filepath.Join(destDir, filename) 148 | 149 | // Create target file 150 | out, err := os.Create(localPath) 151 | if err != nil { 152 | return "", err 153 | } 154 | defer out.Close() 155 | 156 | // Write to file 157 | _, err = io.Copy(out, resp.Body) 158 | if err != nil { 159 | return "", err 160 | } 161 | 162 | fmt.Printf("Downloaded image to: %s\n", localPath) 163 | return localPath, nil 164 | } 165 | 166 | func getFilenameFromURL(url string, resp *http.Response) string { 167 | // First try to get from Content-Disposition 168 | if cd := resp.Header.Get("Content-Disposition"); cd != "" { 169 | if strings.Contains(cd, "filename=") { 170 | parts := strings.Split(cd, "filename=") 171 | if len(parts) > 1 { 172 | filename := strings.Trim(parts[1], `"'`) 173 | if filename != "" { 174 | return filename 175 | } 176 | } 177 | } 178 | } 179 | 180 | // Get from URL path 181 | parts := strings.Split(url, "/") 182 | if len(parts) > 0 { 183 | filename := 
parts[len(parts)-1] 184 | // Remove URL parameters 185 | if idx := strings.Index(filename, "?"); idx != -1 { 186 | filename = filename[:idx] 187 | } 188 | // Remove trailing "@" character 189 | if idx := strings.LastIndex(filename, "@"); idx != -1 { 190 | if idx > strings.LastIndex(filename, ".") { 191 | filename = filename[:idx] 192 | } 193 | } 194 | if filename != "" { 195 | return filename 196 | } 197 | } 198 | 199 | // Use default name 200 | return "image" 201 | } 202 | 203 | func getExtensionFromContentType(contentType string) string { 204 | switch contentType { 205 | case "image/jpeg", "image/jpg": 206 | return ".jpg" 207 | case "image/png": 208 | return ".png" 209 | case "image/gif": 210 | return ".gif" 211 | case "image/webp": 212 | return ".webp" 213 | default: 214 | return "" 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /internal/linter/linter.go: -------------------------------------------------------------------------------- 1 | package linter 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | "strings" 8 | 9 | "github.com/samzong/mdctl/internal/markdownfmt" 10 | ) 11 | 12 | // Config holds the linter configuration 13 | type Config struct { 14 | AutoFix bool 15 | OutputFormat string 16 | RulesFile string 17 | EnableRules []string 18 | DisableRules []string 19 | Verbose bool 20 | } 21 | 22 | // Issue represents a linting issue 23 | type Issue struct { 24 | Line int `json:"line"` 25 | Column int `json:"column,omitempty"` 26 | Rule string `json:"rule"` 27 | Message string `json:"message"` 28 | Context string `json:"context,omitempty"` 29 | Fixed bool `json:"fixed,omitempty"` 30 | } 31 | 32 | // Result holds the linting results for a file 33 | type Result struct { 34 | Filename string `json:"filename"` 35 | Issues []*Issue `json:"issues"` 36 | FixedCount int `json:"fixed_count"` 37 | } 38 | 39 | // Linter performs markdown linting 40 | type Linter struct { 41 | config *Config 42 | rules *RuleSet 43 | 
formatter *markdownfmt.Formatter 44 | fixer *Fixer 45 | } 46 | 47 | // New creates a new linter instance 48 | func New(config *Config) *Linter { 49 | rules := NewRuleSet() 50 | 51 | // Load configuration file if specified 52 | if config.RulesFile != "" { 53 | if configFile, err := LoadConfigFile(config.RulesFile); err == nil { 54 | configFile.ApplyToRuleSet(rules) 55 | } else if config.Verbose { 56 | fmt.Printf("Warning: Could not load rules file %s: %v\n", config.RulesFile, err) 57 | } 58 | } else { 59 | // Try to find and load default config file 60 | if configFile, err := LoadConfigFile(""); err == nil { 61 | configFile.ApplyToRuleSet(rules) 62 | } 63 | } 64 | 65 | // Apply rule configuration from command line 66 | if len(config.EnableRules) > 0 { 67 | rules.EnableOnly(config.EnableRules) 68 | } 69 | 70 | if len(config.DisableRules) > 0 { 71 | rules.Disable(config.DisableRules) 72 | } 73 | 74 | return &Linter{ 75 | config: config, 76 | rules: rules, 77 | formatter: markdownfmt.New(true), // Enable formatter for auto-fix 78 | fixer: NewFixer(), 79 | } 80 | } 81 | 82 | // LintFile lints a single markdown file 83 | func (l *Linter) LintFile(filename string) (*Result, error) { 84 | // Check file size limit (10MB) 85 | const maxFileSize = 10 * 1024 * 1024 86 | if info, err := os.Stat(filename); err == nil { 87 | if info.Size() > maxFileSize { 88 | return nil, fmt.Errorf("file too large: %s (max %d bytes)", filename, maxFileSize) 89 | } 90 | } 91 | 92 | content, err := os.ReadFile(filename) 93 | if err != nil { 94 | return nil, fmt.Errorf("failed to read file: %v", err) 95 | } 96 | 97 | return l.LintContent(filename, string(content)) 98 | } 99 | 100 | // LintContent lints markdown content 101 | func (l *Linter) LintContent(filename, content string) (*Result, error) { 102 | result := &Result{ 103 | Filename: filename, 104 | Issues: []*Issue{}, 105 | } 106 | 107 | lines := strings.Split(content, "\n") 108 | 109 | // Apply all enabled rules 110 | for _, rule := range 
l.rules.GetEnabledRules() { 111 | issues := rule.Check(lines) 112 | result.Issues = append(result.Issues, issues...) 113 | } 114 | 115 | // Apply auto-fix if requested 116 | if l.config.AutoFix && len(result.Issues) > 0 { 117 | fixedContent, fixedCount := l.applyFixes(content, result.Issues) 118 | result.FixedCount = fixedCount 119 | 120 | // Write fixed content back to file with backup 121 | if fixedCount > 0 { 122 | // Create backup before modifying the file 123 | if err := l.createBackup(filename); err != nil { 124 | return nil, fmt.Errorf("failed to create backup: %v", err) 125 | } 126 | 127 | if err := os.WriteFile(filename, []byte(fixedContent), 0644); err != nil { 128 | return nil, fmt.Errorf("failed to write fixed content: %v", err) 129 | } 130 | 131 | // Mark issues as fixed 132 | for _, issue := range result.Issues { 133 | if issue.Rule != "MD013" { // Don't mark line length issues as fixed automatically 134 | issue.Fixed = true 135 | } 136 | } 137 | } 138 | } 139 | 140 | return result, nil 141 | } 142 | 143 | // applyFixes applies automatic fixes to the content 144 | func (l *Linter) applyFixes(content string, issues []*Issue) (string, int) { 145 | // Use the dedicated fixer for rule-specific fixes 146 | fixedContent, fixedCount := l.fixer.ApplyFixes(content, issues) 147 | 148 | // Then apply general formatting fixes 149 | finalContent := l.formatter.Format(fixedContent) 150 | 151 | // If formatter made additional changes, count them 152 | if finalContent != fixedContent && fixedCount == 0 { 153 | fixedCount = l.countFixableIssues(issues) 154 | } 155 | 156 | return finalContent, fixedCount 157 | } 158 | 159 | // createBackup creates a backup of the file before modification 160 | func (l *Linter) createBackup(filename string) error { 161 | backupFilename := filename + ".orig" 162 | 163 | // Open source file 164 | src, err := os.Open(filename) 165 | if err != nil { 166 | return fmt.Errorf("failed to open source file: %v", err) 167 | } 168 | defer 
src.Close() 169 | 170 | // Create backup file 171 | dst, err := os.Create(backupFilename) 172 | if err != nil { 173 | return fmt.Errorf("failed to create backup file: %v", err) 174 | } 175 | defer dst.Close() 176 | 177 | // Copy content 178 | _, err = io.Copy(dst, src) 179 | if err != nil { 180 | return fmt.Errorf("failed to copy content to backup: %v", err) 181 | } 182 | 183 | return nil 184 | } 185 | 186 | // countFixableIssues counts how many issues can be automatically fixed 187 | func (l *Linter) countFixableIssues(issues []*Issue) int { 188 | fixableRules := map[string]bool{ 189 | "MD009": true, // Trailing spaces 190 | "MD010": true, // Hard tabs 191 | "MD012": true, // Multiple consecutive blank lines 192 | "MD018": true, // No space after hash on atx style heading 193 | "MD019": true, // Multiple spaces after hash on atx style heading 194 | "MD023": true, // Headings must start at the beginning of the line 195 | "MD047": true, // Files should end with a single newline character 196 | } 197 | 198 | count := 0 199 | for _, issue := range issues { 200 | if fixableRules[issue.Rule] { 201 | count++ 202 | } 203 | } 204 | return count 205 | } 206 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI= 2 | github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= 3 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 4 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 5 | github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk= 6 | github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= 7 | github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 8 | 
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 9 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 10 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= 11 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= 12 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 13 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 14 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 15 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 16 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= 17 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= 18 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 19 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 20 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 21 | github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= 22 | github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= 23 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 24 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 25 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 26 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 27 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 28 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod 
h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 29 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 30 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 31 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 32 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 33 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 34 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 35 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 36 | golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= 37 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 38 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 39 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 40 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 41 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 42 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 43 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 44 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 45 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 46 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 47 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 48 | golang.org/x/term 
v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 49 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 50 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 51 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 52 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 53 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 54 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 55 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 56 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 57 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= 58 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= 59 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 60 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 61 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 62 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 63 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 64 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 65 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 66 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 67 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 68 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 69 | gopkg.in/yaml.v3 v3.0.1/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 70 | -------------------------------------------------------------------------------- /internal/linter/linter_test.go: -------------------------------------------------------------------------------- 1 | package linter 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | func TestLinter_LintContent(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | content string 12 | expectRules []string // Expected rule IDs that should trigger 13 | expectCount int // Expected number of issues 14 | }{ 15 | { 16 | name: "valid markdown", 17 | content: "# Title\n\nThis is valid markdown.\n", 18 | expectRules: []string{}, 19 | expectCount: 0, 20 | }, 21 | { 22 | name: "trailing spaces", 23 | content: "# Title \n\nContent with trailing spaces. \n", 24 | expectRules: []string{"MD009"}, 25 | expectCount: 2, 26 | }, 27 | { 28 | name: "hard tabs", 29 | content: "# Title\n\n\tContent with hard tab.\n", 30 | expectRules: []string{"MD010"}, 31 | expectCount: 1, 32 | }, 33 | { 34 | name: "multiple blank lines", 35 | content: "# Title\n\n\n\nContent after multiple blank lines.\n", 36 | expectRules: []string{"MD012"}, 37 | expectCount: 2, // MD012 triggers for each set of consecutive blank lines 38 | }, 39 | { 40 | name: "no space after hash", 41 | content: "#Title\n\nContent.\n", 42 | expectRules: []string{"MD018"}, 43 | expectCount: 1, 44 | }, 45 | { 46 | name: "multiple spaces after hash", 47 | content: "# Title\n\nContent.\n", 48 | expectRules: []string{"MD019"}, 49 | expectCount: 1, 50 | }, 51 | { 52 | name: "heading not at start of line", 53 | content: "Some text\n # Title\n\nContent.\n", 54 | expectRules: []string{"MD023"}, 55 | expectCount: 1, 56 | }, 57 | { 58 | name: "list without blank line before", 59 | content: "# Title\nSome text\n- List item\n\nContent.\n", 60 | expectRules: []string{"MD032"}, 61 | expectCount: 1, 62 | }, 63 | { 64 | name: "list without blank line after", 65 | content: "# Title\n\n- List item\nSome 
text\n", 66 | expectRules: []string{"MD032"}, 67 | expectCount: 1, 68 | }, 69 | { 70 | name: "file not ending with newline", 71 | content: "# Title\n\nContent without final newline", 72 | expectRules: []string{"MD047"}, 73 | expectCount: 1, 74 | }, 75 | { 76 | name: "file ending with multiple newlines", 77 | content: "# Title\n\nContent.\n\n", 78 | expectRules: []string{"MD047", "MD012"}, 79 | expectCount: 2, // Both MD047 and MD012 trigger 80 | }, 81 | } 82 | 83 | for _, tt := range tests { 84 | t.Run(tt.name, func(t *testing.T) { 85 | linter := New(&Config{}) 86 | result, err := linter.LintContent("test.md", tt.content) 87 | 88 | if err != nil { 89 | t.Fatalf("LintContent failed: %v", err) 90 | } 91 | 92 | if len(result.Issues) != tt.expectCount { 93 | t.Errorf("Expected %d issues, got %d", tt.expectCount, len(result.Issues)) 94 | for _, issue := range result.Issues { 95 | t.Logf("Issue: %s - %s", issue.Rule, issue.Message) 96 | } 97 | } 98 | 99 | // Check that expected rules are triggered 100 | foundRules := make(map[string]bool) 101 | for _, issue := range result.Issues { 102 | foundRules[issue.Rule] = true 103 | } 104 | 105 | for _, expectedRule := range tt.expectRules { 106 | if !foundRules[expectedRule] { 107 | t.Errorf("Expected rule %s to be triggered, but it wasn't", expectedRule) 108 | } 109 | } 110 | }) 111 | } 112 | } 113 | 114 | func TestLinter_AutoFix(t *testing.T) { 115 | tests := []struct { 116 | name string 117 | content string 118 | expectFixed bool 119 | expectFixCount int 120 | expectRules []string 121 | }{ 122 | { 123 | name: "fix trailing spaces", 124 | content: "# Title \n\nContent with trailing spaces. 
\n", 125 | expectFixed: true, 126 | expectFixCount: 2, 127 | expectRules: []string{"MD009"}, 128 | }, 129 | { 130 | name: "fix hard tabs", 131 | content: "# Title\n\n\tContent with hard tab.\n", 132 | expectFixed: true, 133 | expectFixCount: 1, 134 | expectRules: []string{"MD010"}, 135 | }, 136 | { 137 | name: "fix multiple blank lines", 138 | content: "# Title\n\n\n\nContent after multiple blank lines.\n", 139 | expectFixed: true, 140 | expectFixCount: 2, // MD012 triggers multiple times 141 | expectRules: []string{"MD012"}, 142 | }, 143 | } 144 | 145 | for _, tt := range tests { 146 | t.Run(tt.name, func(t *testing.T) { 147 | // Create a temporary file 148 | tmpFile, err := os.CreateTemp("", "test_*.md") 149 | if err != nil { 150 | t.Fatalf("Failed to create temp file: %v", err) 151 | } 152 | defer os.Remove(tmpFile.Name()) 153 | defer os.Remove(tmpFile.Name() + ".orig") // Remove backup file 154 | 155 | // Write content to temp file 156 | if _, err := tmpFile.WriteString(tt.content); err != nil { 157 | t.Fatalf("Failed to write to temp file: %v", err) 158 | } 159 | tmpFile.Close() 160 | 161 | // Run linter with auto-fix 162 | linter := New(&Config{AutoFix: true}) 163 | result, err := linter.LintFile(tmpFile.Name()) 164 | 165 | if err != nil { 166 | t.Fatalf("LintFile failed: %v", err) 167 | } 168 | 169 | if tt.expectFixed && result.FixedCount != tt.expectFixCount { 170 | t.Errorf("Expected %d fixes, got %d", tt.expectFixCount, result.FixedCount) 171 | } 172 | 173 | // Check that backup file was created 174 | if tt.expectFixed { 175 | if _, err := os.Stat(tmpFile.Name() + ".orig"); os.IsNotExist(err) { 176 | t.Error("Expected backup file to be created, but it wasn't") 177 | } 178 | } 179 | }) 180 | } 181 | } 182 | 183 | func TestLinter_BackupCreation(t *testing.T) { 184 | // Create a temporary file 185 | tmpFile, err := os.CreateTemp("", "test_*.md") 186 | if err != nil { 187 | t.Fatalf("Failed to create temp file: %v", err) 188 | } 189 | defer 
os.Remove(tmpFile.Name()) 190 | defer os.Remove(tmpFile.Name() + ".orig") 191 | 192 | originalContent := "# Title \n\nContent with trailing spaces. \n" 193 | if _, err := tmpFile.WriteString(originalContent); err != nil { 194 | t.Fatalf("Failed to write to temp file: %v", err) 195 | } 196 | tmpFile.Close() 197 | 198 | // Run linter with auto-fix 199 | linter := New(&Config{AutoFix: true}) 200 | _, err = linter.LintFile(tmpFile.Name()) 201 | 202 | if err != nil { 203 | t.Fatalf("LintFile failed: %v", err) 204 | } 205 | 206 | // Check that backup file exists and contains original content 207 | backupContent, err := os.ReadFile(tmpFile.Name() + ".orig") 208 | if err != nil { 209 | t.Fatalf("Failed to read backup file: %v", err) 210 | } 211 | 212 | if string(backupContent) != originalContent { 213 | t.Errorf("Backup content doesn't match original.\nExpected: %q\nGot: %q", originalContent, string(backupContent)) 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BINARY=mdctl 2 | VERSION=$(shell git describe --tags || echo "unknown version") 3 | BUILDTIME=$(shell date -u) 4 | GOBUILD=CGO_ENABLED=0 go build -trimpath -ldflags '-X "github.com/samzong/mdctl/cmd.Version=$(VERSION)" -X "github.com/samzong/mdctl/cmd.BuildTime=$(BUILDTIME)"' 5 | 6 | # Homebrew related variables 7 | CLEAN_VERSION=$(shell echo $(VERSION) | sed 's/^v//') 8 | DOWNLOAD_URL=https://github.com/samzong/mdctl/releases/download/$(VERSION)/mdctl-$(CLEAN_VERSION)-darwin-amd64.tar.gz 9 | HOMEBREW_TAP_REPO=homebrew-tap 10 | FORMULA_FILE=Formula/mdctl.rb 11 | BRANCH_NAME=update-mdctl-$(CLEAN_VERSION) 12 | 13 | # Adjust architecture definitions to match goreleaser output 14 | SUPPORTED_ARCHS = Darwin_x86_64 Darwin_arm64 Linux_x86_64 Linux_arm64 15 | 16 | .PHONY: deps 17 | deps: 18 | @echo "Installing Go dependencies..." 
19 | go mod download 20 | go mod verify 21 | 22 | .PHONY: build 23 | build: deps 24 | $(GOBUILD) -o bin/$(BINARY) 25 | 26 | .PHONY: test 27 | test: 28 | go test -v ./... 29 | 30 | .PHONY: clean 31 | clean: 32 | rm -rf bin/ 33 | go clean -i ./... 34 | 35 | .PHONY: fmt 36 | fmt: 37 | go fmt ./... 38 | go mod tidy 39 | 40 | .PHONY: all 41 | all: clean fmt build test 42 | 43 | .PHONY: update-homebrew 44 | update-homebrew: 45 | @echo "==> Starting Homebrew formula update process..." 46 | @if [ -z "$(GH_PAT)" ]; then \ 47 | echo "❌ Error: GH_PAT environment variable is required"; \ 48 | exit 1; \ 49 | fi 50 | 51 | @echo "==> Current version information:" 52 | @echo " - VERSION: $(VERSION)" 53 | @echo " - CLEAN_VERSION: $(CLEAN_VERSION)" 54 | 55 | @echo "==> Preparing working directory..." 56 | @rm -rf tmp && mkdir -p tmp 57 | 58 | @echo "==> Cloning Homebrew tap repository..." 59 | @cd tmp && git clone https://$(GH_PAT)@github.com/samzong/$(HOMEBREW_TAP_REPO).git 60 | @cd tmp/$(HOMEBREW_TAP_REPO) && echo " - Creating new branch: $(BRANCH_NAME)" && git checkout -b $(BRANCH_NAME) 61 | 62 | @echo "==> Processing architectures and calculating checksums..." 
63 | @cd tmp/$(HOMEBREW_TAP_REPO) && \ 64 | for arch in $(SUPPORTED_ARCHS); do \ 65 | echo " - Processing $$arch..."; \ 66 | if [ "$(DRY_RUN)" = "1" ]; then \ 67 | echo " [DRY_RUN] Would download: https://github.com/samzong/mdctl/releases/download/v$(CLEAN_VERSION)/mdctl_$${arch}.tar.gz"; \ 68 | case "$$arch" in \ 69 | Darwin_x86_64) DARWIN_AMD64_SHA="fake_sha_amd64" ;; \ 70 | Darwin_arm64) DARWIN_ARM64_SHA="fake_sha_arm64" ;; \ 71 | Linux_x86_64) LINUX_AMD64_SHA="fake_sha_linux_amd64" ;; \ 72 | Linux_arm64) LINUX_ARM64_SHA="fake_sha_linux_arm64" ;; \ 73 | esac; \ 74 | else \ 75 | echo " - Downloading release archive..."; \ 76 | curl -L -sSfO "https://github.com/samzong/mdctl/releases/download/v$(CLEAN_VERSION)/mdctl_$${arch}.tar.gz" || { echo "❌ Failed to download $$arch archive"; exit 1; }; \ 77 | echo " - Calculating SHA256..."; \ 78 | sha=$$(shasum -a 256 "mdctl_$${arch}.tar.gz" | cut -d' ' -f1); \ 79 | case "$$arch" in \ 80 | Darwin_x86_64) DARWIN_AMD64_SHA="$$sha"; echo " ✓ Darwin AMD64 SHA: $$sha" ;; \ 81 | Darwin_arm64) DARWIN_ARM64_SHA="$$sha"; echo " ✓ Darwin ARM64 SHA: $$sha" ;; \ 82 | Linux_x86_64) LINUX_AMD64_SHA="$$sha"; echo " ✓ Linux AMD64 SHA: $$sha" ;; \ 83 | Linux_arm64) LINUX_ARM64_SHA="$$sha"; echo " ✓ Linux ARM64 SHA: $$sha" ;; \ 84 | esac; \ 85 | fi; \ 86 | done; \ 87 | \ 88 | if [ "$(DRY_RUN)" = "1" ]; then \ 89 | echo "==> [DRY_RUN] Would update formula with:"; \ 90 | echo " - Darwin AMD64 SHA: $$DARWIN_AMD64_SHA"; \ 91 | echo " - Darwin ARM64 SHA: $$DARWIN_ARM64_SHA"; \ 92 | echo " - Linux AMD64 SHA: $$LINUX_AMD64_SHA"; \ 93 | echo " - Linux ARM64 SHA: $$LINUX_ARM64_SHA"; \ 94 | echo " - Would commit and push changes"; \ 95 | echo " - Would create PR"; \ 96 | else \ 97 | echo "==> Updating formula file..."; \ 98 | echo " - Updating version to $(CLEAN_VERSION)"; \ 99 | sed -i '' -e 's|version ".*"|version "$(CLEAN_VERSION)"|' $(FORMULA_FILE); \ 100 | \ 101 | echo " - Updating URLs and checksums"; \ 102 | sed -i '' \ 103 | -e 
'/on_macos/,/end/ { \ 104 | /if Hardware::CPU.arm?/,/else/ { \ 105 | s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Darwin_arm64.tar.gz"|; \ 106 | s|sha256 ".*"|sha256 "'"$$DARWIN_ARM64_SHA"'"|; \ 107 | }; \ 108 | /else/,/end/ { \ 109 | s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Darwin_x86_64.tar.gz"|; \ 110 | s|sha256 ".*"|sha256 "'"$$DARWIN_AMD64_SHA"'"|; \ 111 | }; \ 112 | }' \ 113 | -e '/on_linux/,/end/ { \ 114 | /if Hardware::CPU.arm?/,/else/ { \ 115 | s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Linux_arm64.tar.gz"|; \ 116 | s|sha256 ".*"|sha256 "'"$$LINUX_ARM64_SHA"'"|; \ 117 | }; \ 118 | /else/,/end/ { \ 119 | s|url ".*"|url "https://github.com/samzong/mdctl/releases/download/v#{version}/mdctl_Linux_x86_64.tar.gz"|; \ 120 | s|sha256 ".*"|sha256 "'"$$LINUX_AMD64_SHA"'"|; \ 121 | }; \ 122 | }' $(FORMULA_FILE); \ 123 | \ 124 | echo " - Checking for changes..."; \ 125 | if ! 
git diff --quiet $(FORMULA_FILE); then \ 126 | echo "==> Changes detected, creating pull request..."; \ 127 | echo " - Adding changes to git"; \ 128 | git add $(FORMULA_FILE); \ 129 | echo " - Committing changes"; \ 130 | git commit -m "chore: bump to $(VERSION)"; \ 131 | echo " - Pushing to remote"; \ 132 | git push -u origin $(BRANCH_NAME); \ 133 | echo " - Preparing pull request data"; \ 134 | pr_data=$$(jq -n \ 135 | --arg title "chore: update mdctl to $(VERSION)" \ 136 | --arg body "Auto-generated PR\nSHAs:\n- Darwin(amd64): $$DARWIN_AMD64_SHA\n- Darwin(arm64): $$DARWIN_ARM64_SHA" \ 137 | --arg head "$(BRANCH_NAME)" \ 138 | --arg base "main" \ 139 | '{title: $$title, body: $$body, head: $$head, base: $$base}'); \ 140 | echo " - Creating pull request"; \ 141 | curl -X POST \ 142 | -H "Authorization: token $(GH_PAT)" \ 143 | -H "Content-Type: application/json" \ 144 | https://api.github.com/repos/samzong/$(HOMEBREW_TAP_REPO)/pulls \ 145 | -d "$$pr_data"; \ 146 | echo "✅ Pull request created successfully"; \ 147 | else \ 148 | echo "❌ No changes detected in formula file"; \ 149 | exit 1; \ 150 | fi; \ 151 | fi 152 | 153 | @echo "==> Cleaning up temporary files..." 
154 | @rm -rf tmp 155 | @echo "✅ Homebrew formula update process completed" 156 | 157 | .PHONY: help 158 | help: 159 | @echo "Usage: make " 160 | @echo "Targets:" 161 | @echo " deps: Install Go dependencies" 162 | @echo " build: Build the binary" 163 | @echo " test: Run tests" 164 | @echo " clean: Clean up build artifacts" 165 | @echo " fmt: Format the code" 166 | @echo " all: Clean, format, build, and test" 167 | @echo " update-homebrew: Update Homebrew formula (requires GH_PAT)" 168 | 169 | .DEFAULT_GOAL := help 170 | -------------------------------------------------------------------------------- /internal/llmstxt/extractor.go: -------------------------------------------------------------------------------- 1 | package llmstxt 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "net/url" 7 | "path" 8 | "strings" 9 | 10 | "github.com/PuerkitoBio/goquery" 11 | ) 12 | 13 | // Extract page information from HTML content 14 | func (g *Generator) extractPageInfo(urlStr string, resp *http.Response) (PageInfo, error) { 15 | // Create PageInfo object 16 | pageInfo := PageInfo{ 17 | URL: urlStr, 18 | Section: parseSection(urlStr), 19 | } 20 | 21 | // Parse HTML 22 | doc, err := goquery.NewDocumentFromReader(resp.Body) 23 | if err != nil { 24 | return pageInfo, err 25 | } 26 | 27 | // Extract title 28 | pageInfo.Title = extractTitle(doc) 29 | if g.config.VeryVerbose { 30 | g.logger.Printf("Extracted title from %s: %s", urlStr, pageInfo.Title) 31 | } 32 | 33 | if pageInfo.Title == "" { 34 | // If title cannot be extracted, use the last segment of the URL as the title 35 | pageInfo.Title = extractTitleFromURL(urlStr) 36 | if g.config.VeryVerbose { 37 | g.logger.Printf("Could not extract title, using URL-based title instead: %s", pageInfo.Title) 38 | } 39 | } 40 | 41 | // Extract description 42 | pageInfo.Description = extractDescription(doc) 43 | if g.config.VeryVerbose { 44 | g.logger.Printf("Extracted description from %s: %s", urlStr, truncateString(pageInfo.Description, 100)) 45 
| } 46 | 47 | // Extract content in full mode 48 | if g.config.FullMode { 49 | if g.config.VeryVerbose { 50 | g.logger.Printf("Extracting full content from %s", urlStr) 51 | } 52 | pageInfo.Content = extractContent(doc) 53 | if g.config.VeryVerbose { 54 | contentLen := len(pageInfo.Content) 55 | preview := truncateString(pageInfo.Content, 100) 56 | g.logger.Printf("Extracted content from %s (%d chars): %s", urlStr, contentLen, preview) 57 | } 58 | } 59 | 60 | return pageInfo, nil 61 | } 62 | 63 | // Helper function: truncate string and add ellipsis 64 | func truncateString(s string, maxLen int) string { 65 | s = strings.TrimSpace(s) 66 | if len(s) <= maxLen { 67 | return s 68 | } 69 | return s[:maxLen] + "..." 70 | } 71 | 72 | // Extract section information from URL 73 | func parseSection(urlStr string) string { 74 | // Parse URL 75 | parsedURL, err := url.Parse(urlStr) 76 | if err != nil { 77 | return "ROOT" 78 | } 79 | 80 | // Split path 81 | pathParts := strings.Split(strings.Trim(parsedURL.Path, "/"), "/") 82 | 83 | // If path is empty, return ROOT 84 | if len(pathParts) == 0 || pathParts[0] == "" { 85 | return "ROOT" 86 | } 87 | 88 | // Return first segment of path 89 | return pathParts[0] 90 | } 91 | 92 | // Extract title from HTML document 93 | func extractTitle(doc *goquery.Document) string { 94 | // Try to extract from title tag 95 | title := doc.Find("title").First().Text() 96 | title = strings.TrimSpace(title) 97 | 98 | // If no title tag, try to extract from h1 tag 99 | if title == "" { 100 | title = doc.Find("h1").First().Text() 101 | title = strings.TrimSpace(title) 102 | } 103 | 104 | return title 105 | } 106 | 107 | // Extract title from URL 108 | func extractTitleFromURL(urlStr string) string { 109 | // Parse URL 110 | parsedURL, err := url.Parse(urlStr) 111 | if err != nil { 112 | return urlStr 113 | } 114 | 115 | // Get the last segment of the path 116 | basename := path.Base(parsedURL.Path) 117 | 118 | // Remove file extension 119 | basename = 
strings.TrimSuffix(basename, path.Ext(basename)) 120 | 121 | // If basename is empty or is "/", use hostname 122 | if basename == "" || basename == "." || basename == "/" { 123 | return parsedURL.Hostname() 124 | } 125 | 126 | // Replace hyphens and underscores with spaces, and capitalize 127 | basename = strings.ReplaceAll(basename, "-", " ") 128 | basename = strings.ReplaceAll(basename, "_", " ") 129 | 130 | return strings.Title(basename) 131 | } 132 | 133 | // Extract description from HTML document 134 | func extractDescription(doc *goquery.Document) string { 135 | var description string 136 | 137 | // Try meta description 138 | description, _ = doc.Find("meta[name='description']").Attr("content") 139 | if description != "" { 140 | return strings.TrimSpace(description) 141 | } 142 | 143 | // Try og:description 144 | description, _ = doc.Find("meta[property='og:description']").Attr("content") 145 | if description != "" { 146 | return strings.TrimSpace(description) 147 | } 148 | 149 | // Try twitter:description 150 | description, _ = doc.Find("meta[name='twitter:description']").Attr("content") 151 | if description != "" { 152 | return strings.TrimSpace(description) 153 | } 154 | 155 | // If none found, extract first text 156 | description = doc.Find("p").First().Text() 157 | if description != "" { 158 | // Limit length 159 | if len(description) > 200 { 160 | description = description[:197] + "..." 
161 | } 162 | return strings.TrimSpace(description) 163 | } 164 | 165 | return "No description available" 166 | } 167 | 168 | // Extract content from HTML document 169 | func extractContent(doc *goquery.Document) string { 170 | var content strings.Builder 171 | 172 | // Try to find main content area 173 | mainContent := doc.Find("article, main, #content, .content, .post-content").First() 174 | 175 | // If no specific content area found, use body 176 | if mainContent.Length() == 0 { 177 | mainContent = doc.Find("body") 178 | } 179 | 180 | // Extract all paragraphs 181 | mainContent.Find("p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote").Each(func(i int, s *goquery.Selection) { 182 | // Get tag name 183 | tagName := goquery.NodeName(s) 184 | text := strings.TrimSpace(s.Text()) 185 | 186 | if text == "" { 187 | return 188 | } 189 | 190 | // Format according to tag type 191 | switch tagName { 192 | case "h1": 193 | content.WriteString("# " + text + "\n\n") 194 | case "h2": 195 | content.WriteString("## " + text + "\n\n") 196 | case "h3": 197 | content.WriteString("### " + text + "\n\n") 198 | case "h4": 199 | content.WriteString("#### " + text + "\n\n") 200 | case "h5": 201 | content.WriteString("##### " + text + "\n\n") 202 | case "h6": 203 | content.WriteString("###### " + text + "\n\n") 204 | case "p": 205 | content.WriteString(text + "\n\n") 206 | case "blockquote": 207 | content.WriteString("> " + text + "\n\n") 208 | case "ul", "ol": 209 | s.Find("li").Each(func(j int, li *goquery.Selection) { 210 | liText := strings.TrimSpace(li.Text()) 211 | if liText != "" { 212 | if tagName == "ul" { 213 | content.WriteString("- " + liText + "\n") 214 | } else { 215 | content.WriteString(fmt.Sprintf("%d. 
%s\n", j+1, liText)) 216 | } 217 | } 218 | }) 219 | content.WriteString("\n") 220 | } 221 | }) 222 | 223 | // Limit content length 224 | contentStr := content.String() 225 | if len(contentStr) > 10000 { 226 | // Find last paragraph end position 227 | lastParaEnd := strings.LastIndex(contentStr[:10000], "\n\n") 228 | if lastParaEnd == -1 { 229 | lastParaEnd = 10000 230 | } 231 | contentStr = contentStr[:lastParaEnd] + "\n\n... (content truncated)" 232 | } 233 | 234 | return contentStr 235 | } 236 | -------------------------------------------------------------------------------- /internal/llmstxt/sitemap.go: -------------------------------------------------------------------------------- 1 | package llmstxt 2 | 3 | import ( 4 | "encoding/xml" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "strings" 9 | "time" 10 | 11 | "github.com/gobwas/glob" 12 | ) 13 | 14 | // Sitemap XML structure 15 | type Sitemap struct { 16 | XMLName xml.Name `xml:"urlset"` 17 | URLs []struct { 18 | Loc string `xml:"loc"` 19 | LastMod string `xml:"lastmod,omitempty"` 20 | ChangeFreq string `xml:"changefreq,omitempty"` 21 | Priority string `xml:"priority,omitempty"` 22 | } `xml:"url"` 23 | } 24 | 25 | // SitemapIndex XML structure 26 | type SitemapIndex struct { 27 | XMLName xml.Name `xml:"sitemapindex"` 28 | Sitemaps []struct { 29 | Loc string `xml:"loc"` 30 | LastMod string `xml:"lastmod,omitempty"` 31 | } `xml:"sitemap"` 32 | } 33 | 34 | // Parse sitemap.xml file and return all URLs 35 | func (g *Generator) parseSitemap() ([]string, error) { 36 | g.logger.Printf("Parsing sitemap from %s", g.config.SitemapURL) 37 | 38 | // Set HTTP client 39 | client := &http.Client{ 40 | Timeout: time.Duration(g.config.Timeout) * time.Second, 41 | } 42 | 43 | // Build request 44 | req, err := http.NewRequest("GET", g.config.SitemapURL, nil) 45 | if err != nil { 46 | return nil, fmt.Errorf("failed to create request: %w", err) 47 | } 48 | 49 | // Set User-Agent 50 | req.Header.Set("User-Agent", g.config.UserAgent) 51 | 
52 | // Send request 53 | resp, err := client.Do(req) 54 | if err != nil { 55 | return nil, fmt.Errorf("failed to fetch sitemap: %w", err) 56 | } 57 | defer resp.Body.Close() 58 | 59 | if resp.StatusCode != http.StatusOK { 60 | return nil, fmt.Errorf("failed to fetch sitemap, status code: %d", resp.StatusCode) 61 | } 62 | 63 | // Read response body 64 | body, err := io.ReadAll(resp.Body) 65 | if err != nil { 66 | return nil, fmt.Errorf("failed to read sitemap content: %w", err) 67 | } 68 | 69 | // Try to parse as standard sitemap 70 | var sitemap Sitemap 71 | if err := xml.Unmarshal(body, &sitemap); err == nil && len(sitemap.URLs) > 0 { 72 | g.logger.Println("Parsed standard sitemap") 73 | return g.extractURLsFromSitemap(sitemap), nil 74 | } 75 | 76 | // Try to parse as sitemap index 77 | var sitemapIndex SitemapIndex 78 | if err := xml.Unmarshal(body, &sitemapIndex); err == nil && len(sitemapIndex.Sitemaps) > 0 { 79 | g.logger.Println("Parsed sitemap index, fetching child sitemaps") 80 | return g.fetchSitemapIndex(sitemapIndex, client) 81 | } 82 | 83 | // If all parsing fails, try to handle as text sitemap (one URL per line) 84 | lines := string(body) 85 | if len(lines) > 0 { 86 | g.logger.Println("Parsing as text sitemap") 87 | return g.parseTextSitemap(lines), nil 88 | } 89 | 90 | return nil, fmt.Errorf("could not parse sitemap, unknown format") 91 | } 92 | 93 | // Extract URLs from standard sitemap 94 | func (g *Generator) extractURLsFromSitemap(sitemap Sitemap) []string { 95 | urls := make([]string, 0, len(sitemap.URLs)) 96 | for _, urlEntry := range sitemap.URLs { 97 | if urlEntry.Loc != "" { 98 | urls = append(urls, urlEntry.Loc) 99 | } 100 | } 101 | return urls 102 | } 103 | 104 | // Get all child sitemap URLs from sitemap index 105 | func (g *Generator) fetchSitemapIndex(index SitemapIndex, client *http.Client) ([]string, error) { 106 | var allURLs []string 107 | 108 | for _, sitemapEntry := range index.Sitemaps { 109 | if sitemapEntry.Loc == "" { 110 | 
continue 111 | } 112 | 113 | g.logger.Printf("Fetching child sitemap: %s", sitemapEntry.Loc) 114 | 115 | // Build request 116 | req, err := http.NewRequest("GET", sitemapEntry.Loc, nil) 117 | if err != nil { 118 | g.logger.Printf("Warning: failed to create request for child sitemap %s: %v", sitemapEntry.Loc, err) 119 | continue 120 | } 121 | 122 | // Set User-Agent 123 | req.Header.Set("User-Agent", g.config.UserAgent) 124 | 125 | // Send request 126 | resp, err := client.Do(req) 127 | if err != nil { 128 | g.logger.Printf("Warning: failed to fetch child sitemap %s: %v", sitemapEntry.Loc, err) 129 | continue 130 | } 131 | 132 | // Read response body 133 | body, err := io.ReadAll(resp.Body) 134 | resp.Body.Close() 135 | if err != nil { 136 | g.logger.Printf("Warning: failed to read child sitemap %s: %v", sitemapEntry.Loc, err) 137 | continue 138 | } 139 | 140 | // Parse child sitemap 141 | var childSitemap Sitemap 142 | if err := xml.Unmarshal(body, &childSitemap); err != nil { 143 | g.logger.Printf("Warning: failed to parse child sitemap %s: %v", sitemapEntry.Loc, err) 144 | continue 145 | } 146 | 147 | // Extract URLs 148 | childURLs := g.extractURLsFromSitemap(childSitemap) 149 | g.logger.Printf("Found %d URLs in child sitemap %s", len(childURLs), sitemapEntry.Loc) 150 | allURLs = append(allURLs, childURLs...) 
151 | } 152 | 153 | return allURLs, nil 154 | } 155 | 156 | // Parse text sitemap (one URL per line) 157 | func (g *Generator) parseTextSitemap(content string) []string { 158 | lines := splitLines(content) 159 | var urls []string 160 | 161 | for _, line := range lines { 162 | line = normalizeURL(line) 163 | if isValidURL(line) { 164 | urls = append(urls, line) 165 | } 166 | } 167 | 168 | return urls 169 | } 170 | 171 | // Filter URLs based on include/exclude mode 172 | func (g *Generator) filterURLs(urls []string) []string { 173 | if len(g.config.IncludePaths) == 0 && len(g.config.ExcludePaths) == 0 { 174 | return urls // No filtering rules, return directly 175 | } 176 | 177 | // Compile include/exclude mode 178 | var includeMatchers, excludeMatchers []glob.Glob 179 | for _, pattern := range g.config.IncludePaths { 180 | matcher, err := glob.Compile(pattern) 181 | if err != nil { 182 | g.logger.Printf("Warning: invalid include pattern '%s': %v", pattern, err) 183 | continue 184 | } 185 | includeMatchers = append(includeMatchers, matcher) 186 | } 187 | 188 | for _, pattern := range g.config.ExcludePaths { 189 | matcher, err := glob.Compile(pattern) 190 | if err != nil { 191 | g.logger.Printf("Warning: invalid exclude pattern '%s': %v", pattern, err) 192 | continue 193 | } 194 | excludeMatchers = append(excludeMatchers, matcher) 195 | } 196 | 197 | var filteredURLs []string 198 | for _, url := range urls { 199 | // If there are include rules, one of them must match 200 | if len(includeMatchers) > 0 { 201 | matched := false 202 | for _, matcher := range includeMatchers { 203 | if matcher.Match(url) { 204 | matched = true 205 | break 206 | } 207 | } 208 | if !matched { 209 | continue 210 | } 211 | } 212 | 213 | // If any exclude rules match, exclude 214 | excluded := false 215 | for _, matcher := range excludeMatchers { 216 | if matcher.Match(url) { 217 | excluded = true 218 | break 219 | } 220 | } 221 | if excluded { 222 | continue 223 | } 224 | 225 | filteredURLs = 
append(filteredURLs, url) 226 | } 227 | 228 | return filteredURLs 229 | } 230 | 231 | // Helper function: split text by line 232 | func splitLines(s string) []string { 233 | return strings.Split(s, "\n") 234 | } 235 | 236 | // Helper function: normalize URL (remove spaces, etc.) 237 | func normalizeURL(url string) string { 238 | return url 239 | } 240 | 241 | // Helper function: check if URL is valid 242 | func isValidURL(url string) bool { 243 | return url != "" 244 | } 245 | -------------------------------------------------------------------------------- /internal/linter/rules_test.go: -------------------------------------------------------------------------------- 1 | package linter 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestMD047_FileEndingCheck(t *testing.T) { 8 | tests := []struct { 9 | name string 10 | lines []string 11 | expectIssue bool 12 | description string 13 | }{ 14 | { 15 | name: "file ends with single newline", 16 | lines: []string{"# Title", "Content", ""}, 17 | expectIssue: false, 18 | description: "should not trigger issue when file ends with single newline", 19 | }, 20 | { 21 | name: "file does not end with newline", 22 | lines: []string{"# Title", "Content"}, 23 | expectIssue: true, 24 | description: "should trigger issue when file doesn't end with newline", 25 | }, 26 | { 27 | name: "file ends with multiple newlines", 28 | lines: []string{"# Title", "Content", "", ""}, 29 | expectIssue: true, 30 | description: "should trigger issue when file ends with multiple newlines", 31 | }, 32 | { 33 | name: "empty file", 34 | lines: []string{}, 35 | expectIssue: false, 36 | description: "should not trigger issue for empty file", 37 | }, 38 | } 39 | 40 | rule := &MD047{BaseRule: BaseRule{id: "MD047", description: "Files should end with a single newline character", enabled: true}} 41 | 42 | for _, tt := range tests { 43 | t.Run(tt.name, func(t *testing.T) { 44 | issues := rule.Check(tt.lines) 45 | hasIssue := len(issues) > 0 46 | 47 | if 
hasIssue != tt.expectIssue { 48 | t.Errorf("%s: expected issue=%t, got issue=%t", tt.description, tt.expectIssue, hasIssue) 49 | if hasIssue { 50 | for _, issue := range issues { 51 | t.Logf("Issue: %s", issue.Message) 52 | } 53 | } 54 | } 55 | }) 56 | } 57 | } 58 | 59 | func TestMD032_ListBlankLines(t *testing.T) { 60 | tests := []struct { 61 | name string 62 | lines []string 63 | expectCount int 64 | description string 65 | }{ 66 | { 67 | name: "list with proper blank lines", 68 | lines: []string{ 69 | "# Title", 70 | "", 71 | "- Item 1", 72 | "- Item 2", 73 | "", 74 | "Content after list", 75 | }, 76 | expectCount: 0, 77 | description: "should not trigger issue when list has proper blank lines", 78 | }, 79 | { 80 | name: "list without blank line before", 81 | lines: []string{ 82 | "# Title", 83 | "Some text", 84 | "- Item 1", 85 | "", 86 | "Content after list", 87 | }, 88 | expectCount: 1, 89 | description: "should trigger issue when list doesn't have blank line before", 90 | }, 91 | { 92 | name: "list without blank line after", 93 | lines: []string{ 94 | "# Title", 95 | "", 96 | "- Item 1", 97 | "Content after list", 98 | }, 99 | expectCount: 1, 100 | description: "should trigger issue when list doesn't have blank line after", 101 | }, 102 | { 103 | name: "list without blank lines before and after", 104 | lines: []string{ 105 | "# Title", 106 | "Some text", 107 | "- Item 1", 108 | "Content after list", 109 | }, 110 | expectCount: 2, 111 | description: "should trigger 2 issues when list doesn't have blank lines before and after", 112 | }, 113 | } 114 | 115 | rule := &MD032{BaseRule: BaseRule{id: "MD032", description: "Lists should be surrounded by blank lines", enabled: true}} 116 | 117 | for _, tt := range tests { 118 | t.Run(tt.name, func(t *testing.T) { 119 | issues := rule.Check(tt.lines) 120 | 121 | if len(issues) != tt.expectCount { 122 | t.Errorf("%s: expected %d issues, got %d issues", tt.description, tt.expectCount, len(issues)) 123 | for i, issue := 
range issues { 124 | t.Logf("Issue %d: Line %d - %s", i+1, issue.Line, issue.Message) 125 | } 126 | } 127 | }) 128 | } 129 | } 130 | 131 | func TestRegexPrecompilation(t *testing.T) { 132 | tests := []struct { 133 | name string 134 | rule Rule 135 | }{ 136 | {"MD018", &MD018{BaseRule: BaseRule{id: "MD018", enabled: true}}}, 137 | {"MD019", &MD019{BaseRule: BaseRule{id: "MD019", enabled: true}}}, 138 | {"MD023", &MD023{BaseRule: BaseRule{id: "MD023", enabled: true}}}, 139 | {"MD032", &MD032{BaseRule: BaseRule{id: "MD032", enabled: true}}}, 140 | } 141 | 142 | for _, tt := range tests { 143 | t.Run(tt.name, func(t *testing.T) { 144 | // Call Check method to trigger regex compilation 145 | _ = tt.rule.Check([]string{"# Test", "Content"}) 146 | 147 | // Check that pattern was compiled for rules that have patterns 148 | switch rule := tt.rule.(type) { 149 | case *MD018: 150 | if rule.pattern == nil { 151 | t.Error("MD018 pattern was not compiled") 152 | } 153 | case *MD019: 154 | if rule.pattern == nil { 155 | t.Error("MD019 pattern was not compiled") 156 | } 157 | case *MD023: 158 | if rule.pattern == nil { 159 | t.Error("MD023 pattern was not compiled") 160 | } 161 | case *MD032: 162 | if rule.pattern == nil { 163 | t.Error("MD032 pattern was not compiled") 164 | } 165 | } 166 | }) 167 | } 168 | } 169 | 170 | func TestMD018_NoSpaceAfterHash(t *testing.T) { 171 | rule := &MD018{BaseRule: BaseRule{id: "MD018", enabled: true}} 172 | 173 | tests := []struct { 174 | line string 175 | expectIssue bool 176 | }{ 177 | {"# Proper heading", false}, 178 | {"#Bad heading", true}, 179 | {"## Another proper heading", false}, 180 | {"##Bad heading", true}, 181 | {"### Yet another proper heading", false}, 182 | {"###Bad heading", true}, 183 | {"Not a heading", false}, 184 | {"", false}, 185 | } 186 | 187 | for _, tt := range tests { 188 | t.Run(tt.line, func(t *testing.T) { 189 | issues := rule.Check([]string{tt.line}) 190 | hasIssue := len(issues) > 0 191 | 192 | if hasIssue != 
tt.expectIssue { 193 | t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue) 194 | } 195 | }) 196 | } 197 | } 198 | 199 | func TestMD019_MultipleSpacesAfterHash(t *testing.T) { 200 | rule := &MD019{BaseRule: BaseRule{id: "MD019", enabled: true}} 201 | 202 | tests := []struct { 203 | line string 204 | expectIssue bool 205 | }{ 206 | {"# Proper heading", false}, 207 | {"# Bad heading", true}, 208 | {"## Another proper heading", false}, 209 | {"## Bad heading", true}, 210 | {"### Very bad heading", true}, 211 | {"Not a heading", false}, 212 | {"", false}, 213 | } 214 | 215 | for _, tt := range tests { 216 | t.Run(tt.line, func(t *testing.T) { 217 | issues := rule.Check([]string{tt.line}) 218 | hasIssue := len(issues) > 0 219 | 220 | if hasIssue != tt.expectIssue { 221 | t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue) 222 | } 223 | }) 224 | } 225 | } 226 | 227 | func TestMD023_HeadingAtStartOfLine(t *testing.T) { 228 | rule := &MD023{BaseRule: BaseRule{id: "MD023", enabled: true}} 229 | 230 | tests := []struct { 231 | line string 232 | expectIssue bool 233 | }{ 234 | {"# Proper heading", false}, 235 | {" # Bad heading", true}, 236 | {" ## Very bad heading", true}, 237 | {"Not a heading", false}, 238 | {"", false}, 239 | } 240 | 241 | for _, tt := range tests { 242 | t.Run(tt.line, func(t *testing.T) { 243 | issues := rule.Check([]string{tt.line}) 244 | hasIssue := len(issues) > 0 245 | 246 | if hasIssue != tt.expectIssue { 247 | t.Errorf("Line %q: expected issue=%t, got issue=%t", tt.line, tt.expectIssue, hasIssue) 248 | } 249 | }) 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /internal/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | ) 9 | 10 | type CloudConfig struct { 11 | Provider string 
// CloudConfig holds the settings for a single cloud storage target
// (S3-compatible providers such as AWS S3, Cloudflare R2, MinIO).
type CloudConfig struct {
	Provider       string            `json:"provider"`
	Region         string            `json:"region"`
	Endpoint       string            `json:"endpoint"`
	AccessKey      string            `json:"access_key"`
	SecretKey      string            `json:"secret_key"`
	Bucket         string            `json:"bucket"`
	AccountID      string            `json:"account_id,omitempty"`
	CustomDomain   string            `json:"custom_domain,omitempty"`
	PathPrefix     string            `json:"path_prefix,omitempty"`
	ProviderOpts   map[string]string `json:"provider_opts,omitempty"`
	Concurrency    int               `json:"concurrency"`
	SkipVerify     bool              `json:"skip_verify"`
	CACertPath     string            `json:"ca_cert_path,omitempty"`
	ConflictPolicy string            `json:"conflict_policy"`
	CacheDir       string            `json:"cache_dir,omitempty"`
}

// Config is the persisted mdctl configuration, stored as JSON at the path
// returned by GetConfigPath.
type Config struct {
	TranslatePrompt   string                 `json:"translate_prompt"`
	OpenAIEndpointURL string                 `json:"endpoint"`
	OpenAIAPIKey      string                 `json:"api_key"`
	ModelName         string                 `json:"model"`
	Temperature       float64                `json:"temperature"`
	TopP              float64                `json:"top_p"`
	CloudStorages     map[string]CloudConfig `json:"cloud_storages,omitempty"`
	DefaultStorage    string                 `json:"default_storage,omitempty"`
}

// DefaultCloudConfig is the zero-configuration fallback returned when no
// cloud storage has been configured.
var DefaultCloudConfig = CloudConfig{
	Provider:       "",
	Region:         "auto",
	Endpoint:       "",
	AccessKey:      "",
	SecretKey:      "",
	Bucket:         "",
	Concurrency:    5,
	SkipVerify:     false,
	ConflictPolicy: "rename",
}

// DefaultConfig is written on first run and used to backfill missing fields.
var DefaultConfig = Config{
	TranslatePrompt:   "Translate the markdown to {TARGET_LANG} as a native speaker - preserve code/YAML/links/cli commands (e.g. `kubectl apply` or `pip install langchain`) and tech terms (CRDs, Helm charts, RAG). Output ONLY fluently localized text with natural technical phrasing that doesn't read machine-generated.",
	OpenAIEndpointURL: "https://api.openai.com/v1",
	OpenAIAPIKey:      "",
	ModelName:         "gpt-3.5-turbo",
	Temperature:       0.0,
	TopP:              1.0,
	CloudStorages:     make(map[string]CloudConfig),
}

// GetConfigPath returns the config file location under the user's home
// directory, or "" if the home directory cannot be determined.
func GetConfigPath() string {
	homeDir, err := os.UserHomeDir()
	if err != nil {
		return ""
	}
	return filepath.Join(homeDir, ".config", "mdctl", "config.json")
}

// LoadConfig reads the config file, creating it with defaults on first run.
// An unparseable file is preserved as a .bak backup (instead of being
// deleted, as the previous implementation did) before defaults are written.
func LoadConfig() (*Config, error) {
	configPath := GetConfigPath()
	if configPath == "" {
		return &DefaultConfig, nil
	}

	if _, err := os.Stat(configPath); os.IsNotExist(err) {
		if err := SaveConfig(&DefaultConfig); err != nil {
			return &DefaultConfig, fmt.Errorf("failed to create default config: %v", err)
		}
		return &DefaultConfig, nil
	}

	data, err := os.ReadFile(configPath)
	if err != nil {
		return &DefaultConfig, fmt.Errorf("failed to read config file: %v", err)
	}

	var config Config
	if err := json.Unmarshal(data, &config); err != nil {
		// Keep the user's file (it may contain credentials) instead of
		// removing it outright; best-effort rename, then recreate defaults.
		backupPath := configPath + ".bak"
		_ = os.Rename(configPath, backupPath)
		if err := SaveConfig(&DefaultConfig); err != nil {
			return &DefaultConfig, fmt.Errorf("failed to create new config after invalid file: %v", err)
		}
		return &DefaultConfig, fmt.Errorf("invalid config file (recreated with defaults, original kept at %s): %v", backupPath, err)
	}

	// Backfill any fields the user left empty with the shipped defaults.
	if config.TranslatePrompt == "" {
		config.TranslatePrompt = DefaultConfig.TranslatePrompt
	}
	if config.OpenAIEndpointURL == "" {
		config.OpenAIEndpointURL = DefaultConfig.OpenAIEndpointURL
	}
	if config.ModelName == "" {
		config.ModelName = DefaultConfig.ModelName
	}

	// Normalize cloud storage state (non-nil map, valid default storage).
	// This replaces logic that previously duplicated ApplyCloudConfig inline.
	config.ApplyCloudConfig()

	return &config, nil
}

// SaveConfig writes config as indented JSON, creating the directory if
// needed. The file holds cloud credentials, so it is written 0600.
func SaveConfig(config *Config) error {
	configPath := GetConfigPath()
	if configPath == "" {
		return fmt.Errorf("failed to get config path")
	}

	configDir := filepath.Dir(configPath)
	if err := os.MkdirAll(configDir, 0755); err != nil {
		return fmt.Errorf("failed to create config directory: %v", err)
	}

	data, err := json.MarshalIndent(config, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal config: %v", err)
	}

	// 0600: the file contains access/secret keys and API keys.
	if err := os.WriteFile(configPath, data, 0600); err != nil {
		return fmt.Errorf("failed to write config file: %v", err)
	}

	return nil
}

// ApplyCloudConfig normalizes the cloud storage configuration: it ensures
// CloudStorages is non-nil, clears a DefaultStorage that points at a
// nonexistent entry, and promotes an arbitrary entry to default when none
// is set.
func (c *Config) ApplyCloudConfig() {
	if c.CloudStorages == nil {
		c.CloudStorages = make(map[string]CloudConfig)
	}

	// A default pointing at a missing entry is stale: clear it, then fall
	// through to pick a replacement if any storages exist.
	if c.DefaultStorage != "" {
		if _, exists := c.CloudStorages[c.DefaultStorage]; !exists {
			c.DefaultStorage = ""
		}
	}

	// No default but storages exist: promote one (map order is arbitrary).
	if c.DefaultStorage == "" && len(c.CloudStorages) > 0 {
		for name := range c.CloudStorages {
			c.DefaultStorage = name
			break
		}
	}
}

// GetActiveCloudConfig returns the cloud storage configuration to use.
// Resolution order: the named storage, then the default storage, then any
// configured storage, then DefaultCloudConfig.
func (c *Config) GetActiveCloudConfig(storageName string) CloudConfig {
	if storageName != "" {
		if storage, exists := c.CloudStorages[storageName]; exists {
			return storage
		}
	}

	if c.DefaultStorage != "" {
		if storage, exists := c.CloudStorages[c.DefaultStorage]; exists {
			return storage
		}
	}

	// Any configured storage beats the empty default (map order arbitrary).
	for _, storage := range c.CloudStorages {
		return storage
	}

	return DefaultCloudConfig
}
rules to find syntax issues. 29 | 30 | This command will scan markdown files and report any syntax issues found. 31 | It can also automatically fix issues when --fix flag is used. 32 | 33 | Examples: 34 | # Lint a single file 35 | mdctl lint README.md 36 | 37 | # Lint multiple files 38 | mdctl lint docs/*.md 39 | 40 | # Lint with auto-fix 41 | mdctl lint --fix README.md 42 | 43 | # Lint with custom rules configuration 44 | mdctl lint --config .markdownlint.json README.md 45 | 46 | # Enable specific rules 47 | mdctl lint --enable MD001,MD003 README.md 48 | 49 | # Disable specific rules 50 | mdctl lint --disable MD013,MD033 README.md 51 | 52 | # Create a default configuration file 53 | mdctl lint --init 54 | 55 | # Create a configuration file with custom name 56 | mdctl lint --init --init-config my-rules.json`, 57 | RunE: func(cmd *cobra.Command, args []string) error { 58 | // Handle config initialization 59 | if initConfig { 60 | configFile := configOutput 61 | if configFile == "" { 62 | configFile = ".markdownlint.json" 63 | } 64 | 65 | if err := linter.CreateDefaultConfig(configFile); err != nil { 66 | return fmt.Errorf("failed to create config file: %v", err) 67 | } 68 | 69 | fmt.Printf("Created markdownlint configuration file: %s\n", configFile) 70 | return nil 71 | } 72 | 73 | if len(args) == 0 { 74 | return fmt.Errorf("at least one markdown file must be specified") 75 | } 76 | 77 | // Expand file patterns 78 | var files []string 79 | for _, arg := range args { 80 | // Basic security validation - prevent path traversal 81 | if strings.Contains(arg, "..") { 82 | return fmt.Errorf("path traversal not allowed: %s", arg) 83 | } 84 | 85 | matches, err := filepath.Glob(arg) 86 | if err != nil { 87 | return fmt.Errorf("invalid file pattern %s: %v", arg, err) 88 | } 89 | if len(matches) == 0 { 90 | // If no glob matches, check if it's a direct file 91 | if _, err := os.Stat(arg); err == nil { 92 | files = append(files, arg) 93 | } else { 94 | fmt.Printf("Warning: No 
files found matching pattern: %s\n", arg) 95 | } 96 | } else { 97 | files = append(files, matches...) 98 | } 99 | } 100 | 101 | // Filter for markdown files 102 | var markdownFiles []string 103 | for _, file := range files { 104 | if strings.HasSuffix(strings.ToLower(file), ".md") || strings.HasSuffix(strings.ToLower(file), ".markdown") { 105 | markdownFiles = append(markdownFiles, file) 106 | } 107 | } 108 | 109 | if len(markdownFiles) == 0 { 110 | return fmt.Errorf("no markdown files found") 111 | } 112 | 113 | // Create linter configuration 114 | config := &linter.Config{ 115 | AutoFix: autoFix, 116 | OutputFormat: outputFormat, 117 | RulesFile: rulesFile, 118 | EnableRules: enableRules, 119 | DisableRules: disableRules, 120 | Verbose: verbose, 121 | } 122 | 123 | // Create linter instance 124 | mdLinter := linter.New(config) 125 | 126 | // Process files 127 | var totalIssues int 128 | var totalFixed int 129 | 130 | for _, file := range markdownFiles { 131 | if verbose { 132 | fmt.Printf("Linting: %s\n", file) 133 | } 134 | 135 | result, err := mdLinter.LintFile(file) 136 | if err != nil { 137 | fmt.Printf("Error linting %s: %v\n", file, err) 138 | continue 139 | } 140 | 141 | totalIssues += len(result.Issues) 142 | totalFixed += result.FixedCount 143 | 144 | // Display results based on output format 145 | if err := displayResults(file, result, config); err != nil { 146 | return fmt.Errorf("error displaying results: %v", err) 147 | } 148 | } 149 | 150 | // Summary 151 | if verbose || len(markdownFiles) > 1 { 152 | fmt.Printf("\nSummary:\n") 153 | fmt.Printf(" Files processed: %d\n", len(markdownFiles)) 154 | fmt.Printf(" Total issues: %d\n", totalIssues) 155 | if autoFix { 156 | fmt.Printf(" Issues fixed: %d\n", totalFixed) 157 | } 158 | } 159 | 160 | // Exit with error code if issues found and not in fix mode 161 | if totalIssues > 0 && !autoFix { 162 | os.Exit(1) 163 | } 164 | 165 | return nil 166 | }, 167 | } 168 | 169 | func displayResults(filename string, 
result *linter.Result, config *linter.Config) error { 170 | switch config.OutputFormat { 171 | case "json": 172 | return displayJSONResults(filename, result) 173 | case "github": 174 | return displayGitHubResults(filename, result) 175 | default: 176 | return displayDefaultResults(filename, result, config) 177 | } 178 | } 179 | 180 | func displayDefaultResults(filename string, result *linter.Result, config *linter.Config) error { 181 | if len(result.Issues) == 0 { 182 | if config.Verbose { 183 | fmt.Printf("✓ %s: No issues found\n", filename) 184 | } 185 | return nil 186 | } 187 | 188 | fmt.Printf("%s:\n", filename) 189 | for _, issue := range result.Issues { 190 | status := "✗" 191 | if issue.Fixed { 192 | status = "✓" 193 | } 194 | 195 | fmt.Printf(" %s Line %d: %s (%s)\n", 196 | status, issue.Line, issue.Message, issue.Rule) 197 | 198 | if config.Verbose && issue.Context != "" { 199 | fmt.Printf(" Context: %s\n", issue.Context) 200 | } 201 | } 202 | 203 | if config.AutoFix && result.FixedCount > 0 { 204 | fmt.Printf(" Fixed %d issues\n", result.FixedCount) 205 | } 206 | 207 | return nil 208 | } 209 | 210 | func displayJSONResults(filename string, result *linter.Result) error { 211 | output := map[string]interface{}{ 212 | "filename": result.Filename, 213 | "issues": result.Issues, 214 | "fixed_count": result.FixedCount, 215 | } 216 | 217 | data, err := json.MarshalIndent(output, "", " ") 218 | if err != nil { 219 | return err 220 | } 221 | 222 | fmt.Println(string(data)) 223 | return nil 224 | } 225 | 226 | func displayGitHubResults(filename string, result *linter.Result) error { 227 | // GitHub Actions workflow commands format 228 | for _, issue := range result.Issues { 229 | level := "error" 230 | if issue.Fixed { 231 | level = "notice" 232 | } 233 | 234 | fmt.Printf("::%s file=%s,line=%d::%s (%s)\n", 235 | level, filename, issue.Line, issue.Message, issue.Rule) 236 | } 237 | return nil 238 | } 239 | 240 | func init() { 241 | lintCmd.Flags().BoolVar(&autoFix, 
"fix", false, "Automatically fix issues where possible") 242 | lintCmd.Flags().StringVar(&outputFormat, "format", "default", "Output format: default, json, github") 243 | lintCmd.Flags().StringVar(&rulesFile, "config", "", "Path to markdownlint configuration file") 244 | lintCmd.Flags().StringSliceVar(&enableRules, "enable", []string{}, "Enable specific rules (comma-separated)") 245 | lintCmd.Flags().StringSliceVar(&disableRules, "disable", []string{}, "Disable specific rules (comma-separated)") 246 | lintCmd.Flags().BoolVar(&initConfig, "init", false, "Create a default .markdownlint.json configuration file") 247 | lintCmd.Flags().StringVar(&configOutput, "init-config", "", "Path for the configuration file when using --init (default: .markdownlint.json)") 248 | 249 | lintCmd.GroupID = "core" 250 | } 251 | -------------------------------------------------------------------------------- /cmd/upload.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/samzong/mdctl/internal/config" 8 | "github.com/samzong/mdctl/internal/uploader" 9 | "github.com/spf13/cobra" 10 | ) 11 | 12 | var ( 13 | // Upload command flags 14 | uploadSourceFile string 15 | uploadSourceDir string 16 | uploadProvider string 17 | uploadBucket string 18 | uploadCustomDomain string 19 | uploadPathPrefix string 20 | uploadDryRun bool 21 | uploadConcurrency int 22 | uploadForceUpload bool 23 | uploadSkipVerify bool 24 | uploadCACertPath string 25 | uploadConflictPolicy string 26 | uploadCacheDir string 27 | uploadIncludeExts string 28 | uploadStorageName string 29 | 30 | uploadCmd = &cobra.Command{ 31 | Use: "upload", 32 | Short: "Upload local images in markdown files to cloud storage", 33 | Long: `Upload local images in markdown files to cloud storage and rewrite URLs. 34 | Supports multiple cloud storage providers with S3-compatible APIs. 
35 | 36 | Examples: 37 | mdctl upload -d docs/ 38 | mdctl upload -f post.md 39 | mdctl upload -f post.md --storage my-s3`, 40 | RunE: func(cmd *cobra.Command, args []string) error { 41 | if uploadSourceFile == "" && uploadSourceDir == "" { 42 | return fmt.Errorf("either source file (-f) or source directory (-d) must be specified") 43 | } 44 | if uploadSourceFile != "" && uploadSourceDir != "" { 45 | return fmt.Errorf("cannot specify both source file (-f) and source directory (-d)") 46 | } 47 | 48 | // Load configuration file first 49 | cfg, err := config.LoadConfig() 50 | if err != nil { 51 | return fmt.Errorf("failed to load config: %v", err) 52 | } 53 | 54 | // Get active cloud storage configuration 55 | cloudConfig := cfg.GetActiveCloudConfig(uploadStorageName) 56 | 57 | // Command line parameters take precedence over configuration 58 | if uploadProvider == "" { 59 | uploadProvider = cloudConfig.Provider 60 | } 61 | 62 | if uploadBucket == "" { 63 | uploadBucket = cloudConfig.Bucket 64 | } 65 | 66 | // Check for empty values after using configuration file values 67 | if uploadProvider == "" { 68 | return fmt.Errorf("provider (-p) must be specified or set in configuration file") 69 | } 70 | 71 | if uploadBucket == "" { 72 | return fmt.Errorf("bucket (-b) must be specified or set in configuration file") 73 | } 74 | 75 | // Set default region for S3-compatible services 76 | // If region is not set or empty, set default region 77 | if cloudConfig.Region == "" { 78 | switch strings.ToLower(uploadProvider) { 79 | case "s3": 80 | // For AWS S3, default to us-east-1 81 | cloudConfig.Region = "us-east-1" 82 | case "r2", "minio", "b2": 83 | // For S3-compatible services, region can be any value but must be provided 84 | cloudConfig.Region = "auto" 85 | } 86 | } 87 | 88 | // If not specified in command line, get other configuration parameters 89 | if uploadCustomDomain == "" { 90 | uploadCustomDomain = cloudConfig.CustomDomain 91 | } 92 | 93 | if uploadPathPrefix == "" { 
94 | uploadPathPrefix = cloudConfig.PathPrefix 95 | } 96 | 97 | if uploadConcurrency == 5 && cloudConfig.Concurrency != 0 { // 5 is default value 98 | uploadConcurrency = cloudConfig.Concurrency 99 | } 100 | 101 | if uploadCACertPath == "" { 102 | uploadCACertPath = cloudConfig.CACertPath 103 | } 104 | 105 | if uploadSkipVerify == false && cloudConfig.SkipVerify { 106 | uploadSkipVerify = true 107 | } 108 | 109 | if uploadConflictPolicy == "rename" && cloudConfig.ConflictPolicy != "" { 110 | uploadConflictPolicy = cloudConfig.ConflictPolicy 111 | } 112 | 113 | if uploadCacheDir == "" { 114 | uploadCacheDir = cloudConfig.CacheDir 115 | } 116 | 117 | // Parse include extensions 118 | var exts []string 119 | if uploadIncludeExts != "" { 120 | exts = strings.Split(uploadIncludeExts, ",") 121 | for i, ext := range exts { 122 | exts[i] = strings.TrimSpace(ext) 123 | } 124 | } 125 | 126 | // Validate conflict policy 127 | var conflictPolicy uploader.ConflictPolicy 128 | switch strings.ToLower(uploadConflictPolicy) { 129 | case "rename": 130 | conflictPolicy = uploader.ConflictPolicyRename 131 | case "version": 132 | conflictPolicy = uploader.ConflictPolicyVersion 133 | case "overwrite": 134 | conflictPolicy = uploader.ConflictPolicyOverwrite 135 | case "": 136 | conflictPolicy = uploader.ConflictPolicyRename // Default 137 | default: 138 | return fmt.Errorf("invalid conflict policy: %s (must be rename, version, or overwrite)", uploadConflictPolicy) 139 | } 140 | 141 | // For R2, use account ID from configuration file 142 | if strings.ToLower(uploadProvider) == "r2" && cloudConfig.AccountID == "" { 143 | fmt.Printf("Note: R2 account ID not found in configuration, please set account_id in config file if you want to use r2.dev public URLs\n") 144 | } 145 | 146 | // Create uploader 147 | up, err := uploader.New(uploader.UploaderConfig{ 148 | SourceFile: uploadSourceFile, 149 | SourceDir: uploadSourceDir, 150 | Provider: uploadProvider, 151 | Bucket: uploadBucket, 152 | 
CustomDomain: uploadCustomDomain, 153 | PathPrefix: uploadPathPrefix, 154 | DryRun: uploadDryRun, 155 | Concurrency: uploadConcurrency, 156 | ForceUpload: uploadForceUpload, 157 | SkipVerify: uploadSkipVerify, 158 | CACertPath: uploadCACertPath, 159 | ConflictPolicy: conflictPolicy, 160 | CacheDir: uploadCacheDir, 161 | FileExtensions: exts, 162 | }) 163 | if err != nil { 164 | return fmt.Errorf("failed to create uploader: %v", err) 165 | } 166 | 167 | // Process files 168 | stats, err := up.Process() 169 | if err != nil { 170 | return fmt.Errorf("failed to process files: %v", err) 171 | } 172 | 173 | // Print statistics 174 | fmt.Printf("\nUpload Statistics:\n") 175 | fmt.Printf(" Total Files Processed: %d\n", stats.ProcessedFiles) 176 | fmt.Printf(" Images Uploaded: %d\n", stats.UploadedImages) 177 | fmt.Printf(" Images Skipped: %d\n", stats.SkippedImages) 178 | fmt.Printf(" Failed Uploads: %d\n", stats.FailedImages) 179 | fmt.Printf(" Files Changed: %d\n", stats.ChangedFiles) 180 | 181 | return nil 182 | }, 183 | } 184 | ) 185 | 186 | func init() { 187 | // Add flags 188 | uploadCmd.Flags().StringVarP(&uploadSourceFile, "file", "f", "", "Source markdown file to process") 189 | uploadCmd.Flags().StringVarP(&uploadSourceDir, "dir", "d", "", "Source directory containing markdown files to process") 190 | uploadCmd.Flags().StringVarP(&uploadProvider, "provider", "p", "", "Cloud storage provider (s3, r2, minio)") 191 | uploadCmd.Flags().StringVarP(&uploadBucket, "bucket", "b", "", "Cloud storage bucket name") 192 | uploadCmd.Flags().StringVarP(&uploadCustomDomain, "custom-domain", "c", "", "Custom domain for generated URLs") 193 | uploadCmd.Flags().StringVar(&uploadPathPrefix, "prefix", "", "Path prefix for uploaded files") 194 | uploadCmd.Flags().BoolVar(&uploadDryRun, "dry-run", false, "Preview changes without uploading") 195 | uploadCmd.Flags().IntVar(&uploadConcurrency, "concurrency", 5, "Number of concurrent uploads") 196 | 
uploadCmd.Flags().BoolVarP(&uploadForceUpload, "force", "F", false, "Force upload even if file exists") 197 | uploadCmd.Flags().BoolVar(&uploadSkipVerify, "skip-verify", false, "Skip SSL verification") 198 | uploadCmd.Flags().StringVar(&uploadCACertPath, "ca-cert", "", "Path to CA certificate") 199 | uploadCmd.Flags().StringVar(&uploadConflictPolicy, "conflict", "rename", "Conflict policy (rename, version, overwrite)") 200 | uploadCmd.Flags().StringVar(&uploadCacheDir, "cache-dir", "", "Cache directory path") 201 | uploadCmd.Flags().StringVar(&uploadIncludeExts, "include", "", "Comma-separated list of file extensions to include") 202 | uploadCmd.Flags().StringVar(&uploadStorageName, "storage", "", "Storage name to use") 203 | } 204 | -------------------------------------------------------------------------------- /docs/features/upload.md: -------------------------------------------------------------------------------- 1 | ## Design Document: Image Upload Feature for mdctl 2 | 3 | ### Overview 4 | 5 | Add a new feature to mdctl that uploads local images in markdown files to cloud storage services (S3-compatible APIs like Cloudflare R2, AWS S3, etc.) and rewrites the URLs in the markdown content. 6 | 7 | ### Goals 8 | 9 | 1. Upload local images to cloud storage services 10 | 2. Support multiple storage providers with S3-compatible APIs 11 | 3. Rewrite image URLs in markdown files to point to the cloud storage 12 | 4. Maintain the existing design patterns and code structure 13 | 5. Implement idempotent operations with content verification 14 | 6. Support concurrent uploads for performance optimization 15 | 7. Handle custom SSL certificates for various cloud providers 16 | 17 | ### Architecture 18 | 19 | Following the existing architecture pattern of mdctl, the upload feature will be implemented with these components: 20 | 21 | #### 1. 
Command Layer (`cmd/upload.go`) 22 | 23 | - Define CLI parameters: 24 | - Source file/directory (`-f/--file` or `-d/--dir`) 25 | - Cloud provider (`-p/--provider`) 26 | - Bucket name (`-b/--bucket`) 27 | - Custom domain (optional, `-c/--custom-domain`) 28 | - Path prefix (optional, `--prefix`) 29 | - File extensions to include (optional, `--include`) 30 | - Dry run mode (optional, `--dry-run`) 31 | - Concurrency level (optional, `--concurrency`) 32 | - Force upload (optional, `-F/--force`) 33 | - Skip SSL verification (optional, `--skip-verify`) 34 | - CA certificate path (optional, `--ca-cert`) 35 | - Conflict policy (optional, `--conflict=rename|version|overwrite`) 36 | - Cache directory (optional, `--cache-dir`) 37 | 38 | - Validate input parameters 39 | - Create and configure uploader component 40 | - Add to the "core" command group alongside download and translate 41 | 42 | #### 2. Uploader Module (`internal/uploader/uploader.go`) 43 | 44 | - Core business logic for uploading files 45 | - Methods for: 46 | - Processing single files or directories recursively 47 | - Identifying local images in markdown 48 | - Uploading files to cloud storage 49 | - Rewriting URLs in markdown content 50 | - Generating appropriate cloud storage paths 51 | - Managing the worker pool for concurrent uploads 52 | - Tracking upload progress with statistics 53 | - Calculating and verifying content hashes 54 | - Handling conflict resolution 55 | - Managing the local cache of uploaded files 56 | 57 | #### 3. 
Storage Provider Interface (`internal/storage/provider.go`) 58 | 59 | - Define a provider interface with methods: 60 | - `Upload(localPath, remotePath string, metadata map[string]string) (url string, err error)` 61 | - `Configure(config CloudConfig) error` 62 | - `GetPublicURL(remotePath string) string` 63 | - `ObjectExists(remotePath string) (bool, error)` 64 | - `CompareHash(remotePath, localHash string) (bool, error)` 65 | - `SetObjectMetadata(remotePath string, metadata map[string]string) error` 66 | - `GetObjectMetadata(remotePath string) (map[string]string, error)` 67 | 68 | #### 4. Storage Provider Implementations 69 | 70 | - S3-compatible provider (`internal/storage/s3.go`): 71 | - Implementation for AWS S3, Cloudflare R2, Minio, etc. 72 | - Configure region, endpoint, credentials 73 | - Handle authentication and uploads 74 | - Support custom certificates and SSL verification options 75 | - Implement content verification with ETag/MD5 hash comparison 76 | - Support object tagging for metadata 77 | 78 | #### 5. Cache Management (`internal/cache/cache.go`) 79 | 80 | - Maintain record of uploaded files with their hash values 81 | - Cache structure with file path, remote URL, and hash 82 | - Support for serializing/deserializing cache to disk 83 | - Methods for lookup, update, and verification 84 | 85 | #### 6. 
Configuration Extensions (`internal/config/config.go`) 86 | 87 | Add new configuration fields: 88 | ```go 89 | type CloudConfig struct { 90 | Provider string `json:"provider"` 91 | Region string `json:"region"` 92 | Endpoint string `json:"endpoint"` 93 | AccessKey string `json:"access_key"` 94 | SecretKey string `json:"secret_key"` 95 | Bucket string `json:"bucket"` 96 | CustomDomain string `json:"custom_domain,omitempty"` 97 | PathPrefix string `json:"path_prefix,omitempty"` 98 | ProviderOpts map[string]string `json:"provider_opts,omitempty"` 99 | Concurrency int `json:"concurrency"` 100 | SkipVerify bool `json:"skip_verify"` 101 | CACertPath string `json:"ca_cert_path,omitempty"` 102 | ConflictPolicy string `json:"conflict_policy"` 103 | CacheDir string `json:"cache_dir,omitempty"` 104 | } 105 | 106 | // Add to Config struct 107 | type Config struct { 108 | // Existing fields... 109 | CloudStorage CloudConfig `json:"cloud_storage"` 110 | } 111 | ``` 112 | 113 | ### Implementation Plan 114 | 115 | 1. Add cloud storage config section to config.go 116 | 2. Implement cache management module 117 | 3. Create storage provider interface 118 | 4. Implement S3-compatible provider with SSL handling 119 | 5. Create worker pool for concurrent uploads 120 | 6. Create uploader module implementation with verification logic 121 | 7. Implement idempotency and conflict resolution strategies 122 | 8. Add upload command to cmd package 123 | 9. Create comprehensive tests 124 | 10. Update help text and documentation 125 | 11. 
Add sample usage to README 126 | 127 | ### Command Usage Examples 128 | 129 | ```bash 130 | # Upload images from a single file 131 | mdctl upload -f path/to/file.md -p s3 -b my-bucket 132 | 133 | # Upload images from a directory 134 | mdctl upload -d path/to/dir -p r2 -b my-images --prefix blog/ 135 | 136 | # Use with a custom domain 137 | mdctl upload -f post.md -p s3 -b media-bucket -c assets.example.com 138 | 139 | # Use custom concurrency setting 140 | mdctl upload -f blog-post.md -p s3 -b my-bucket --concurrency 10 141 | 142 | # Force upload (bypass hash verification) 143 | mdctl upload -f readme.md -p r2 -b my-images -F 144 | 145 | # Specify conflict resolution strategy 146 | mdctl upload -d docs/ -p s3 -b media --conflict=version 147 | 148 | # Use custom SSL certificate 149 | mdctl upload -f doc.md -p s3 -b media --ca-cert /path/to/cert.pem 150 | 151 | # Skip SSL verification for self-signed certificates 152 | mdctl upload -f doc.md -p minio -b local --skip-verify 153 | 154 | # Configure cloud provider 155 | mdctl config set -k cloud_storage.provider -v "r2" 156 | mdctl config set -k cloud_storage.endpoint -v "https://xxxx.r2.cloudflarestorage.com" 157 | mdctl config set -k cloud_storage.access_key -v "YOUR_ACCESS_KEY" 158 | mdctl config set -k cloud_storage.secret_key -v "YOUR_SECRET_KEY" 159 | mdctl config set -k cloud_storage.bucket -v "my-images" 160 | mdctl config set -k cloud_storage.concurrency -v 5 161 | mdctl config set -k cloud_storage.conflict_policy -v "rename" 162 | ``` 163 | 164 | ### Technical Considerations 165 | 166 | 1. **S3 SDK**: Use the AWS SDK for Go to interact with S3-compatible APIs 167 | 2. **Image Processing**: Optional compression/resizing before upload 168 | 3. **Error Handling**: Provide detailed error messages for failed uploads 169 | 4. **URL Generation**: 170 | - Support both direct S3 URLs or custom domain URLs 171 | - Handle path prefixing correctly 172 | 5. 
**Idempotency & Verification**: 173 | - Calculate content hashes (MD5/SHA) for each file 174 | - Store metadata in the object tags for verification 175 | - Skip uploads for identical content (check hash before upload) 176 | - Optional force upload flag to override verification 177 | - Maintain a local cache of uploaded files with their hashes 178 | 6. **Concurrency & Reliability**: 179 | - Implement worker pool for parallel uploads 180 | - Configurable concurrency level (default: 5) 181 | - Progress tracking for concurrent operations 182 | - Built-in retry mechanism for failed uploads (hardcoded 3 retry attempts) 183 | - Exponential backoff between retries (starting at 1s, doubling each retry) 184 | - Standard timeout for upload operations 185 | 7. **SSL/Certificate Handling**: 186 | - Support custom CA certificates 187 | - Option to skip verification for self-signed certificates 188 | - Configurable TLS settings per provider 189 | 8. **Conflict Resolution**: 190 | - Strategies for handling name collisions (rename, version, overwrite) 191 | - Option to preserve original filenames or use hashed names 192 | 9. **Incremental Uploads**: 193 | - Track already uploaded files to avoid redundant operations 194 | - Support for resuming interrupted batch uploads 195 | 196 | ### Testing Strategy 197 | 198 | 1. Unit tests for URL parsing and rewriting 199 | 2. Mocked storage provider for testing upload logic 200 | 3. Verification tests for hash calculation and comparison 201 | 4. Concurrency tests to ensure worker pool functions correctly 202 | 5. SSL/TLS configuration tests with mock certificates 203 | 6. Cache management tests for serialization/deserialization 204 | 7. Conflict resolution strategy tests 205 | 8. Integration tests with a local MinIO server 206 | 9. End-to-end tests with actual markdown files 207 | 10. Idempotency tests to verify repeated executions 208 | 11. 
Performance benchmarks for concurrent uploads -------------------------------------------------------------------------------- /internal/exporter/sitereader/mkdocs.go: -------------------------------------------------------------------------------- 1 | package sitereader 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "path/filepath" 9 | "regexp" 10 | "strings" 11 | 12 | "gopkg.in/yaml.v3" 13 | ) 14 | 15 | type MkDocsReader struct { 16 | Logger *log.Logger 17 | } 18 | 19 | type MkDocsConfig struct { 20 | Docs []string `yaml:"nav"` 21 | DocsDir string `yaml:"docs_dir"` 22 | Inherit string `yaml:"INHERIT"` 23 | } 24 | 25 | func (r *MkDocsReader) Detect(dir string) bool { 26 | // Setting up the Logger 27 | if r.Logger == nil { 28 | r.Logger = log.New(io.Discard, "", 0) 29 | } 30 | 31 | // Check if mkdocs.yml file exists 32 | mkdocsPath := filepath.Join(dir, "mkdocs.yml") 33 | if _, err := os.Stat(mkdocsPath); os.IsNotExist(err) { 34 | // Try mkdocs.yaml 35 | mkdocsPath = filepath.Join(dir, "mkdocs.yaml") 36 | if _, err := os.Stat(mkdocsPath); os.IsNotExist(err) { 37 | r.Logger.Printf("No mkdocs.yml or mkdocs.yaml found in %s", dir) 38 | return false 39 | } 40 | } 41 | 42 | r.Logger.Printf("Found MkDocs configuration file: %s", mkdocsPath) 43 | return true 44 | } 45 | 46 | func (r *MkDocsReader) ReadStructure(dir string, configPath string, navPath string) ([]string, error) { 47 | // Setting up the Logger 48 | if r.Logger == nil { 49 | r.Logger = log.New(io.Discard, "", 0) 50 | } 51 | 52 | r.Logger.Printf("Reading MkDocs site structure from: %s", dir) 53 | if navPath != "" { 54 | r.Logger.Printf("Filtering by navigation path: %s", navPath) 55 | } 56 | 57 | // Find config file 58 | if configPath == "" { 59 | configNames := []string{"mkdocs.yml", "mkdocs.yaml"} 60 | var err error 61 | configPath, err = FindConfigFile(dir, configNames) 62 | if err != nil { 63 | r.Logger.Printf("Failed to find MkDocs config file: %s", err) 64 | return nil, fmt.Errorf("failed to 
// ReadStructure reads the MkDocs site structure rooted at dir and returns the
// ordered list of Markdown file paths to export.
//
// configPath may be empty, in which case mkdocs.yml / mkdocs.yaml is located
// inside dir. navPath optionally restricts the result to a "Section/Sub"
// slice of the nav tree. If the config has no `nav` key, every Markdown file
// under the docs directory is returned instead.
func (r *MkDocsReader) ReadStructure(dir string, configPath string, navPath string) ([]string, error) {
	// Fall back to a discarding logger so Printf calls are always safe.
	if r.Logger == nil {
		r.Logger = log.New(io.Discard, "", 0)
	}

	r.Logger.Printf("Reading MkDocs site structure from: %s", dir)
	if navPath != "" {
		r.Logger.Printf("Filtering by navigation path: %s", navPath)
	}

	// Locate the config file if the caller did not pin one explicitly.
	if configPath == "" {
		configNames := []string{"mkdocs.yml", "mkdocs.yaml"}
		var err error
		configPath, err = FindConfigFile(dir, configNames)
		if err != nil {
			r.Logger.Printf("Failed to find MkDocs config file: %s", err)
			return nil, fmt.Errorf("failed to find MkDocs config file: %s", err)
		}
	}
	r.Logger.Printf("Using config file: %s", configPath)

	// Read and parse the config file, resolving any INHERIT directive.
	config, err := r.readAndMergeConfig(configPath, dir)
	if err != nil {
		r.Logger.Printf("Failed to read config file: %s", err)
		return nil, fmt.Errorf("failed to read config file: %s", err)
	}

	// Resolve the docs directory; MkDocs defaults to "docs" when docs_dir
	// is absent or not a string.
	docsDir := "docs"
	if docsDirValue, ok := config["docs_dir"]; ok {
		if docsDirStr, ok := docsDirValue.(string); ok {
			docsDir = docsDirStr
		}
	}
	docsDir = filepath.Join(dir, docsDir)
	r.Logger.Printf("Using docs directory: %s", docsDir)

	// Without a `nav` key there is no ordering to honor: fall back to
	// collecting every Markdown file under docsDir.
	var nav interface{}
	if navValue, ok := config["nav"]; ok {
		nav = navValue
	} else {
		r.Logger.Println("No navigation configuration found, searching for all markdown files")
		return getAllMarkdownFiles(docsDir)
	}

	// Walk the nav tree (optionally filtered by navPath) to get the
	// ordered file list.
	files, err := parseNavigation(nav, docsDir, navPath)
	if err != nil {
		r.Logger.Printf("Failed to parse navigation: %s", err)
		return nil, fmt.Errorf("failed to parse navigation: %s", err)
	}

	r.Logger.Printf("Found %d files in navigation", len(files))
	return files, nil
}
var config map[string]interface{} 120 | if err := yaml.Unmarshal(configData, &config); err != nil { 121 | r.Logger.Printf("Failed to parse MkDocs config file: %s", err) 122 | return nil, fmt.Errorf("failed to parse MkDocs config file: %s", err) 123 | } 124 | 125 | // Check if there's an INHERIT directive 126 | inheritValue, hasInherit := config["INHERIT"] 127 | if !hasInherit { 128 | // No inherit, return current config 129 | return config, nil 130 | } 131 | 132 | // Handle INHERIT directive 133 | inheritPath, ok := inheritValue.(string) 134 | if !ok { 135 | r.Logger.Printf("Invalid INHERIT value, expected string but got: %T", inheritValue) 136 | return nil, fmt.Errorf("invalid INHERIT value, expected string") 137 | } 138 | 139 | r.Logger.Printf("Found INHERIT directive pointing to: %s", inheritPath) 140 | 141 | // Parse inherit path, may be relative to current config file 142 | configDir := filepath.Dir(configPath) 143 | inheritFullPath := filepath.Join(configDir, inheritPath) 144 | 145 | // Read inherited config file 146 | inheritConfig, err := r.readAndMergeConfig(inheritFullPath, baseDir) 147 | if err != nil { 148 | return nil, fmt.Errorf("failed to read inherited config file %s: %s", inheritFullPath, err) 149 | } 150 | 151 | // Merge config, current config takes precedence 152 | mergedConfig := make(map[string]interface{}) 153 | 154 | // Copy inherit config first 155 | for k, v := range inheritConfig { 156 | mergedConfig[k] = v 157 | } 158 | 159 | // Override current config 160 | for k, v := range config { 161 | if k != "INHERIT" { // Don't copy INHERIT directive 162 | mergedConfig[k] = v 163 | } 164 | } 165 | 166 | r.Logger.Printf("Successfully merged config with inherited file") 167 | return mergedConfig, nil 168 | } 169 | 170 | // preprocessMarkdownFile Preprocess Markdown file, remove YAML front matter that may cause problems 171 | func preprocessMarkdownFile(filePath string) error { 172 | // Read file content 173 | content, err := os.ReadFile(filePath) 
174 | if err != nil { 175 | return err 176 | } 177 | 178 | // Check if there's YAML front matter 179 | contentStr := string(content) 180 | yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`) 181 | 182 | // If there's YAML front matter, remove it 183 | if yamlFrontMatterRegex.MatchString(contentStr) { 184 | // Create temp file 185 | tempFile, err := os.CreateTemp("", "mdctl-*.md") 186 | if err != nil { 187 | return err 188 | } 189 | tempFilePath := tempFile.Name() 190 | tempFile.Close() 191 | 192 | // Remove YAML front matter 193 | processedContent := yamlFrontMatterRegex.ReplaceAllString(contentStr, "") 194 | 195 | // Write processed content to temp file 196 | if err := os.WriteFile(tempFilePath, []byte(processedContent), 0644); err != nil { 197 | os.Remove(tempFilePath) 198 | return err 199 | } 200 | 201 | // Replace original file 202 | if err := os.Rename(tempFilePath, filePath); err != nil { 203 | os.Remove(tempFilePath) 204 | return err 205 | } 206 | } 207 | 208 | return nil 209 | } 210 | 211 | // parseNavigation Parse MkDocs navigation structure 212 | func parseNavigation(nav interface{}, docsDir string, navPath string) ([]string, error) { 213 | var files []string 214 | 215 | switch v := nav.(type) { 216 | case []interface{}: 217 | // Navigation is a list 218 | for _, item := range v { 219 | itemFiles, err := parseNavigation(item, docsDir, navPath) 220 | if err != nil { 221 | return nil, err 222 | } 223 | files = append(files, itemFiles...) 224 | } 225 | case map[string]interface{}: 226 | // Navigation is a map 227 | for title, value := range v { 228 | // If nav path is specified, check if current node title matches 229 | if navPath != "" { 230 | // Support simple path matching, e.g. 
// parseNavigation Parse MkDocs navigation structure.
//
// nav is the decoded YAML `nav` value and may be a list, a map (section
// title -> children), or a plain string (a Markdown file path relative to
// docsDir). navPath optionally filters the tree: "Section1/Subsection2"
// descends one matched title per path component, then collects everything
// below the matched node.
//
// NOTE(review): map iteration order in Go is random, so sibling sections
// under one map would be emitted in nondeterministic order; in practice
// MkDocs nav entries decode as single-key maps, so this likely never
// triggers — confirm against real configs.
func parseNavigation(nav interface{}, docsDir string, navPath string) ([]string, error) {
	var files []string

	switch v := nav.(type) {
	case []interface{}:
		// Navigation is a list: recurse into each entry, keeping the
		// navPath filter active for every sibling.
		for _, item := range v {
			itemFiles, err := parseNavigation(item, docsDir, navPath)
			if err != nil {
				return nil, err
			}
			files = append(files, itemFiles...)
		}
	case map[string]interface{}:
		// Navigation is a map of section title -> children.
		for title, value := range v {
			// If nav path is specified, check if current node title matches.
			if navPath != "" {
				// Support simple path matching, e.g. "Section1/Subsection2".
				navParts := strings.Split(navPath, "/")
				if strings.TrimSpace(title) == strings.TrimSpace(navParts[0]) {
					if len(navParts) > 1 {
						// Multi-level path: keep matching the remainder
						// against this node's children.
						subNavPath := strings.Join(navParts[1:], "/")
						itemFiles, err := parseNavigation(value, docsDir, subNavPath)
						if err != nil {
							return nil, err
						}
						files = append(files, itemFiles...)
						continue
					} else {
						// Final path component matched: collect everything
						// below this node with the filter cleared.
						itemFiles, err := parseNavigation(value, docsDir, "")
						if err != nil {
							return nil, err
						}
						files = append(files, itemFiles...)
						continue
					}
				} else {
					// Title doesn't match the filter: skip this subtree.
					continue
				}
			}

			// No filter active (navPath == ""): collect normally.
			itemFiles, err := parseNavigation(value, docsDir, "")
			if err != nil {
				return nil, err
			}
			files = append(files, itemFiles...)
		}
	case string:
		// Navigation item is a file path relative to docsDir. Only .md
		// files that actually exist are collected. Note the navPath == ""
		// guard: a string reached while a filter is still pending is
		// deliberately dropped (filters only resolve at map nodes).
		if strings.HasSuffix(v, ".md") {
			filePath := filepath.Join(docsDir, v)
			if _, err := os.Stat(filePath); err == nil {
				if navPath == "" {
					files = append(files, filePath)
				}
			}
		}
	}

	return files, nil
}
263 | } 264 | case string: 265 | // Navigation item is a file path 266 | if strings.HasSuffix(v, ".md") { 267 | filePath := filepath.Join(docsDir, v) 268 | if _, err := os.Stat(filePath); err == nil { 269 | // If no nav path is specified or already handled in nav path filtering, add file 270 | if navPath == "" { 271 | files = append(files, filePath) 272 | } 273 | } 274 | } 275 | } 276 | 277 | return files, nil 278 | } 279 | 280 | // getAllMarkdownFiles Get all Markdown files in a directory 281 | func getAllMarkdownFiles(dir string) ([]string, error) { 282 | var files []string 283 | 284 | err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 285 | if err != nil { 286 | return err 287 | } 288 | if !info.IsDir() { 289 | ext := strings.ToLower(filepath.Ext(path)) 290 | if ext == ".md" || ext == ".markdown" { 291 | files = append(files, path) 292 | } 293 | } 294 | return nil 295 | }) 296 | 297 | if err != nil { 298 | return nil, fmt.Errorf("failed to walk directory %s: %s", dir, err) 299 | } 300 | 301 | return files, nil 302 | } 303 | -------------------------------------------------------------------------------- /internal/exporter/exporter.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "path/filepath" 9 | "sort" 10 | "strings" 11 | 12 | "github.com/samzong/mdctl/internal/exporter/sitereader" 13 | ) 14 | 15 | // ExportOptions defines export options 16 | type ExportOptions struct { 17 | Template string // Word template file path 18 | GenerateToc bool // Whether to generate table of contents 19 | ShiftHeadingLevelBy int // Heading level offset 20 | FileAsTitle bool // Whether to use filename as section title 21 | Format string // Output format (docx, pdf, epub) 22 | SiteType string // Site type (mkdocs, hugo, docusaurus) 23 | Verbose bool // Whether to enable verbose logging 24 | Logger *log.Logger // Logger 25 | SourceDirs 
[]string // List of source directories for processing image paths 26 | TocDepth int // Table of contents depth, default is 3 27 | NavPath string // Specified navigation path to export 28 | } 29 | 30 | // Exporter defines exporter interface 31 | type Exporter interface { 32 | Export(input string, output string, options ExportOptions) error 33 | } 34 | 35 | // DefaultExporter is the default exporter implementation 36 | type DefaultExporter struct { 37 | pandocPath string 38 | logger *log.Logger 39 | } 40 | 41 | // NewExporter creates a new exporter 42 | func NewExporter() *DefaultExporter { 43 | return &DefaultExporter{ 44 | pandocPath: "pandoc", // Default to pandoc in system PATH 45 | logger: log.New(os.Stdout, "[EXPORTER] ", log.LstdFlags), 46 | } 47 | } 48 | 49 | // ExportFile exports a single Markdown file 50 | func (e *DefaultExporter) ExportFile(input, output string, options ExportOptions) error { 51 | // Set logger 52 | if options.Logger != nil { 53 | e.logger = options.Logger 54 | } else if !options.Verbose { 55 | e.logger = log.New(io.Discard, "", 0) 56 | } 57 | 58 | e.logger.Printf("Exporting file: %s -> %s", input, output) 59 | 60 | // Check if file exists 61 | if _, err := os.Stat(input); os.IsNotExist(err) { 62 | e.logger.Printf("Error: input file does not exist: %s", input) 63 | return fmt.Errorf("input file does not exist: %s", input) 64 | } 65 | e.logger.Printf("Input file exists: %s", input) 66 | 67 | // Create output directory (if it doesn't exist) 68 | outputDir := filepath.Dir(output) 69 | if err := os.MkdirAll(outputDir, 0755); err != nil { 70 | e.logger.Printf("Error: failed to create output directory: %s", err) 71 | return fmt.Errorf("failed to create output directory: %s", err) 72 | } 73 | e.logger.Printf("Output directory created/verified: %s", outputDir) 74 | 75 | // Add source directory to SourceDirs 76 | sourceDir := filepath.Dir(input) 77 | if options.SourceDirs == nil { 78 | options.SourceDirs = []string{sourceDir} 79 | } else { 80 | // 
// ExportDirectory exports the Markdown files under inputDir into a single
// output document. With a SiteType other than "" or "basic" the matching
// site reader supplies the ordered file list; otherwise files are collected
// recursively and sorted by name. Multiple files are merged into a
// temporary file before Pandoc conversion.
func (e *DefaultExporter) ExportDirectory(inputDir, output string, options ExportOptions) error {
	// Honor a caller-supplied logger; otherwise stay silent unless verbose.
	if options.Logger != nil {
		e.logger = options.Logger
	} else if !options.Verbose {
		e.logger = log.New(io.Discard, "", 0)
	}

	e.logger.Printf("Exporting directory: %s -> %s", inputDir, output)

	// Check if directory exists.
	if _, err := os.Stat(inputDir); os.IsNotExist(err) {
		e.logger.Printf("Error: input directory does not exist: %s", inputDir)
		return fmt.Errorf("input directory does not exist: %s", inputDir)
	}
	e.logger.Printf("Input directory exists: %s", inputDir)

	// Create output directory (if it doesn't exist).
	outputDir := filepath.Dir(output)
	if err := os.MkdirAll(outputDir, 0755); err != nil {
		e.logger.Printf("Error: failed to create output directory: %s", err)
		return fmt.Errorf("failed to create output directory: %s", err)
	}
	e.logger.Printf("Output directory created/verified: %s", outputDir)

	// Register inputDir as a Pandoc resource path, deduplicated.
	if options.SourceDirs == nil {
		options.SourceDirs = []string{inputDir}
	} else {
		found := false
		for _, dir := range options.SourceDirs {
			if dir == inputDir {
				found = true
				break
			}
		}
		if !found {
			options.SourceDirs = append(options.SourceDirs, inputDir)
		}
	}
	e.logger.Printf("Added input directory to resource paths: %s", inputDir)

	// Depending on site type, choose different processing.
	var files []string
	var err error

	if options.SiteType != "" && options.SiteType != "basic" {
		// Use site reader to get file list.
		e.logger.Printf("Using site reader for site type: %s", options.SiteType)
		// Note: `:=` shadows the outer err inside this branch; harmless
		// because each err is checked immediately after assignment.
		reader, err := sitereader.GetSiteReader(options.SiteType, options.Verbose, e.logger)
		if err != nil {
			e.logger.Printf("Error getting site reader: %s", err)
			return err
		}

		// Detect if it's the specified type of site.
		e.logger.Printf("Detecting if directory is a %s site...", options.SiteType)
		if !reader.Detect(inputDir) {
			e.logger.Printf("Error: directory %s does not appear to be a %s site", inputDir, options.SiteType)
			return fmt.Errorf("directory %s does not appear to be a %s site", inputDir, options.SiteType)
		}
		e.logger.Printf("Directory confirmed as %s site", options.SiteType)

		e.logger.Println("Reading site structure...")
		files, err = reader.ReadStructure(inputDir, "", options.NavPath)
		if err != nil {
			e.logger.Printf("Error reading site structure: %s", err)
			return err
		}
		e.logger.Printf("Found %d files in site structure", len(files))
	} else {
		// Basic directory mode: sort files by name.
		e.logger.Println("Using basic directory mode, sorting files by name")
		files, err = GetMarkdownFilesInDir(inputDir)
		if err != nil {
			e.logger.Printf("Error getting markdown files: %s", err)
			return err
		}
		e.logger.Printf("Found %d markdown files in directory", len(files))
	}

	if len(files) == 0 {
		e.logger.Printf("Error: no markdown files found in directory: %s", inputDir)
		return fmt.Errorf("no markdown files found in directory: %s", inputDir)
	}

	// A single file needs no merging; export it directly.
	if len(files) == 1 {
		e.logger.Printf("Only one file found, exporting directly: %s", files[0])
		return e.ExportFile(files[0], output, options)
	}

	// Merge multiple files.
	e.logger.Printf("Merging %d files...", len(files))
	merger := &Merger{
		ShiftHeadingLevelBy: options.ShiftHeadingLevelBy,
		FileAsTitle:         options.FileAsTitle,
		Logger:              e.logger,
		SourceDirs:          make([]string, 0),
		Verbose:             options.Verbose,
	}

	// Create a temporary file to hold the merged Markdown; removed on return.
	e.logger.Println("Creating temporary file for merged content...")
	tempFile, err := os.CreateTemp("", "mdctl-merged-*.md")
	if err != nil {
		e.logger.Printf("Error creating temporary file: %s", err)
		return fmt.Errorf("failed to create temporary file: %s", err)
	}
	tempFilePath := tempFile.Name()
	tempFile.Close()
	defer os.Remove(tempFilePath)
	e.logger.Printf("Temporary file created: %s", tempFilePath)

	// Merge files.
	e.logger.Println("Merging files...")
	if err := merger.Merge(files, tempFilePath); err != nil {
		e.logger.Printf("Error merging files: %s", err)
		return fmt.Errorf("failed to merge files: %s", err)
	}
	e.logger.Println("Files merged successfully")

	// Fold the source directories the merger discovered into the options,
	// deduplicated against what is already there.
	if merger.SourceDirs != nil && len(merger.SourceDirs) > 0 {
		e.logger.Printf("Adding %d source directories from merger", len(merger.SourceDirs))
		for _, dir := range merger.SourceDirs {
			found := false
			for _, existingDir := range options.SourceDirs {
				if existingDir == dir {
					found = true
					break
				}
			}
			if !found {
				options.SourceDirs = append(options.SourceDirs, dir)
				e.logger.Printf("Added source directory: %s", dir)
			}
		}
	}

	// Export merged file via Pandoc.
	e.logger.Println("Starting Pandoc export process...")
	pandocExporter := &PandocExporter{
		PandocPath: e.pandocPath,
		Logger:     e.logger,
	}
	err = pandocExporter.Export(tempFilePath, output, options)
	if err != nil {
		e.logger.Printf("Pandoc export failed: %s", err)
		return err
	}

	e.logger.Printf("Directory export completed successfully: %s", output)
	return nil
}
| if existingDir == dir { 242 | found = true 243 | break 244 | } 245 | } 246 | if !found { 247 | options.SourceDirs = append(options.SourceDirs, dir) 248 | e.logger.Printf("Added source directory: %s", dir) 249 | } 250 | } 251 | } 252 | 253 | // Export merged file 254 | e.logger.Println("Starting Pandoc export process...") 255 | pandocExporter := &PandocExporter{ 256 | PandocPath: e.pandocPath, 257 | Logger: e.logger, 258 | } 259 | err = pandocExporter.Export(tempFilePath, output, options) 260 | if err != nil { 261 | e.logger.Printf("Pandoc export failed: %s", err) 262 | return err 263 | } 264 | 265 | e.logger.Printf("Directory export completed successfully: %s", output) 266 | return nil 267 | } 268 | 269 | // SiteReader defines site reader interface 270 | type SiteReader interface { 271 | // Detect if given directory is this type of site 272 | Detect(dir string) bool 273 | // Read site structure, return sorted list of files 274 | ReadStructure(dir string, configPath string) ([]string, error) 275 | } 276 | 277 | // GetMarkdownFilesInDir gets all Markdown files in a directory and sorts them by filename 278 | func GetMarkdownFilesInDir(dir string) ([]string, error) { 279 | // Check if directory exists 280 | info, err := os.Stat(dir) 281 | if err != nil { 282 | return nil, err 283 | } 284 | if !info.IsDir() { 285 | return nil, fmt.Errorf("%s is not a directory", dir) 286 | } 287 | 288 | // Recursively find all Markdown files 289 | var files []string 290 | err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 291 | if err != nil { 292 | return err 293 | } 294 | if !info.IsDir() { 295 | ext := strings.ToLower(filepath.Ext(path)) 296 | if ext == ".md" || ext == ".markdown" { 297 | files = append(files, path) 298 | } 299 | } 300 | return nil 301 | }) 302 | 303 | if err != nil { 304 | return nil, fmt.Errorf("failed to walk directory %s: %s", dir, err) 305 | } 306 | 307 | // Sort by filename 308 | sort.Strings(files) 309 | 310 | return files, nil 
// Merger merges multiple Markdown files into one document.
type Merger struct {
	ShiftHeadingLevelBy int
	FileAsTitle         bool
	Logger              *log.Logger
	// Store all source directories, used to set Pandoc's resource paths
	SourceDirs []string
	// Whether to enable verbose logging
	Verbose bool
}

// Merge concatenates the given source Markdown files into the target file.
//
// Per file it: deduplicates the file's directory into m.SourceDirs; decodes
// GBK content that is not valid UTF-8; strips YAML front matter; rewrites
// relative image paths; optionally shifts heading levels and prepends the
// filename as a title; then appends the result (files are separated by a
// blank line). The combined content is sanitized once before being written
// to target with mode 0644.
func (m *Merger) Merge(sources []string, target string) error {
	// If no logger is provided, create a default one (stdout when verbose,
	// otherwise discard everything).
	if m.Logger == nil {
		if m.Verbose {
			m.Logger = log.New(os.Stdout, "[MERGER] ", log.LstdFlags)
		} else {
			m.Logger = log.New(io.Discard, "", 0)
		}
	}

	if len(sources) == 0 {
		m.Logger.Println("Error: no source files provided")
		return fmt.Errorf("no source files provided")
	}

	m.Logger.Printf("Merging %d files into: %s", len(sources), target)
	var mergedContent strings.Builder

	// Initialize source directory list; the map is only used for
	// deduplication, order is preserved in the slice.
	m.SourceDirs = make([]string, 0, len(sources))
	sourceDirsMap := make(map[string]bool) // Used for deduplication

	// Process each source file in the given order.
	for i, source := range sources {
		m.Logger.Printf("Processing file %d/%d: %s", i+1, len(sources), source)

		// Record the source file's directory (deduplicated).
		sourceDir := filepath.Dir(source)
		if !sourceDirsMap[sourceDir] {
			sourceDirsMap[sourceDir] = true
			m.SourceDirs = append(m.SourceDirs, sourceDir)
		}

		// Read file content.
		content, err := os.ReadFile(source)
		if err != nil {
			m.Logger.Printf("Error reading file %s: %s", source, err)
			return fmt.Errorf("failed to read file %s: %s", source, err)
		}

		processedContent := string(content)

		// Content that is not valid UTF-8 is assumed to be GBK-encoded
		// and is transcoded to UTF-8; a decode failure aborts the merge.
		if !utf8.ValidString(processedContent) {
			m.Logger.Printf("File %s contains invalid UTF-8, attempting to convert from GBK", source)
			reader := transform.NewReader(bytes.NewReader(content), simplifiedchinese.GBK.NewDecoder())
			decodedContent, err := io.ReadAll(reader)
			if err != nil {
				m.Logger.Printf("Failed to decode content from file %s: %s", source, err)
				return fmt.Errorf("failed to decode content from file %s: %s", source, err)
			}
			processedContent = string(decodedContent)
			m.Logger.Printf("Successfully converted content from GBK to UTF-8")
		}

		// Remove YAML front matter.
		m.Logger.Println("Removing YAML front matter...")
		processedContent = removeYAMLFrontMatter(processedContent)

		// Rewrite image paths relative to the source file's location.
		m.Logger.Println("Processing image paths...")
		processedContent, err = processImagePaths(processedContent, source, m.Logger, m.Verbose)
		if err != nil {
			m.Logger.Printf("Error processing image paths: %s", err)
			return fmt.Errorf("failed to process image paths: %s", err)
		}

		// Adjust heading levels.
		if m.ShiftHeadingLevelBy != 0 {
			m.Logger.Printf("Shifting heading levels by %d", m.ShiftHeadingLevelBy)
			processedContent = ShiftHeadings(processedContent, m.ShiftHeadingLevelBy)
		}

		// Add filename as title; the title level accounts for the shift
		// applied above so it stays one level above the shifted headings.
		if m.FileAsTitle {
			filename := filepath.Base(source)
			m.Logger.Printf("Adding filename as title: %s", filename)
			processedContent = AddTitleFromFilename(processedContent, filename, 1+m.ShiftHeadingLevelBy)
		}

		// Append to the merged result.
		m.Logger.Printf("Adding processed content to merged result (length: %d bytes)", len(processedContent))
		mergedContent.WriteString(processedContent)

		// Blank-line separator between files (not after the last one).
		if i < len(sources)-1 {
			mergedContent.WriteString("\n\n")
		}
	}

	finalContent := mergedContent.String()

	// Final sanitation pass over the whole merged document.
	m.Logger.Println("Sanitizing final content...")
	finalContent = sanitizeContent(finalContent)

	// Write target file.
	m.Logger.Printf("Writing merged content to target file: %s (size: %d bytes)", target, len(finalContent))
	err := os.WriteFile(target, []byte(finalContent), 0644)
	if err != nil {
		m.Logger.Printf("Error writing merged content: %s", err)
		return fmt.Errorf("failed to write merged content to %s: %s", target, err)
	}

	m.Logger.Printf("Successfully merged %d files into: %s", len(sources), target)
	return nil
}
{ 164 | logger.Printf("Current working directory = %s", workingDir) 165 | } 166 | 167 | // Get absolute path of source file's directory 168 | absSourceDir, err := filepath.Abs(sourceDir) 169 | if err != nil { 170 | return "", fmt.Errorf("unable to get absolute path of source file's directory: %v", err) 171 | } 172 | if verbose { 173 | logger.Printf("Source file's directory absolute path = %s", absSourceDir) 174 | } 175 | 176 | // Match Markdown image syntax: ![alt](path) 177 | imageRegex := regexp.MustCompile(`!\[(.*?)\]\((.*?)\)`) 178 | 179 | // Replace all image paths 180 | processedContent := imageRegex.ReplaceAllStringFunc(content, func(match string) string { 181 | // Extract image path 182 | submatches := imageRegex.FindStringSubmatch(match) 183 | if len(submatches) < 3 { 184 | return match // If match is incorrect, keep as-is 185 | } 186 | 187 | altText := submatches[1] 188 | imagePath := submatches[2] 189 | if verbose { 190 | logger.Printf("Found image: alt = %s, path = %s", altText, imagePath) 191 | } 192 | 193 | // If image is a web image (starts with http:// or https://), keep as-is 194 | if strings.HasPrefix(imagePath, "http://") || strings.HasPrefix(imagePath, "https://") { 195 | if verbose { 196 | logger.Printf("Keeping web image path: %s", imagePath) 197 | } 198 | return match 199 | } 200 | 201 | // Parse image's absolute path 202 | var absoluteImagePath string 203 | if filepath.IsAbs(imagePath) { 204 | absoluteImagePath = imagePath 205 | } else { 206 | // For relative paths, convert to absolute path first 207 | absoluteImagePath = filepath.Join(absSourceDir, imagePath) 208 | } 209 | if verbose { 210 | logger.Printf("Image path: relative path = %s, absolute path = %s", imagePath, absoluteImagePath) 211 | } 212 | 213 | // Check if image file exists 214 | if _, err := os.Stat(absoluteImagePath); os.IsNotExist(err) { 215 | if verbose { 216 | logger.Printf("Image does not exist: %s", absoluteImagePath) 217 | } 218 | // Image does not exist, try to find it 
in adjacent directories 219 | // For example, if path is ../images/image.png, try to find it in the images subdirectory of the parent directory of the source file's directory 220 | if strings.HasPrefix(imagePath, "../") { 221 | parentDir := filepath.Dir(absSourceDir) 222 | relPath := strings.TrimPrefix(imagePath, "../") 223 | alternativePath := filepath.Join(parentDir, relPath) 224 | if verbose { 225 | logger.Printf("Trying alternative path: %s", alternativePath) 226 | } 227 | if _, err := os.Stat(alternativePath); err == nil { 228 | absoluteImagePath = alternativePath 229 | if verbose { 230 | logger.Printf("Found image in alternative path: %s", absoluteImagePath) 231 | } 232 | } else { 233 | // Still not found, keep as-is 234 | if verbose { 235 | logger.Printf("Image does not exist in alternative path: %s", alternativePath) 236 | } 237 | return match 238 | } 239 | } else { 240 | // Image not found, keep as-is 241 | return match 242 | } 243 | } 244 | 245 | // Calculate image's path relative to current working directory 246 | relPath, err := filepath.Rel(workingDir, absoluteImagePath) 247 | if err != nil { 248 | if verbose { 249 | logger.Printf("Unable to calculate relative path, keeping original path: %s, error: %v", imagePath, err) 250 | } 251 | return match 252 | } 253 | 254 | // Update image reference with path relative to current working directory 255 | newRef := fmt.Sprintf("![%s](%s)", altText, relPath) 256 | if verbose { 257 | logger.Printf("Updating image reference: %s -> %s", match, newRef) 258 | } 259 | return newRef 260 | }) 261 | 262 | return processedContent, nil 263 | } 264 | 265 | // removeYAMLFrontMatter Remove YAML front matter 266 | func removeYAMLFrontMatter(content string) string { 267 | // Match YAML front matter 268 | yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`) 269 | return yamlFrontMatterRegex.ReplaceAllString(content, "") 270 | } 271 | 272 | // sanitizeContent Clean content, removing content that may cause 
Pandoc parsing errors 273 | func sanitizeContent(content string) string { 274 | // Remove lines that may cause YAML parsing errors 275 | lines := strings.Split(content, "\n") 276 | var cleanedLines []string 277 | 278 | for _, line := range lines { 279 | // Skip lines that may cause YAML parsing errors 280 | if strings.Contains(line, ":") && !strings.Contains(line, ": ") && !strings.HasPrefix(line, " ") && !strings.HasPrefix(line, "\t") { 281 | // In this case, there should be a space after the colon, but there isn't, which may cause YAML parsing errors 282 | // Try to fix it 283 | fixedLine := strings.Replace(line, ":", ": ", 1) 284 | cleanedLines = append(cleanedLines, fixedLine) 285 | } else if strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "- ") && len(line) > 1 { 286 | // In this case, there should be a space after the dash, but there isn't, which may cause YAML parsing errors 287 | // Try to fix it 288 | fixedLine := strings.Replace(line, "-", "- ", 1) 289 | cleanedLines = append(cleanedLines, fixedLine) 290 | } else { 291 | cleanedLines = append(cleanedLines, line) 292 | } 293 | } 294 | 295 | return strings.Join(cleanedLines, "\n") 296 | } 297 | -------------------------------------------------------------------------------- /internal/exporter/pandoc.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "os/exec" 9 | "path/filepath" 10 | "regexp" 11 | "strings" 12 | ) 13 | 14 | // PandocExporter Use Pandoc to export Markdown files 15 | type PandocExporter struct { 16 | PandocPath string 17 | Logger *log.Logger 18 | } 19 | 20 | // Export Use Pandoc to export Markdown files 21 | func (e *PandocExporter) Export(input, output string, options ExportOptions) error { 22 | // If no logger is provided, create a default one 23 | if e.Logger == nil { 24 | if options.Verbose { 25 | e.Logger = log.New(os.Stdout, "[PANDOC] ", log.LstdFlags) 26 | } else { 
27 | e.Logger = log.New(io.Discard, "", 0) 28 | } 29 | } 30 | 31 | e.Logger.Printf("Starting Pandoc export: %s -> %s", input, output) 32 | 33 | // Ensure output path is absolute 34 | absOutput, err := filepath.Abs(output) 35 | if err != nil { 36 | e.Logger.Printf("Failed to get absolute path for output: %s", err) 37 | return fmt.Errorf("failed to get absolute path for output: %s", err) 38 | } 39 | e.Logger.Printf("Using absolute output path: %s", absOutput) 40 | 41 | // Create a temporary file for sanitized content 42 | e.Logger.Println("Creating sanitized copy of input file...") 43 | tempFile, err := createSanitizedCopy(input, e.Logger) 44 | if err != nil { 45 | e.Logger.Printf("Failed to create sanitized copy: %s", err) 46 | return fmt.Errorf("failed to create sanitized copy: %s", err) 47 | } 48 | defer os.Remove(tempFile) 49 | e.Logger.Printf("Sanitized copy created: %s", tempFile) 50 | 51 | // Build Pandoc command arguments 52 | e.Logger.Println("Building Pandoc command arguments...") 53 | args := []string{ 54 | tempFile, 55 | "-o", absOutput, 56 | "--standalone", 57 | "--pdf-engine=xelatex", 58 | "-V", "mainfont=SimSun", // Use SimSun as the main font 59 | "--wrap=preserve", 60 | "--embed-resources", // Embed resources into output file 61 | } 62 | 63 | // Add resource path parameters, helping Pandoc find images 64 | // Collect all possible resource paths 65 | resourcePaths := make(map[string]bool) 66 | 67 | // Add input file directory 68 | inputDir := filepath.Dir(input) 69 | resourcePaths[inputDir] = true 70 | e.Logger.Printf("Added input file directory to resource paths: %s", inputDir) 71 | 72 | // Add current working directory 73 | workingDir, err := os.Getwd() 74 | if err == nil { 75 | resourcePaths[workingDir] = true 76 | e.Logger.Printf("Added current working directory to resource paths: %s", workingDir) 77 | } 78 | 79 | // Add output file directory 80 | outputDir := filepath.Dir(absOutput) 81 | resourcePaths[outputDir] = true 82 | e.Logger.Printf("Added 
output file directory to resource paths: %s", outputDir) 83 | 84 | // Add source file directories to resource paths 85 | if len(options.SourceDirs) > 0 { 86 | for _, dir := range options.SourceDirs { 87 | resourcePaths[dir] = true 88 | e.Logger.Printf("Added source file directory to resource paths: %s", dir) 89 | } 90 | } 91 | 92 | // Add all resource paths to Pandoc arguments 93 | for path := range resourcePaths { 94 | args = append(args, "--resource-path", path) 95 | } 96 | 97 | // Add template parameter 98 | if options.Template != "" { 99 | e.Logger.Printf("Using template: %s", options.Template) 100 | args = append(args, "--reference-doc", options.Template) 101 | } 102 | 103 | // Add directory parameter 104 | if options.GenerateToc { 105 | e.Logger.Println("Generating table of contents") 106 | args = append(args, "--toc") 107 | 108 | // Add directory depth parameter 109 | if options.TocDepth > 0 { 110 | e.Logger.Printf("Setting table of contents depth to: %d", options.TocDepth) 111 | args = append(args, "--toc-depth", fmt.Sprintf("%d", options.TocDepth)) 112 | } 113 | } 114 | 115 | // Add heading level offset parameter 116 | if options.ShiftHeadingLevelBy != 0 { 117 | e.Logger.Printf("Shifting heading levels by: %d", options.ShiftHeadingLevelBy) 118 | args = append(args, "--shift-heading-level-by", fmt.Sprintf("%d", options.ShiftHeadingLevelBy)) 119 | } 120 | 121 | // Add specific parameters based on output format 122 | e.Logger.Printf("Using output format: %s", options.Format) 123 | switch options.Format { 124 | case "pdf": 125 | // PDF format needs special handling for Chinese 126 | e.Logger.Println("Adding PDF-specific parameters for CJK support") 127 | args = append(args, 128 | "-V", "CJKmainfont=SimSun", // CJK font settings 129 | "-V", "documentclass=article", 130 | "-V", "geometry=margin=1in") 131 | case "epub": 132 | // EPUB format specific parameters 133 | e.Logger.Println("Adding EPUB-specific parameters") 134 | args = append(args, 
"--epub-chapter-level=1") 135 | } 136 | 137 | // Execute Pandoc command 138 | e.Logger.Printf("Executing Pandoc command: %s %s", e.PandocPath, strings.Join(args, " ")) 139 | cmd := exec.Command(e.PandocPath, args...) 140 | 141 | // Set working directory to input file directory, which helps Pandoc find relative paths for images 142 | cmd.Dir = inputDir 143 | 144 | outputBytes, err := cmd.CombinedOutput() 145 | if err != nil { 146 | // If execution fails, try to look at input file content for debugging 147 | e.Logger.Printf("Pandoc execution failed: %s", err) 148 | e.Logger.Printf("Pandoc output: %s", string(outputBytes)) 149 | 150 | inputContent, readErr := os.ReadFile(tempFile) 151 | if readErr == nil { 152 | // Only show the first 500 characters to avoid too much output 153 | contentPreview := string(inputContent) 154 | if len(contentPreview) > 500 { 155 | contentPreview = contentPreview[:500] + "..." 156 | } 157 | e.Logger.Printf("Input file preview:\n%s", contentPreview) 158 | return fmt.Errorf("pandoc execution failed: %s\nOutput: %s\nCommand: %s\nInput file preview:\n%s", 159 | err, string(outputBytes), strings.Join(cmd.Args, " "), contentPreview) 160 | } 161 | 162 | return fmt.Errorf("pandoc execution failed: %s\nOutput: %s\nCommand: %s", 163 | err, string(outputBytes), strings.Join(cmd.Args, " ")) 164 | } 165 | 166 | e.Logger.Printf("Pandoc export completed successfully: %s", output) 167 | return nil 168 | } 169 | 170 | // createSanitizedCopy Create a sanitized temporary file copy 171 | func createSanitizedCopy(inputFile string, logger *log.Logger) (string, error) { 172 | if logger == nil { 173 | logger = log.New(io.Discard, "", 0) 174 | } 175 | 176 | // Read input file content 177 | logger.Printf("Reading input file: %s", inputFile) 178 | content, err := os.ReadFile(inputFile) 179 | if err != nil { 180 | return "", fmt.Errorf("failed to read input file: %s", err) 181 | } 182 | 183 | // Convert content to string 184 | contentStr := string(content) 185 | 186 
| // Remove YAML front matter 187 | logger.Println("Removing YAML front matter...") 188 | yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`) 189 | if yamlFrontMatterRegex.MatchString(contentStr) { 190 | logger.Println("YAML front matter found, removing it") 191 | contentStr = yamlFrontMatterRegex.ReplaceAllString(contentStr, "") 192 | } 193 | 194 | // Fix lines that may cause YAML parsing errors 195 | logger.Println("Fixing potential YAML parsing issues...") 196 | lines := strings.Split(contentStr, "\n") 197 | var cleanedLines []string 198 | fixedLines := 0 199 | 200 | for _, line := range lines { 201 | // Skip lines that may cause YAML parsing errors 202 | if strings.Contains(line, ":") && !strings.Contains(line, ": ") && !strings.HasPrefix(line, " ") && !strings.HasPrefix(line, "\t") { 203 | // In this case, there should be a space after the colon, but there isn't, which may cause YAML parsing errors 204 | // Try to fix it 205 | fixedLine := strings.Replace(line, ":", ": ", 1) 206 | cleanedLines = append(cleanedLines, fixedLine) 207 | fixedLines++ 208 | logger.Printf("Fixed line with missing space after colon: %s -> %s", line, fixedLine) 209 | } else if strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "- ") && len(line) > 1 { 210 | // In this case, there should be a space after the dash, but there isn't, which may cause YAML parsing errors 211 | // Try to fix it 212 | fixedLine := strings.Replace(line, "-", "- ", 1) 213 | cleanedLines = append(cleanedLines, fixedLine) 214 | fixedLines++ 215 | logger.Printf("Fixed line with missing space after dash: %s -> %s", line, fixedLine) 216 | } else { 217 | cleanedLines = append(cleanedLines, line) 218 | } 219 | } 220 | 221 | logger.Printf("Fixed %d lines with potential YAML issues", fixedLines) 222 | 223 | // Create a temporary file 224 | tempDir := os.TempDir() 225 | tempFilePath := filepath.Join(tempDir, "mdctl-sanitized-"+filepath.Base(inputFile)) 226 | 227 | // Write sanitized content to 
temporary file 228 | logger.Printf("Writing sanitized content to temporary file: %s", tempFilePath) 229 | err = os.WriteFile(tempFilePath, []byte(strings.Join(cleanedLines, "\n")), 0644) 230 | if err != nil { 231 | return "", err 232 | } 233 | 234 | return tempFilePath, nil 235 | } 236 | 237 | // preprocessInputFile Preprocess input file, removing content that may cause Pandoc parsing errors 238 | func preprocessInputFile(filePath string) error { 239 | // Read file content 240 | content, err := os.ReadFile(filePath) 241 | if err != nil { 242 | return err 243 | } 244 | 245 | contentStr := string(content) 246 | 247 | // Check for unconventional YAML front matter 248 | yamlFrontMatterRegex := regexp.MustCompile(`(?s)^---\s*\n(.*?)\n---\s*\n`) 249 | if yamlFrontMatterRegex.MatchString(contentStr) { 250 | // Extract YAML front matter content 251 | matches := yamlFrontMatterRegex.FindStringSubmatch(contentStr) 252 | if len(matches) > 1 { 253 | yamlContent := matches[1] 254 | 255 | // Check if YAML content has formatting issues 256 | if strings.Contains(yamlContent, "\n-") && !strings.Contains(yamlContent, "\n- ") { 257 | // Fix formatting issue: ensure there's a space after the dash 258 | fixedYaml := strings.ReplaceAll(yamlContent, "\n-", "\n- ") 259 | fixedContent := strings.Replace(contentStr, yamlContent, fixedYaml, 1) 260 | 261 | // Write back to file 262 | return os.WriteFile(filePath, []byte(fixedContent), 0644) 263 | } 264 | } 265 | 266 | // If YAML format has other issues, remove entire front matter 267 | processedContent := yamlFrontMatterRegex.ReplaceAllString(contentStr, "") 268 | return os.WriteFile(filePath, []byte(processedContent), 0644) 269 | } 270 | 271 | return nil 272 | } 273 | 274 | // CheckPandocAvailability Check if Pandoc is available 275 | func CheckPandocAvailability() error { 276 | cmd := exec.Command("pandoc", "--version") 277 | outputBytes, err := cmd.CombinedOutput() 278 | if err != nil { 279 | return fmt.Errorf("pandoc is not available: 
%s\n\nPlease install Pandoc to use the export feature:\n\n"+ 280 | "macOS: brew install pandoc\n"+ 281 | "Ubuntu/Debian: sudo apt-get install pandoc\n"+ 282 | "Windows: choco install pandoc\n\n"+ 283 | "For more information, visit: https://pandoc.org/installing.html", err) 284 | } 285 | 286 | // Check version 287 | versionStr := string(outputBytes) 288 | if !strings.Contains(versionStr, "pandoc") { 289 | return fmt.Errorf("unexpected pandoc version output: %s", versionStr) 290 | } 291 | 292 | return nil 293 | } 294 | -------------------------------------------------------------------------------- /internal/translator/translator.go: -------------------------------------------------------------------------------- 1 | package translator 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "io" 8 | "net/http" 9 | "os" 10 | "path/filepath" 11 | "regexp" 12 | "sort" 13 | "strings" 14 | 15 | "github.com/samzong/mdctl/internal/config" 16 | "github.com/samzong/mdctl/internal/markdownfmt" 17 | "gopkg.in/yaml.v3" 18 | ) 19 | 20 | // SupportedLanguages defines the mapping of supported languages 21 | var SupportedLanguages = map[string]string{ 22 | "zh": "中文", 23 | "en": "English", 24 | "ja": "日本語", 25 | "ko": "한국어", 26 | "fr": "Français", 27 | "de": "Deutsch", 28 | "es": "Español", 29 | "it": "Italiano", 30 | "ru": "Русский", 31 | "pt": "Português", 32 | "vi": "Tiếng Việt", 33 | "th": "ไทย", 34 | "ar": "العربية", 35 | "hi": "हिन्दी", 36 | } 37 | 38 | // IsLanguageSupported checks if the language is supported 39 | func IsLanguageSupported(lang string) bool { 40 | _, ok := SupportedLanguages[lang] 41 | return ok 42 | } 43 | 44 | // GetSupportedLanguages returns a list of supported languages 45 | func GetSupportedLanguages() string { 46 | var langs []string 47 | for code, name := range SupportedLanguages { 48 | langs = append(langs, fmt.Sprintf("%s (%s)", code, name)) 49 | } 50 | sort.Strings(langs) 51 | return strings.Join(langs, ", ") 52 | } 53 | 54 | type 
OpenAIMessage struct { 55 | Role string `json:"role"` 56 | Content string `json:"content"` 57 | } 58 | 59 | type OpenAIRequest struct { 60 | Model string `json:"model"` 61 | Messages []OpenAIMessage `json:"messages"` 62 | Temperature float64 `json:"temperature"` 63 | TopP float64 `json:"top_p"` 64 | } 65 | 66 | type OpenAIResponse struct { 67 | Choices []struct { 68 | Message struct { 69 | Content string `json:"content"` 70 | } `json:"message"` 71 | } `json:"choices"` 72 | } 73 | 74 | // Progress is used to track translation progress 75 | type Progress struct { 76 | Total int 77 | Current int 78 | SourceFile string 79 | TargetFile string 80 | } 81 | 82 | // ProgressCallback defines the progress callback function type 83 | type ProgressCallback func(progress Progress) 84 | 85 | // Translator struct for the translator 86 | type Translator struct { 87 | config *config.Config 88 | format bool 89 | progress ProgressCallback 90 | } 91 | 92 | // New creates a new translator instance 93 | func New(cfg *config.Config, format bool) *Translator { 94 | return &Translator{ 95 | config: cfg, 96 | format: format, 97 | progress: func(p Progress) { 98 | if p.Total > 1 { 99 | fmt.Printf("Translating file [%d/%d]: %s\n", p.Current, p.Total, p.SourceFile) 100 | } 101 | }, 102 | } 103 | } 104 | 105 | var ( 106 | // RegexPatterns defines patterns for removing special content blocks 107 | RegexPatterns = []struct { 108 | Pattern string 109 | Replace string 110 | }{ 111 | {`(?s).*?\n?`, ""}, // Remove ollama deepthink thinking process 112 | } 113 | ) 114 | 115 | // TranslateContent translates the content 116 | func (t *Translator) TranslateContent(content string, lang string) (string, error) { 117 | // Remove potential front matter 118 | content = removeFrontMatter(content) 119 | 120 | prompt := strings.Replace(t.config.TranslatePrompt, "{TARGET_LANG}", lang, 1) 121 | 122 | messages := []OpenAIMessage{ 123 | {Role: "system", Content: prompt}, 124 | {Role: "user", Content: content}, 125 | 
} 126 | 127 | reqBody := OpenAIRequest{ 128 | Model: t.config.ModelName, 129 | Messages: messages, 130 | Temperature: t.config.Temperature, 131 | TopP: t.config.TopP, 132 | } 133 | 134 | jsonData, err := json.Marshal(reqBody) 135 | if err != nil { 136 | return "", fmt.Errorf("failed to marshal request: %v", err) 137 | } 138 | 139 | req, err := http.NewRequest("POST", t.config.OpenAIEndpointURL+"/chat/completions", bytes.NewBuffer(jsonData)) 140 | if err != nil { 141 | return "", fmt.Errorf("failed to create request: %v", err) 142 | } 143 | 144 | req.Header.Set("Content-Type", "application/json") 145 | req.Header.Set("Authorization", "Bearer "+t.config.OpenAIAPIKey) 146 | 147 | client := &http.Client{} 148 | resp, err := client.Do(req) 149 | if err != nil { 150 | return "", fmt.Errorf("failed to send request: %v", err) 151 | } 152 | defer resp.Body.Close() 153 | 154 | body, err := io.ReadAll(resp.Body) 155 | if err != nil { 156 | return "", fmt.Errorf("failed to read response: %v", err) 157 | } 158 | 159 | var response OpenAIResponse 160 | if err := json.Unmarshal(body, &response); err != nil { 161 | return "", fmt.Errorf("failed to parse response: %v\nResponse body: %s", err, string(body)) 162 | } 163 | 164 | if len(response.Choices) == 0 { 165 | return "", fmt.Errorf("no translation result\nResponse body: %s", string(body)) 166 | } 167 | 168 | // Get translated content 169 | translatedContent := response.Choices[0].Message.Content 170 | 171 | // Remove special content blocks 172 | for _, pattern := range RegexPatterns { 173 | translatedContent = regexp.MustCompile(pattern.Pattern).ReplaceAllString(translatedContent, pattern.Replace) 174 | } 175 | 176 | // Remove potential markdown code block markers 177 | translatedContent = strings.TrimPrefix(translatedContent, "\n") 178 | 179 | // If formatting is enabled, format the translated content 180 | if t.format { 181 | formatter := markdownfmt.New(true) 182 | translatedContent = formatter.Format(translatedContent) 183 | 
} 184 | 185 | return translatedContent, nil 186 | } 187 | 188 | // removeFrontMatter removes front matter from content 189 | func removeFrontMatter(content string) string { 190 | // If content starts with ---, it may contain front matter 191 | trimmedContent := strings.TrimSpace(content) 192 | if strings.HasPrefix(trimmedContent, "---") { 193 | parts := strings.SplitN(trimmedContent, "---", 3) 194 | if len(parts) >= 3 { 195 | return strings.TrimSpace(parts[2]) 196 | } 197 | } 198 | return content 199 | } 200 | 201 | // ProcessFile handles translation of a single file 202 | func ProcessFile(srcPath, dstPath, targetLang string, cfg *config.Config, format bool, force bool) error { 203 | t := New(cfg, format) 204 | 205 | // Check if target path is a directory 206 | dstInfo, err := os.Stat(dstPath) 207 | if err == nil && dstInfo.IsDir() { 208 | dstPath = filepath.Join(dstPath, filepath.Base(srcPath)) 209 | } 210 | 211 | // Check if target file already exists 212 | if _, err := os.Stat(dstPath); err == nil { 213 | dstContent, err := os.ReadFile(dstPath) 214 | if err != nil { 215 | return fmt.Errorf("failed to read target file: %v", err) 216 | } 217 | 218 | // Check if already translated 219 | var dstFrontMatter map[string]interface{} 220 | if strings.HasPrefix(string(dstContent), "---\n") { 221 | parts := strings.SplitN(string(dstContent)[4:], "\n---\n", 2) 222 | if len(parts) == 2 { 223 | if err := yaml.Unmarshal([]byte(parts[0]), &dstFrontMatter); err != nil { 224 | return fmt.Errorf("failed to parse target file front matter: %v", err) 225 | } 226 | if translated, ok := dstFrontMatter["translated"].(bool); ok && translated { 227 | if !force { 228 | fmt.Printf("Skipping %s (already translated, use -F to force translate)\n", srcPath) 229 | return nil 230 | } 231 | fmt.Printf("Force translating %s\n", srcPath) 232 | } 233 | } 234 | } 235 | } 236 | 237 | // Read source file content 238 | content, err := os.ReadFile(srcPath) 239 | if err != nil { 240 | return 
fmt.Errorf("failed to read source file: %v", err) 241 | } 242 | 243 | // Parse front matter 244 | var frontMatter map[string]interface{} 245 | contentToTranslate := string(content) 246 | 247 | // Check and parse front matter 248 | if strings.HasPrefix(contentToTranslate, "---\n") { 249 | parts := strings.SplitN(contentToTranslate[4:], "\n---\n", 2) 250 | if len(parts) == 2 { 251 | if err := yaml.Unmarshal([]byte(parts[0]), &frontMatter); err != nil { 252 | return fmt.Errorf("failed to parse front matter: %v", err) 253 | } 254 | contentToTranslate = parts[1] 255 | } 256 | } 257 | 258 | // Translate content 259 | translatedContent, err := t.TranslateContent(contentToTranslate, targetLang) 260 | if err != nil { 261 | return fmt.Errorf("failed to translate content: %v", err) 262 | } 263 | 264 | // Update front matter 265 | if frontMatter == nil { 266 | frontMatter = make(map[string]interface{}) 267 | } 268 | frontMatter["translated"] = true 269 | 270 | // Generate new file content 271 | frontMatterBytes, err := yaml.Marshal(frontMatter) 272 | if err != nil { 273 | return fmt.Errorf("failed to marshal front matter: %v", err) 274 | } 275 | 276 | newContent := fmt.Sprintf("---\n%s---\n\n%s", string(frontMatterBytes), translatedContent) 277 | 278 | // Create target directory if it doesn't exist 279 | if err := os.MkdirAll(filepath.Dir(dstPath), 0755); err != nil { 280 | return fmt.Errorf("failed to create target directory: %v", err) 281 | } 282 | 283 | // Write translated content to target file 284 | if err := os.WriteFile(dstPath, []byte(newContent), 0644); err != nil { 285 | return fmt.Errorf("failed to write target file: %v", err) 286 | } 287 | 288 | return nil 289 | } 290 | 291 | // ProcessDirectory processes all markdown files in the directory 292 | func ProcessDirectory(srcDir, dstDir string, targetLang string, cfg *config.Config, force bool, format bool) error { 293 | // First calculate the total number of files to process 294 | var total int 295 | err := 
filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error { 296 | if err != nil { 297 | return err 298 | } 299 | if !info.IsDir() && filepath.Ext(path) == ".md" { 300 | total++ 301 | } 302 | return nil 303 | }) 304 | if err != nil { 305 | return fmt.Errorf("failed to count files: %v", err) 306 | } 307 | 308 | fmt.Printf("Found %d markdown files to translate\n", total) 309 | 310 | // Create translator instance 311 | t := New(cfg, format) 312 | current := 0 313 | 314 | // Walk through source directory 315 | return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error { 316 | if err != nil { 317 | return err 318 | } 319 | 320 | // Skip directories 321 | if info.IsDir() { 322 | return nil 323 | } 324 | 325 | // Only process markdown files 326 | ext := filepath.Ext(path) 327 | if ext != ".md" { 328 | return nil 329 | } 330 | 331 | current++ 332 | 333 | // Get relative path 334 | relPath, err := filepath.Rel(srcDir, path) 335 | if err != nil { 336 | return fmt.Errorf("failed to get relative path: %v", err) 337 | } 338 | 339 | var dstPath string 340 | if dstDir == "" { 341 | // If target directory is empty, create translation file in source directory 342 | dir := filepath.Dir(path) 343 | base := filepath.Base(path) 344 | nameWithoutExt := strings.TrimSuffix(base, ext) 345 | dstPath = filepath.Join(dir, nameWithoutExt+"_"+targetLang+ext) 346 | } else { 347 | // If a different target directory is specified, use the specified directory structure 348 | dstPath = filepath.Join(dstDir, relPath) 349 | } 350 | 351 | t.progress(Progress{ 352 | Total: total, 353 | Current: current, 354 | SourceFile: path, 355 | TargetFile: dstPath, 356 | }) 357 | 358 | // Process file 359 | if err := ProcessFile(path, dstPath, targetLang, cfg, format, force); err != nil { 360 | return fmt.Errorf("failed to process file %s: %v", path, err) 361 | } 362 | 363 | return nil 364 | }) 365 | } 366 | 
--------------------------------------------------------------------------------