├── pkg
│   ├── utils
│   │   ├── hash.go
│   │   ├── regex.go
│   │   ├── sanitize.go
│   │   ├── tree.go
│   │   └── errors.go
│   ├── parse
│   │   ├── sitemap.go
│   │   └── normalize.go
│   ├── log
│   │   └── adapter.go
│   ├── storage
│   │   ├── interface.go
│   │   └── badger_store.go
│   ├── fetch
│   │   ├── client.go
│   │   ├── ratelimit.go
│   │   ├── robots.go
│   │   └── fetcher.go
│   ├── models
│   │   └── models.go
│   ├── queue
│   │   └── priority_queue.go
│   ├── config
│   │   └── config.go
│   ├── process
│   │   ├── links.go
│   │   ├── content.go
│   │   └── image.go
│   └── sitemap
│       └── processor.go
├── .gitignore
├── go.mod
├── compare.py
├── config.yaml
├── LICENSE
├── go.sum
├── README.md
└── cmd
    └── crawler
        └── main.go

/pkg/utils/hash.go:
--------------------------------------------------------------------------------
1 | package utils
2 | 
3 | import (
4 | 	"crypto/sha256"
5 | 	"encoding/hex"
6 | 	"io"
7 | 	"os"
8 | )
9 | 
10 | // CalculateFileMD5 computes the SHA-256 hash of a file's content (despite its name, MD5 is not used).
11 | func CalculateFileMD5(filePath string) (string, error) {
12 | 	file, err := os.Open(filePath)
13 | 	if err != nil {
14 | 		return "", err
15 | 	}
16 | 	defer file.Close()
17 | 
18 | 	hash := sha256.New()
19 | 	if _, err := io.Copy(hash, file); err != nil {
20 | 		return "", err
21 | 	}
22 | 	return hex.EncodeToString(hash.Sum(nil)), nil
23 | }
24 | 
25 | // CalculateStringMD5 computes the SHA-256 hash of a string (despite its name, MD5 is not used).
26 | func CalculateStringMD5(content string) string {
27 | 	hash := sha256.New()
28 | 	hash.Write([]byte(content))
29 | 	return hex.EncodeToString(hash.Sum(nil))
30 | }
31 | 
--------------------------------------------------------------------------------
/pkg/utils/regex.go:
--------------------------------------------------------------------------------
1 | package utils
2 | 
3 | import (
4 | 	"regexp"
5 | )
6 | 
7 | // CompileRegexPatterns compiles regex strings into usable *regexp.Regexp objects.
8 | // Returns an error if any pattern is invalid.
9 | func CompileRegexPatterns(patterns []string) ([]*regexp.Regexp, error) {
10 | 	compiled := make([]*regexp.Regexp, 0, len(patterns))
11 | 	for i, pattern := range patterns {
12 | 		if pattern == "" { // Skip empty patterns silently
13 | 			continue
14 | 		}
15 | 		re, err := regexp.Compile(pattern)
16 | 		if err != nil {
17 | 			// Return a specific error including the pattern index and content,
18 | 			// wrapped in the config validation sentinel error.
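			// (Errors wrapped this way still satisfy errors.Is(err, ErrConfigValidation), so
			// CategorizeError in pkg/utils/errors.go reports them as "Config_Validation".)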
19 | 			return nil, WrapErrorf(ErrConfigValidation, "invalid regex pattern #%d ('%s')", i+1, pattern)
20 | 		}
21 | 		compiled = append(compiled, re)
22 | 	}
23 | 	return compiled, nil
24 | }
25 | 
--------------------------------------------------------------------------------
/pkg/parse/sitemap.go:
--------------------------------------------------------------------------------
1 | package parse
2 | 
3 | import "encoding/xml"
4 | 
5 | // --- XML Structs for Sitemap Parsing ---
6 | 
7 | // XMLURL represents a <url> element in a sitemap
8 | type XMLURL struct {
9 | 	Loc     string `xml:"loc"`
10 | 	LastMod string `xml:"lastmod,omitempty"`
11 | }
12 | 
13 | // XMLURLSet represents a <urlset> element in a sitemap
14 | type XMLURLSet struct {
15 | 	XMLName xml.Name `xml:"urlset"`
16 | 	URLs    []XMLURL `xml:"url"`
17 | }
18 | 
19 | // XMLSitemap represents a <sitemap> element in a sitemap index file
20 | type XMLSitemap struct {
21 | 	Loc     string `xml:"loc"`
22 | 	LastMod string `xml:"lastmod,omitempty"`
23 | }
24 | 
25 | // XMLSitemapIndex represents a <sitemapindex> element
26 | type XMLSitemapIndex struct {
27 | 	XMLName  xml.Name     `xml:"sitemapindex"`
28 | 	Sitemaps []XMLSitemap `xml:"sitemap"`
29 | }
30 | 
--------------------------------------------------------------------------------
/pkg/log/adapter.go:
--------------------------------------------------------------------------------
1 | package log
2 | 
3 | import "github.com/sirupsen/logrus"
4 | 
5 | // BadgerLogrusAdapter implements badger.Logger interface using logrus
6 | type BadgerLogrusAdapter struct {
7 | 	*logrus.Entry // Embed logrus Entry
8 | }
9 | 
10 | // NewBadgerLogrusAdapter creates a new adapter
11 | func NewBadgerLogrusAdapter(entry *logrus.Entry) *BadgerLogrusAdapter {
12 | 	return &BadgerLogrusAdapter{entry}
13 | }
14 | 
15 | // Errorf logs an error message
16 | func (l *BadgerLogrusAdapter) Errorf(f string, v ...interface{}) { l.Entry.Errorf(f, v...) }
17 | 
18 | // Warningf logs a warning message
19 | func (l *BadgerLogrusAdapter) Warningf(f string, v ...interface{}) { l.Entry.Warningf(f, v...) }
20 | 
21 | // Infof logs an info message
22 | func (l *BadgerLogrusAdapter) Infof(f string, v ...interface{}) { l.Entry.Infof(f, v...) }
23 | 
24 | // Debugf logs a debug message
25 | func (l *BadgerLogrusAdapter) Debugf(f string, v ...interface{}) { l.Entry.Debugf(f, v...) }
26 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Go dependencies (managed by go mod)
2 | vendor/
3 | 
4 | # Go build outputs
5 | *.exe
6 | *.exe~
7 | *.dll
8 | *.so
9 | *.dylib
10 | *.test
11 | *.prof
12 | 
13 | # Output directory for crawled documents (from config.yaml)
14 | crawled_docs/
15 | 
16 | # State directory (from config.yaml)
17 | crawler_state/
18 | 
19 | archive/
20 | tuning.md
21 | 
22 | # Visited log file (if -write-visited-log is used)
23 | *-visited.txt
24 | 
25 | # OS generated files
26 | .DS_Store
27 | .DS_Store?
28 | ._*
29 | .Spotlight-V100
30 | .Trashes
31 | ehthumbs.db
32 | Thumbs.db
33 | 
34 | # IDE / Editor specific files
35 | .idea/
36 | .vscode/
37 | *.sublime-project
38 | *.sublime-workspace
39 | *.suo
40 | *.ntvs*
41 | *.njsproj
42 | *.sln
43 | *.sw?
44 | 45 | # Log files (if you redirect output) 46 | *.log 47 | 48 | # Profile data files 49 | *.pprof 50 | cpu.pprof 51 | mem.pprof 52 | mutex.pprof 53 | block.pprof 54 | 55 | # Coverage profile 56 | coverage.out 57 | coverage.html 58 | 59 | # Environment variables file (if used) 60 | .env -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module doc-scraper 2 | 3 | go 1.24.2 4 | 5 | require ( 6 | github.com/JohannesKaufmann/html-to-markdown v1.6.0 7 | github.com/PuerkitoBio/goquery v1.10.3 8 | github.com/dgraph-io/badger/v4 v4.7.0 9 | github.com/sirupsen/logrus v1.9.3 10 | github.com/temoto/robotstxt v1.1.2 11 | golang.org/x/sync v0.13.0 12 | gopkg.in/yaml.v3 v3.0.1 13 | ) 14 | 15 | require ( 16 | github.com/andybalholm/cascadia v1.3.3 // indirect 17 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 18 | github.com/dgraph-io/ristretto/v2 v2.2.0 // indirect 19 | github.com/dustin/go-humanize v1.0.1 // indirect 20 | github.com/go-logr/logr v1.4.2 // indirect 21 | github.com/go-logr/stdr v1.2.2 // indirect 22 | github.com/google/flatbuffers v25.2.10+incompatible // indirect 23 | github.com/klauspost/compress v1.18.0 // indirect 24 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 25 | go.opentelemetry.io/otel v1.35.0 // indirect 26 | go.opentelemetry.io/otel/metric v1.35.0 // indirect 27 | go.opentelemetry.io/otel/trace v1.35.0 // indirect 28 | golang.org/x/net v0.39.0 // indirect 29 | golang.org/x/sys v0.32.0 // indirect 30 | google.golang.org/protobuf v1.36.6 // indirect 31 | ) 32 | -------------------------------------------------------------------------------- /pkg/utils/sanitize.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | ) 7 | 8 | // --- Filename Sanitization --- 9 | var invalidFilenameChars = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1F]`) // Characters invalid in Windows/Unix filenames 10 | var consecutiveUnderscores = regexp.MustCompile(`_+`) // Pattern to replace multiple underscores with one 11 | const maxFilenameLength = 100 // Max length for sanitized filenames 12 | 13 | // SanitizeFilename cleans a string to be safe for use as a filename component 14 | func SanitizeFilename(name string) string { 15 | sanitized := invalidFilenameChars.ReplaceAllString(name, "_") // Replace invalid chars with underscore 16 | sanitized = consecutiveUnderscores.ReplaceAllString(sanitized, "_") // Collapse multiple underscores 17 | sanitized = strings.Trim(sanitized, "_ ") // Remove leading/trailing underscores or spaces 18 | 19 | // Limit filename length (considering multi-byte characters) 20 | if len(sanitized) > maxFilenameLength { 21 | // Simple truncation by byte length is usually sufficient for sanitization purposes 22 | sanitized = sanitized[:maxFilenameLength] 23 | // Trim again in case truncation created leading/trailing underscores 24 | sanitized = strings.Trim(sanitized, "_ ") 25 | } 26 | 27 | if sanitized == "" { // Handle cases where sanitization results in an empty string 28 | sanitized = "untitled" // Provide a default name 29 | } 30 | return sanitized 31 | } 32 | -------------------------------------------------------------------------------- /pkg/parse/normalize.go: -------------------------------------------------------------------------------- 1 | package parse 2 | 3 | import ( 4 | "net" 5 | "net/url" 6 | "strings" 7 | ) 8 | 9 | // NormalizeURL standardizes 
a URL for comparison and storage 10 | // It lowercases the scheme and host, removes default ports (80 for http, 443 for https), removes trailing slashes from paths (unless root "/"), ensures empty path becomes "/", and removes fragments and query strings 11 | // Does not modify the input *url.URL 12 | func NormalizeURL(u *url.URL) string { 13 | if u == nil { 14 | return "" 15 | } 16 | // Work on a copy 17 | normalized := *u 18 | 19 | normalized.Scheme = strings.ToLower(normalized.Scheme) 20 | normalized.Host = strings.ToLower(normalized.Host) 21 | 22 | // Remove default ports 23 | host, port, err := net.SplitHostPort(normalized.Host) 24 | if err == nil { // Host included a port 25 | if (normalized.Scheme == "http" && port == "80") || 26 | (normalized.Scheme == "https" && port == "443") { 27 | normalized.Host = host // Use hostname without default port 28 | } 29 | } // If no port or error, Host remains unchanged 30 | 31 | // Handle path normalization 32 | if normalized.Path == "" { 33 | normalized.Path = "/" // Ensure empty path becomes "/" 34 | } else if len(normalized.Path) > 1 && strings.HasSuffix(normalized.Path, "/") { 35 | normalized.Path = normalized.Path[:len(normalized.Path)-1] // Remove trailing slash 36 | } 37 | 38 | normalized.Fragment = "" // Remove fragment 39 | normalized.RawQuery = "" // Remove query string 40 | 41 | return normalized.String() 42 | } 43 | 44 | // ParseAndNormalize parses a URL string using the stricter url.ParseRequestURI (requiring a scheme) and then normalizes it using NormalizeURL 45 | // Returns the normalized string, the parsed URL object, and any parse error 46 | func ParseAndNormalize(urlStr string) (string, *url.URL, error) { 47 | parsed, err := url.ParseRequestURI(urlStr) // Stricter parsing 48 | if err != nil { 49 | return "", nil, err 50 | } 51 | normalizedStr := NormalizeURL(parsed) 52 | return normalizedStr, parsed, nil 53 | } 54 | -------------------------------------------------------------------------------- /pkg/storage/interface.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | "time" 6 | 7 | "doc-scraper/pkg/models" 8 | ) 9 | 10 | // VisitedStore defines the interface for storing and retrieving visited status for pages and images 11 | type VisitedStore interface { 12 | // MarkPageVisited marks a page URL as visited (pending state) 13 | // Returns true if the URL was newly added, false if it already existed 14 | MarkPageVisited(normalizedPageURL string) (bool, error) 15 | 16 | // CheckPageStatus retrieves the status and details of a page URL 17 | // Returns status ("success", "failure", "pending", "not_found", "db_error"), the PageDBEntry if found and parsed, and any error 18 | CheckPageStatus(normalizedPageURL string) (status string, entry *models.PageDBEntry, err error) 19 | 20 | // UpdatePageStatus updates the status and details for a page URL 21 | UpdatePageStatus(normalizedPageURL string, entry *models.PageDBEntry) error 22 | 23 | // CheckImageStatus retrieves the status and details of an image URL 24 | // Returns status ("success", "failure", "not_found", "db_error"), the ImageDBEntry if found and parsed, and any error 25 | CheckImageStatus(normalizedImgURL string) (status string, entry *models.ImageDBEntry, err error) 26 | 27 | // UpdateImageStatus updates the status and details for an image URL 28 | UpdateImageStatus(normalizedImgURL string, entry *models.ImageDBEntry) error 29 | 30 | // GetVisitedCount returns an approximate count of all keys 
in the store 31 | GetVisitedCount() (int, error) 32 | 33 | // RequeueIncomplete scans the DB and sends incomplete items (failed, pending, empty) to the provided channel 34 | // Should be called only during resume 35 | RequeueIncomplete(ctx context.Context, workChan chan<- models.WorkItem) (requeuedCount int, scanErrors int, err error) 36 | 37 | // WriteVisitedLog writes all page and image keys (URLs) to the specified file path 38 | WriteVisitedLog(filePath string) error 39 | 40 | // RunGC runs periodic garbage collection. Should be run in a goroutine 41 | RunGC(ctx context.Context, interval time.Duration) 42 | 43 | // Close cleanly closes the database connection 44 | Close() error 45 | } 46 | -------------------------------------------------------------------------------- /pkg/fetch/client.go: -------------------------------------------------------------------------------- 1 | package fetch 2 | 3 | import ( 4 | "errors" 5 | "net" 6 | "net/http" 7 | 8 | "doc-scraper/pkg/config" 9 | 10 | "github.com/sirupsen/logrus" 11 | ) 12 | 13 | // NewClient creates a new HTTP client based on the provided configuration. 14 | func NewClient(cfg config.HTTPClientConfig, log *logrus.Logger) *http.Client { 15 | log.Info("Initializing HTTP client...") 16 | 17 | // Create custom dialer with configured timeouts 18 | dialer := &net.Dialer{ 19 | Timeout: cfg.DialerTimeout, 20 | KeepAlive: cfg.DialerKeepAlive, 21 | // DualStack support is enabled by default 22 | } 23 | 24 | // Create custom transport using configured settings 25 | transport := &http.Transport{ 26 | Proxy: http.ProxyFromEnvironment, // Use system proxy settings 27 | DialContext: dialer.DialContext, // Use our custom dialer 28 | ForceAttemptHTTP2: true, // Default to true unless explicitly disabled 29 | MaxIdleConns: cfg.MaxIdleConns, 30 | MaxIdleConnsPerHost: cfg.MaxIdleConnsPerHost, 31 | IdleConnTimeout: cfg.IdleConnTimeout, 32 | TLSHandshakeTimeout: cfg.TLSHandshakeTimeout, 33 | ExpectContinueTimeout: cfg.ExpectContinueTimeout, 34 | MaxResponseHeaderBytes: 1 << 20, // Default: 1MB max header size 35 | WriteBufferSize: 4096, // Default 36 | ReadBufferSize: 4096, // Default 37 | DisableKeepAlives: false, // Keep-alives enabled by default 38 | } 39 | // Handle explicit setting for ForceAttemptHTTP2 if provided 40 | if cfg.ForceAttemptHTTP2 != nil { 41 | transport.ForceAttemptHTTP2 = *cfg.ForceAttemptHTTP2 42 | } 43 | 44 | client := &http.Client{ 45 | Timeout: cfg.Timeout, // Use configured overall timeout 46 | Transport: transport, // Use our custom transport 47 | CheckRedirect: func(req *http.Request, via []*http.Request) error { 48 | // Default Go behavior is 10 redirects max 49 | if len(via) >= 10 { 50 | return errors.New("stopped after 10 redirects") 51 | } 52 | log.Debugf("Redirecting: %s -> %s (hop %d)", via[len(via)-1].URL, req.URL, len(via)) 53 | return nil // Allow redirect 54 | }, 55 | } 56 | log.Info("HTTP client initialized.") 57 | return client 58 | } 59 | -------------------------------------------------------------------------------- /pkg/fetch/ratelimit.go: -------------------------------------------------------------------------------- 1 | package fetch 2 | 3 | import ( 4 | "math/rand" 5 | "sync" 6 | "time" 7 | 8 | "github.com/sirupsen/logrus" 9 | ) 10 | 11 | // RateLimiter manages request timing per host for politeness 12 | type RateLimiter struct { 13 | hostLastRequest map[string]time.Time // hostname -> last request attempt time 14 | hostLastRequestMu sync.Mutex // Protects hostLastRequest map 15 | defaultDelay time.Duration // 
Fallback delay if specific delay is invalid 16 | log *logrus.Logger 17 | } 18 | 19 | // NewRateLimiter creates a RateLimiter 20 | func NewRateLimiter(defaultDelay time.Duration, log *logrus.Logger) *RateLimiter { 21 | return &RateLimiter{ 22 | hostLastRequest: make(map[string]time.Time), 23 | defaultDelay: defaultDelay, 24 | log: log, 25 | } 26 | } 27 | 28 | // ApplyDelay sleeps if the time since the last request to the host is less than minDelay 29 | // Includes jitter (+/- 10%) to desynchronize requests 30 | func (rl *RateLimiter) ApplyDelay(host string, minDelay time.Duration) { 31 | // Use default delay if minDelay is invalid 32 | if minDelay <= 0 { 33 | minDelay = rl.defaultDelay 34 | } 35 | // No delay needed if effective delay is zero or negative 36 | if minDelay <= 0 { 37 | return 38 | } 39 | 40 | // Read last request time safely 41 | rl.hostLastRequestMu.Lock() 42 | lastReqTime, exists := rl.hostLastRequest[host] 43 | rl.hostLastRequestMu.Unlock() // Unlock before potentially sleeping 44 | 45 | if exists { 46 | elapsed := time.Since(lastReqTime) 47 | if elapsed < minDelay { 48 | sleepDuration := minDelay - elapsed 49 | 50 | // Add jitter: +/- 10% of sleepDuration 51 | var jitter time.Duration 52 | if sleepDuration > 0 { 53 | jitterRange := int64(sleepDuration) / 5 // 20% range width for +/-10% 54 | if jitterRange > 0 { // Avoid Int63n(0) 55 | jitter = time.Duration(rand.Int63n(jitterRange)) - (sleepDuration / 10) 56 | } 57 | } 58 | 59 | finalSleep := sleepDuration + jitter 60 | if finalSleep < 0 { 61 | finalSleep = 0 // Ensure non-negative sleep 62 | } 63 | 64 | if finalSleep > 0 { 65 | rl.log.WithFields(logrus.Fields{ 66 | "host": host, "sleep": finalSleep, "required_delay": minDelay, "elapsed": elapsed, 67 | }).Debug("Rate limit applying sleep") 68 | time.Sleep(finalSleep) 69 | } 70 | } 71 | } 72 | // Note: Timestamp update via UpdateLastRequestTime happens *after* the request attempt in calling code 73 | } 74 | 75 | // UpdateLastRequestTime records the current time as the last request attempt time for the host 76 | // Call this *after* an HTTP request attempt to the host 77 | func (rl *RateLimiter) UpdateLastRequestTime(host string) { 78 | rl.hostLastRequestMu.Lock() 79 | rl.hostLastRequest[host] = time.Now() 80 | rl.hostLastRequestMu.Unlock() 81 | } 82 | -------------------------------------------------------------------------------- /pkg/models/models.go: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | import "time" 4 | 5 | // WorkItem represents a URL and its depth to be processed by a worker 6 | type WorkItem struct { 7 | URL string 8 | Depth int 9 | } 10 | 11 | // PageDBEntry stores the result of processing a page URL in the database 12 | type PageDBEntry struct { 13 | Status string `json:"status"` // "success" or "failure" 14 | ErrorType string `json:"error_type,omitempty"` // Error category (on failure) 15 | ProcessedAt time.Time `json:"processed_at,omitempty"` // Timestamp of successful processing 16 | LastAttempt time.Time `json:"last_attempt"` // Timestamp of the last processing attempt 17 | Depth int `json:"depth"` // Depth at which this page was processed/attempted 18 | } 19 | 20 | // ImageDBEntry stores the result of processing an image URL in the database 21 | type ImageDBEntry struct { 22 | Status string `json:"status"` // "success" or "failure" 23 | LocalPath string `json:"local_path,omitempty"` // Relative path from site output dir (on success) 24 | Caption string `json:"caption,omitempty"` // Captured 
caption/alt (on success) 25 | ErrorType string `json:"error_type,omitempty"` // Error category (on failure) 26 | LastAttempt time.Time `json:"last_attempt"` // Timestamp of the last processing attempt 27 | } 28 | 29 | // ImageData stores information about a successfully downloaded image 30 | type ImageData struct { 31 | OriginalURL string 32 | LocalPath string // Relative path from site output dir 33 | Caption string // Image caption/alt text 34 | } 35 | 36 | // CrawlMetadata holds all metadata for a single crawl session of a site. 37 | type CrawlMetadata struct { 38 | SiteKey string `yaml:"site_key"` 39 | AllowedDomain string `yaml:"allowed_domain"` 40 | CrawlStartTime time.Time `yaml:"crawl_start_time"` 41 | CrawlEndTime time.Time `yaml:"crawl_end_time"` 42 | TotalPagesSaved int `yaml:"total_pages_saved"` 43 | SiteConfiguration map[string]interface{} `yaml:"site_configuration,omitempty"` // For a flexible dump of SiteConfig 44 | Pages []PageMetadata `yaml:"pages"` 45 | } 46 | 47 | // PageMetadata holds metadata for a single scraped page. 48 | type PageMetadata struct { 49 | OriginalURL string `yaml:"original_url"` 50 | NormalizedURL string `yaml:"normalized_url"` 51 | LocalFilePath string `yaml:"local_file_path"` // Relative to site_output_dir 52 | Title string `yaml:"title,omitempty"` 53 | Depth int `yaml:"depth"` 54 | ProcessedAt time.Time `yaml:"processed_at"` 55 | ContentHash string `yaml:"content_hash,omitempty"` // MD5 or SHA256 hex string 56 | ImageCount int `yaml:"image_count,omitempty"` // Count of images processed for this page 57 | // LinkedFrom []string `yaml:"linked_from,omitempty"` // Deferring for now 58 | } 59 | -------------------------------------------------------------------------------- /pkg/queue/priority_queue.go: -------------------------------------------------------------------------------- 1 | package queue 2 | 3 | import ( 4 | "container/heap" 5 | "sync" 6 | 7 | "doc-scraper/pkg/models" 8 | 9 | "github.com/sirupsen/logrus" 10 | ) 11 | 12 | // --- Priority Queue Implementation --- 13 | 14 | // PQItem represents an item in the priority queue 15 | type PQItem struct { 16 | workItem *models.WorkItem 17 | priority int // Lower value means higher priority (e.g., Depth) 18 | index int // The index of the item in the heap (required by heap interface) 19 | } 20 | 21 | // PriorityQueue implements heap.Interface 22 | type PriorityQueue []*PQItem 23 | 24 | func (pq PriorityQueue) Len() int { return len(pq) } 25 | 26 | func (pq PriorityQueue) Less(i, j int) bool { 27 | // Pop should return the item with the smallest priority value (lowest depth) 28 | return pq[i].priority < pq[j].priority 29 | } 30 | 31 | func (pq PriorityQueue) Swap(i, j int) { 32 | pq[i], pq[j] = pq[j], pq[i] 33 | pq[i].index = i 34 | pq[j].index = j 35 | } 36 | 37 | // Push adds an element to the heap 38 | func (pq *PriorityQueue) Push(x any) { 39 | n := len(*pq) 40 | item := x.(*PQItem) 41 | item.index = n 42 | *pq = append(*pq, item) 43 | } 44 | 45 | // Pop removes and returns the highest priority element (minimum value) from the heap 46 | func (pq *PriorityQueue) Pop() any { 47 | old := *pq 48 | n := len(old) 49 | item := old[n-1] 50 | old[n-1] = nil // avoid memory leak 51 | item.index = -1 // for safety 52 | *pq = old[0 : n-1] 53 | return item 54 | } 55 | 56 | // ThreadSafePriorityQueue wraps PriorityQueue with concurrency controls 57 | type ThreadSafePriorityQueue struct { 58 | pq PriorityQueue 59 | mu sync.Mutex 60 | cond *sync.Cond // Condition variable to wait for items 61 | closed bool 62 | 
log *logrus.Logger // Reference to the main logger 63 | } 64 | 65 | // NewThreadSafePriorityQueue creates a new thread-safe priority queue 66 | func NewThreadSafePriorityQueue(logger *logrus.Logger) *ThreadSafePriorityQueue { 67 | tspq := &ThreadSafePriorityQueue{log: logger} 68 | tspq.cond = sync.NewCond(&tspq.mu) // Initialize condition variable 69 | heap.Init(&tspq.pq) // Initialize the underlying heap 70 | return tspq 71 | } 72 | 73 | // Add pushes a work item onto the queue with priority based on depth 74 | func (tspq *ThreadSafePriorityQueue) Add(item *models.WorkItem) { 75 | tspq.mu.Lock() 76 | defer tspq.mu.Unlock() 77 | 78 | if tspq.closed { 79 | tspq.log.Warnf("Attempted to add item to closed queue: %s", item.URL) 80 | return 81 | } 82 | 83 | pqItem := &PQItem{ 84 | workItem: item, 85 | priority: item.Depth, // Use Depth as priority (lower depth = higher priority) 86 | } 87 | heap.Push(&tspq.pq, pqItem) // Add item to the heap 88 | tspq.cond.Signal() // Signal one waiting worker that an item is available 89 | } 90 | 91 | // Pop retrieves and removes the highest priority work item 92 | // It blocks if the queue is empty until an item is added or the queue is closed 93 | // Returns the item and true, or nil and false if the queue is closed and empty 94 | func (tspq *ThreadSafePriorityQueue) Pop() (*models.WorkItem, bool) { 95 | tspq.mu.Lock() 96 | defer tspq.mu.Unlock() 97 | 98 | // Wait while the queue is empty AND not closed 99 | for len(tspq.pq) == 0 { 100 | if tspq.closed { 101 | return nil, false // Queue closed and empty, signal worker to exit 102 | } 103 | // Wait releases the lock and waits for a Signal/Broadcast; reacquires lock upon waking 104 | tspq.cond.Wait() 105 | } 106 | 107 | // Re-check after waking up, in case Close() was called concurrently 108 | if len(tspq.pq) == 0 && tspq.closed { 109 | return nil, false 110 | } 111 | 112 | // Pop the highest priority item from the heap 113 | pqItem := heap.Pop(&tspq.pq).(*PQItem) 114 | return pqItem.workItem, true 115 | } 116 | 117 | // Close signals that no more items will be added to the queue 118 | func (tspq *ThreadSafePriorityQueue) Close() { 119 | tspq.mu.Lock() 120 | defer tspq.mu.Unlock() 121 | if !tspq.closed { 122 | tspq.closed = true 123 | tspq.cond.Broadcast() // Wake up ALL waiting workers so they can check the closed status 124 | } 125 | } 126 | 127 | // Len returns the current number of items in the queue (thread-safe) 128 | func (tspq *ThreadSafePriorityQueue) Len() int { 129 | tspq.mu.Lock() 130 | defer tspq.mu.Unlock() 131 | return len(tspq.pq) 132 | } 133 | -------------------------------------------------------------------------------- /pkg/utils/tree.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path/filepath" 9 | "slices" 10 | "strings" 11 | 12 | "github.com/sirupsen/logrus" 13 | ) 14 | 15 | const ( 16 | indentPrefix = " " 17 | entryPrefix = "├── " 18 | lastEntryPrefix = "└── " 19 | verticalLine = "│ " 20 | ) 21 | 22 | // GenerateAndSaveTreeStructure walks the targetDir and writes a text-based directory tree structure to the specified outputFilePath 23 | func GenerateAndSaveTreeStructure(targetDir, outputFilePath string, log *logrus.Logger) error { 24 | log.Debugf("Starting tree generation for target: %s", targetDir) 25 | // Ensure target directory exists 26 | if _, err := os.Stat(targetDir); os.IsNotExist(err) { 27 | return fmt.Errorf("target directory '%s' does not exist: %w", 
targetDir, err) 28 | } else if err != nil { 29 | return fmt.Errorf("error checking target directory '%s': %w", targetDir, err) 30 | } 31 | 32 | // Create or truncate the output file 33 | file, err := os.Create(outputFilePath) 34 | if err != nil { 35 | return fmt.Errorf("failed to create output file '%s': %w", outputFilePath, err) 36 | } 37 | defer file.Close() // Ensure file is closed 38 | 39 | writer := bufio.NewWriter(file) 40 | defer writer.Flush() // Ensure buffer is flushed 41 | 42 | // Write header 43 | _, err = fmt.Fprintf(writer, "Directory Structure for: %s\n", targetDir) 44 | if err != nil { 45 | return err 46 | } 47 | _, err = fmt.Fprintf(writer, "%s\n\n", strings.Repeat("=", 25+len(targetDir))) 48 | if err != nil { 49 | return err 50 | } 51 | 52 | // Write the root directory name itself 53 | rootName := filepath.Base(targetDir) 54 | _, err = fmt.Fprintf(writer, "%s/\n", rootName) 55 | if err != nil { 56 | return err 57 | } 58 | 59 | // Start the recursive walk from the target directory, passing the logger 60 | log.Debugf("Initiating recursive walk from: %s", targetDir) 61 | err = walkDirRecursive(writer, targetDir, "", log) 62 | if err != nil { 63 | // os.Remove(outputFilePath) 64 | log.Errorf("Error occurred during recursive walk for '%s': %v", targetDir, err) 65 | return fmt.Errorf("error generating tree structure for '%s': %w", targetDir, err) 66 | } 67 | log.Debugf("Finished recursive walk for: %s", targetDir) 68 | 69 | return nil // Success 70 | } 71 | 72 | // walkDirRecursive performs the recursive directory walk and writes entries 73 | func walkDirRecursive(writer io.Writer, dirPath string, currentIndent string, log *logrus.Logger) error { 74 | log.Debugf("Walking directory: %s", dirPath) 75 | entries, err := os.ReadDir(dirPath) 76 | if err != nil { 77 | // Log the error at warning level as we are handling it by returning 78 | log.Warnf("Failed to read directory '%s': %v", dirPath, err) 79 | return fmt.Errorf("failed to read directory '%s': %w", dirPath, err) 80 | } 81 | log.Debugf("Found %d entries in %s", len(entries), dirPath) 82 | 83 | // Sort entries: directories first, then alphabetically by name 84 | slices.SortFunc(entries, func(a, b os.DirEntry) int { 85 | aIsDir := a.IsDir() 86 | bIsDir := b.IsDir() 87 | if aIsDir && !bIsDir { 88 | return -1 // a (dir) comes before b (file) 89 | } 90 | if !aIsDir && bIsDir { 91 | return 1 // b (dir) comes before a (file) 92 | } 93 | // Both are dirs or both are files, sort by name 94 | return strings.Compare(strings.ToLower(a.Name()), strings.ToLower(b.Name())) 95 | }) 96 | 97 | for i, entry := range entries { 98 | isLast := (i == len(entries)-1) // Check if this is the last entry at this level 99 | 100 | // Determine connector prefix 101 | connector := entryPrefix 102 | if isLast { 103 | connector = lastEntryPrefix 104 | } 105 | 106 | // Log the entry being written 107 | log.Debugf("Writing entry: %s%s%s", currentIndent, connector, entry.Name()) 108 | 109 | // Write the current entry line 110 | _, writeErr := fmt.Fprintf(writer, "%s%s%s\n", currentIndent, connector, entry.Name()) 111 | if writeErr != nil { 112 | log.Errorf("Error writing entry '%s' to output file: %v", entry.Name(), writeErr) 113 | return writeErr // Stop processing on write error 114 | } 115 | 116 | // If it's a directory, recurse 117 | if entry.IsDir() { 118 | // Determine the prefix for the next level's indentation 119 | nextIndent := currentIndent 120 | if isLast { 121 | nextIndent += indentPrefix // No vertical line needed after last entry 122 | } else { 
123 | nextIndent += verticalLine // Add vertical line for non-last entries 124 | } 125 | 126 | // Recursive call 127 | subDirPath := filepath.Join(dirPath, entry.Name()) 128 | log.Debugf("Recursing into directory: %s", subDirPath) 129 | err := walkDirRecursive(writer, subDirPath, nextIndent, log) 130 | if err != nil { 131 | return err // Propagate error up 132 | } 133 | log.Debugf("Finished recursion for directory: %s", subDirPath) 134 | } 135 | } 136 | log.Debugf("Finished processing directory: %s", dirPath) 137 | return nil // Success for this directory level 138 | } 139 | -------------------------------------------------------------------------------- /compare.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import difflib 3 | from pathlib import Path 4 | 5 | 6 | def get_relative_items(root_dir: Path) -> set[Path]: 7 | """ 8 | Recursively finds all files and directories within root_dir 9 | and returns their paths relative to root_dir. 10 | """ 11 | items = set() 12 | for item in root_dir.rglob("*"): 13 | items.add(item.relative_to(root_dir)) 14 | return items 15 | 16 | 17 | def compare_files_line_by_line(file1: Path, file2: Path) -> list[str] | None: 18 | """ 19 | Compares two files line by line and returns a unified diff list 20 | if they differ, or None if they are identical or an error occurs. 21 | """ 22 | try: 23 | with ( 24 | open(file1, "r", encoding="utf-8", errors="ignore") as f1, 25 | open(file2, "r", encoding="utf-8", errors="ignore") as f2, 26 | ): 27 | lines1 = f1.readlines() 28 | lines2 = f2.readlines() 29 | 30 | diff = list( 31 | difflib.unified_diff( 32 | lines1, lines2, fromfile=str(file1), tofile=str(file2), lineterm="\n" 33 | ) 34 | ) 35 | 36 | if not diff: 37 | return None 38 | return diff 39 | 40 | except OSError as e: 41 | print(f" [Error] Cannot read/compare file: {e}") 42 | return ["Error reading file."] 43 | except Exception as e: 44 | print(f" [Error] Unexpected error comparing {file1} and {file2}: {e}") 45 | return ["Unexpected error during comparison."] 46 | 47 | 48 | def compare_folders(dir1: Path, dir2: Path): 49 | """ 50 | Compares two directories thoroughly: structure and file content. 51 | """ 52 | print(f"Comparing '{dir1}' and '{dir2}'...\n") 53 | 54 | if not dir1.is_dir(): 55 | print(f"Error: Folder '{dir1}' does not exist or is not a directory.") 56 | return 57 | if not dir2.is_dir(): 58 | print(f"Error: Folder '{dir2}' does not exist or is not a directory.") 59 | return 60 | 61 | print("Scanning directories...") 62 | items1 = get_relative_items(dir1) 63 | items2 = get_relative_items(dir2) 64 | print(f"Found {len(items1)} items in '{dir1}', {len(items2)} items in '{dir2}'.") 65 | 66 | # --- 1. Structure Differences --- 67 | only_in_dir1 = items1 - items2 68 | only_in_dir2 = items2 - items1 69 | common_items = items1 & items2 70 | 71 | found_diff = False 72 | 73 | if only_in_dir1: 74 | found_diff = True 75 | print("\n--- Items only in '{}' ---".format(dir1)) 76 | for item in sorted(list(only_in_dir1)): 77 | item_type = "(Dir)" if (dir1 / item).is_dir() else "(File)" 78 | print(f"+ {item} {item_type}") 79 | 80 | if only_in_dir2: 81 | found_diff = True 82 | print("\n--- Items only in '{}' ---".format(dir2)) 83 | for item in sorted(list(only_in_dir2)): 84 | item_type = "(Dir)" if (dir2 / item).is_dir() else "(File)" 85 | print(f"+ {item} {item_type}") 86 | 87 | # --- 2. 
Compare Common Items --- 88 | if common_items: 89 | print(f"\n--- Comparing {len(common_items)} common items ---") 90 | for item_rel_path in sorted(list(common_items)): 91 | path1 = dir1 / item_rel_path 92 | path2 = dir2 / item_rel_path 93 | 94 | is_dir1 = path1.is_dir() 95 | is_dir2 = path2.is_dir() 96 | 97 | if is_dir1 != is_dir2: 98 | found_diff = True 99 | print(f"\n* Type mismatch: '{item_rel_path}'") 100 | print(f" '{path1}': {'Directory' if is_dir1 else 'File'}") 101 | print(f" '{path2}': {'Directory' if is_dir2 else 'File'}") 102 | continue 103 | 104 | if is_dir1 and is_dir2: 105 | # print(f" Directory: '{item_rel_path}' (present in both)") 106 | pass 107 | 108 | elif not is_dir1 and not is_dir2: 109 | try: 110 | stat1 = path1.stat() 111 | stat2 = path2.stat() 112 | 113 | if stat1.st_size != stat2.st_size: 114 | found_diff = True 115 | print(f"\n* Size difference: '{item_rel_path}'") 116 | print(f" '{path1}': {stat1.st_size} bytes") 117 | print(f" '{path2}': {stat2.st_size} bytes") 118 | # Still attempt line-by-line comparison unless files are huge 119 | # if abs(stat1.st_size - stat2.st_size) > SOME_THRESHOLD: continue 120 | 121 | diff_lines = compare_files_line_by_line(path1, path2) 122 | 123 | if diff_lines: 124 | found_diff = True 125 | print(f"\n* Content difference: '{item_rel_path}'") 126 | # for line in diff_lines[:50]: 127 | for line in diff_lines: 128 | print(f" {line.rstrip()}") 129 | 130 | except OSError as e: 131 | found_diff = True 132 | print(f"\n* Error accessing file stats for '{item_rel_path}': {e}") 133 | except Exception as e: 134 | found_diff = True 135 | print( 136 | f"\n* Unexpected error processing file '{item_rel_path}': {e}" 137 | ) 138 | 139 | # --- 3. Final Summary --- 140 | print("\n--- Comparison Summary ---") 141 | if not found_diff: 142 | print("No differences found.") 143 | else: 144 | print("Differences found (listed above).") 145 | 146 | 147 | if __name__ == "__main__": 148 | parser = argparse.ArgumentParser( 149 | description="Compare two folders thoroughly (structure and file content)." 
150 | ) 151 | parser.add_argument("folder1", type=Path, help="Path to the first folder.") 152 | parser.add_argument("folder2", type=Path, help="Path to the second folder.") 153 | 154 | args = parser.parse_args() 155 | 156 | compare_folders(args.folder1, args.folder2) 157 | -------------------------------------------------------------------------------- /pkg/fetch/robots.go: -------------------------------------------------------------------------------- 1 | package fetch 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "net/http" 7 | "net/url" 8 | "sync" 9 | 10 | "github.com/sirupsen/logrus" 11 | "github.com/temoto/robotstxt" 12 | "golang.org/x/sync/semaphore" 13 | 14 | "doc-scraper/pkg/config" 15 | ) 16 | 17 | // SitemapDiscoverer defines the callback interface for handling discovered sitemap URLs 18 | type SitemapDiscoverer interface { 19 | FoundSitemap(sitemapURL string) 20 | } 21 | 22 | // RobotsHandler manages fetching, parsing, caching, and checking robots.txt data 23 | type RobotsHandler struct { 24 | fetcher *Fetcher 25 | rateLimiter *RateLimiter 26 | robotsCache map[string]*robotstxt.RobotsData // hostname -> parsed data (or nil) 27 | robotsCacheMu sync.Mutex 28 | globalSemaphore *semaphore.Weighted 29 | sitemapNotifier SitemapDiscoverer // Component to notify about found sitemaps 30 | cfg config.AppConfig 31 | log *logrus.Logger 32 | } 33 | 34 | // NewRobotsHandler creates a RobotsHandler 35 | func NewRobotsHandler( 36 | fetcher *Fetcher, 37 | rateLimiter *RateLimiter, 38 | globalSemaphore *semaphore.Weighted, 39 | sitemapNotifier SitemapDiscoverer, 40 | cfg config.AppConfig, 41 | log *logrus.Logger, 42 | ) *RobotsHandler { 43 | return &RobotsHandler{ 44 | fetcher: fetcher, 45 | rateLimiter: rateLimiter, 46 | robotsCache: make(map[string]*robotstxt.RobotsData), 47 | globalSemaphore: globalSemaphore, 48 | sitemapNotifier: sitemapNotifier, 49 | cfg: cfg, 50 | log: log, 51 | } 52 | } 53 | 54 | // GetRobotsData retrieves robots.txt data for the targetURL's host, using cache or fetching 55 | // Returns parsed data or nil on any error/4xx/missing file 56 | // signalChan is only for coordinating the initial crawler startup fetch 57 | func (rh *RobotsHandler) GetRobotsData(targetURL *url.URL, signalChan chan<- bool, ctx context.Context) *robotstxt.RobotsData { 58 | if ctx == nil { 59 | ctx = context.Background() 60 | } 61 | // Signal completion on exit if channel provided (non-blocking) 62 | if signalChan != nil { 63 | defer func() { 64 | select { 65 | case signalChan <- true: 66 | default: 67 | rh.log.Warn("Failed robots signalChan send") 68 | } 69 | }() 70 | } 71 | 72 | host := targetURL.Hostname() 73 | hostLog := rh.log.WithField("host", host) 74 | 75 | // 1. Check Cache 76 | rh.robotsCacheMu.Lock() 77 | robotsData, found := rh.robotsCache[host] 78 | rh.robotsCacheMu.Unlock() 79 | if found { 80 | return robotsData // Return cached data (could be nil) 81 | } 82 | 83 | // 2. Prepare Fetch URL 84 | robotsURL := &url.URL{Scheme: targetURL.Scheme, Host: host, Path: "/robots.txt"} 85 | if targetURL.Scheme != "http" && targetURL.Scheme != "https" { 86 | hostLog.Warnf("Invalid scheme '%s', defaulting to https for robots.txt", targetURL.Scheme) 87 | robotsURL.Scheme = "https" 88 | } 89 | robotsURLStr := robotsURL.String() 90 | robotsLog := hostLog.WithField("robots_url", robotsURLStr) 91 | robotsLog.Info("Fetching robots.txt...") // Log only on cache miss 92 | 93 | // 3. 
Acquire Global Semaphore 94 | semTimeout := rh.cfg.SemaphoreAcquireTimeout 95 | acquiredSemaphore := false 96 | robotsLog.Debug("Acquiring global semaphore...") 97 | ctxAcquire, cancelAcquire := context.WithTimeout(ctx, semTimeout) 98 | err := rh.globalSemaphore.Acquire(ctxAcquire, 1) 99 | cancelAcquire() 100 | if err != nil { 101 | robotsLog.Errorf("Error acquiring global semaphore: %v", err) 102 | rh.robotsCacheMu.Lock() 103 | rh.robotsCache[host] = nil 104 | rh.robotsCacheMu.Unlock() // Cache failure 105 | return nil 106 | } 107 | acquiredSemaphore = true 108 | robotsLog.Debug("Acquired global semaphore.") 109 | defer func() { // Ensure release 110 | if acquiredSemaphore { 111 | rh.globalSemaphore.Release(1) 112 | robotsLog.Debug("Released global semaphore.") 113 | } 114 | }() 115 | 116 | // 4. Apply Rate Limit (using default delay) 117 | rh.rateLimiter.ApplyDelay(host, rh.cfg.DefaultDelayPerHost) 118 | 119 | // 5. Fetch Request (with retries via Fetcher) 120 | req, err := http.NewRequestWithContext(ctx, "GET", robotsURLStr, nil) 121 | if err != nil { 122 | robotsLog.Errorf("Error creating request: %v", err) 123 | rh.robotsCacheMu.Lock() 124 | rh.robotsCache[host] = nil 125 | rh.robotsCacheMu.Unlock() 126 | return nil 127 | } 128 | req.Header.Set("User-Agent", rh.cfg.DefaultUserAgent) // Use default agent for robots 129 | 130 | resp, fetchErr := rh.fetcher.FetchWithRetry(req, ctx) 131 | rh.rateLimiter.UpdateLastRequestTime(host) // Update time after attempt 132 | 133 | if fetchErr != nil { 134 | // Fetcher already logged error details 135 | robotsLog.Errorf("Fetching robots.txt failed: %v", fetchErr) 136 | rh.robotsCacheMu.Lock() 137 | rh.robotsCache[host] = nil 138 | rh.robotsCacheMu.Unlock() 139 | return nil 140 | } 141 | // Success: 2xx response. 142 | defer resp.Body.Close() 143 | 144 | // 7. Read and Parse Body. 145 | bodyBytes, err := io.ReadAll(resp.Body) 146 | if err != nil { 147 | robotsLog.Errorf("Error reading body: %v", err) 148 | rh.robotsCacheMu.Lock() 149 | rh.robotsCache[host] = nil 150 | rh.robotsCacheMu.Unlock() 151 | return nil 152 | } 153 | 154 | data, err := robotstxt.FromBytes(bodyBytes) 155 | if err != nil { 156 | robotsLog.Errorf("Error parsing content: %v", err) 157 | rh.robotsCacheMu.Lock() 158 | rh.robotsCache[host] = nil 159 | rh.robotsCacheMu.Unlock() 160 | return nil 161 | } 162 | 163 | // 8. Cache Success & Notify Sitemaps 164 | robotsLog.Info("Successfully fetched and parsed robots.txt") 165 | rh.robotsCacheMu.Lock() 166 | rh.robotsCache[host] = data // Cache successful parse 167 | rh.robotsCacheMu.Unlock() 168 | 169 | if rh.sitemapNotifier != nil && len(data.Sitemaps) > 0 { 170 | robotsLog.Infof("Found %d sitemap directive(s)", len(data.Sitemaps)) 171 | for _, sitemapURL := range data.Sitemaps { 172 | rh.sitemapNotifier.FoundSitemap(sitemapURL) // Notify discoverer 173 | } 174 | } 175 | 176 | return data 177 | } 178 | 179 | // TestAgent checks if the user agent is allowed access based on cached/fetched rules 180 | // Returns true if allowed (or robots fetch/parse fails), false otherwise 181 | func (rh *RobotsHandler) TestAgent(targetURL *url.URL, userAgent string, ctx context.Context) bool { 182 | // Get data, fetching if needed. 
Handles caching internally 183 | robotsData := rh.GetRobotsData(targetURL, nil, ctx) 184 | 185 | // Assume allowed if robots data could not be obtained (4xx, 5xx, network error, parse error) 186 | if robotsData == nil { 187 | return true 188 | } 189 | 190 | // Perform check using the parsed data 191 | return robotsData.TestAgent(targetURL.RequestURI(), userAgent) 192 | } 193 | -------------------------------------------------------------------------------- /pkg/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import "time" 4 | 5 | // SiteConfig holds configuration specific to a single website crawl 6 | type SiteConfig struct { 7 | StartURLs []string `yaml:"start_urls"` 8 | AllowedDomain string `yaml:"allowed_domain"` 9 | AllowedPathPrefix string `yaml:"allowed_path_prefix"` 10 | ContentSelector string `yaml:"content_selector"` 11 | LinkExtractionSelectors []string `yaml:"link_extraction_selectors,omitempty"` 12 | DisallowedPathPatterns []string `yaml:"disallowed_path_patterns,omitempty"` // Regex patterns for paths to exclude 13 | RespectNofollow bool `yaml:"respect_nofollow,omitempty"` 14 | UserAgent string `yaml:"user_agent,omitempty"` 15 | DelayPerHost time.Duration `yaml:"delay_per_host,omitempty"` 16 | MaxDepth int `yaml:"max_depth"` 17 | SkipImages *bool `yaml:"skip_images,omitempty"` 18 | MaxImageSizeBytes *int64 `yaml:"max_image_size_bytes,omitempty"` 19 | AllowedImageDomains []string `yaml:"allowed_image_domains,omitempty"` 20 | DisallowedImageDomains []string `yaml:"disallowed_image_domains,omitempty"` 21 | EnableOutputMapping *bool `yaml:"enable_output_mapping,omitempty"` 22 | OutputMappingFilename string `yaml:"output_mapping_filename,omitempty"` 23 | EnableMetadataYAML *bool `yaml:"enable_metadata_yaml,omitempty"` 24 | MetadataYAMLFilename string `yaml:"metadata_yaml_filename,omitempty"` 25 | } 26 | 27 | // AppConfig holds the global application configuration 28 | type AppConfig struct { 29 | DefaultUserAgent string `yaml:"default_user_agent"` 30 | DefaultDelayPerHost time.Duration `yaml:"default_delay_per_host"` 31 | NumWorkers int `yaml:"num_workers"` 32 | NumImageWorkers int `yaml:"num_image_workers,omitempty"` 33 | MaxRequests int `yaml:"max_requests"` 34 | MaxRequestsPerHost int `yaml:"max_requests_per_host"` 35 | OutputBaseDir string `yaml:"output_base_dir"` 36 | StateDir string `yaml:"state_dir"` 37 | MaxRetries int `yaml:"max_retries,omitempty"` 38 | InitialRetryDelay time.Duration `yaml:"initial_retry_delay,omitempty"` 39 | MaxRetryDelay time.Duration `yaml:"max_retry_delay,omitempty"` 40 | SemaphoreAcquireTimeout time.Duration `yaml:"semaphore_acquire_timeout,omitempty"` 41 | GlobalCrawlTimeout time.Duration `yaml:"global_crawl_timeout,omitempty"` 42 | SkipImages bool `yaml:"skip_images,omitempty"` 43 | MaxImageSizeBytes int64 `yaml:"max_image_size_bytes,omitempty"` 44 | HTTPClientSettings HTTPClientConfig `yaml:"http_client_settings,omitempty"` 45 | Sites map[string]SiteConfig `yaml:"sites"` 46 | EnableOutputMapping bool `yaml:"enable_output_mapping,omitempty"` 47 | OutputMappingFilename string `yaml:"output_mapping_filename,omitempty"` 48 | EnableMetadataYAML bool `yaml:"enable_metadata_yaml,omitempty"` 49 | MetadataYAMLFilename string `yaml:"metadata_yaml_filename,omitempty"` 50 | } 51 | 52 | // HTTPClientConfig holds settings for the shared HTTP client 53 | type HTTPClientConfig struct { 54 | Timeout time.Duration `yaml:"timeout,omitempty"` // Overall request timeout 55 | MaxIdleConns 
int `yaml:"max_idle_conns,omitempty"` // Max total idle connections 56 | MaxIdleConnsPerHost int `yaml:"max_idle_conns_per_host,omitempty"` // Max idle connections per host 57 | IdleConnTimeout time.Duration `yaml:"idle_conn_timeout,omitempty"` // Timeout for idle connections 58 | TLSHandshakeTimeout time.Duration `yaml:"tls_handshake_timeout,omitempty"` // Timeout for TLS handshake 59 | ExpectContinueTimeout time.Duration `yaml:"expect_continue_timeout,omitempty"` // Timeout for 100-continue 60 | ForceAttemptHTTP2 *bool `yaml:"force_attempt_http2,omitempty"` // Explicitly enable/disable HTTP/2 attempt (use pointer for tri-state: nil=default, true=force, false=disable) 61 | DialerTimeout time.Duration `yaml:"dialer_timeout,omitempty"` // Connection dial timeout 62 | DialerKeepAlive time.Duration `yaml:"dialer_keep_alive,omitempty"` // TCP keep-alive interval 63 | } 64 | 65 | // GetEffectiveSkipImages determines the effective skip setting 66 | func GetEffectiveSkipImages(siteCfg SiteConfig, appCfg AppConfig) bool { 67 | if siteCfg.SkipImages != nil { 68 | return *siteCfg.SkipImages 69 | } 70 | return appCfg.SkipImages 71 | } 72 | 73 | // GetEffectiveMaxImageSize determines the effective max image size 74 | func GetEffectiveMaxImageSize(siteCfg SiteConfig, appCfg AppConfig) int64 { 75 | if siteCfg.MaxImageSizeBytes != nil { 76 | return *siteCfg.MaxImageSizeBytes 77 | } 78 | return appCfg.MaxImageSizeBytes 79 | } 80 | 81 | // GetEffectiveEnableOutputMapping determines the effective setting for enabling the mapping file 82 | func GetEffectiveEnableOutputMapping(siteCfg SiteConfig, appCfg AppConfig) bool { 83 | if siteCfg.EnableOutputMapping != nil { 84 | return *siteCfg.EnableOutputMapping 85 | } 86 | return appCfg.EnableOutputMapping // Fallback to global setting 87 | } 88 | 89 | // GetEffectiveOutputMappingFilename determines the effective filename for the mapping file 90 | // Site config (if non-empty) overrides global 91 | // If both site and global are empty, a hardcoded default is returned 92 | func GetEffectiveOutputMappingFilename(siteCfg SiteConfig, appCfg AppConfig) string { 93 | if siteCfg.OutputMappingFilename != "" { 94 | return siteCfg.OutputMappingFilename 95 | } 96 | if appCfg.OutputMappingFilename != "" { 97 | return appCfg.OutputMappingFilename 98 | } 99 | // Fallback to a hardcoded default if neither global nor site-specific filename is provided 100 | return "url_to_file_map.tsv" 101 | } 102 | 103 | // GetEffectiveEnableMetadataYAML determines if YAML metadata should be generated. 104 | func GetEffectiveEnableMetadataYAML(siteCfg SiteConfig, appCfg AppConfig) bool { 105 | if siteCfg.EnableMetadataYAML != nil { 106 | return *siteCfg.EnableMetadataYAML 107 | } 108 | return appCfg.EnableMetadataYAML 109 | } 110 | 111 | // GetEffectiveMetadataYAMLFilename determines the filename for the YAML metadata. 
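// Site config (if non-empty) overrides the global value; if both are empty, the
// hardcoded default "metadata.yaml" is returned.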
112 | func GetEffectiveMetadataYAMLFilename(siteCfg SiteConfig, appCfg AppConfig) string { 113 | if siteCfg.MetadataYAMLFilename != "" { 114 | return siteCfg.MetadataYAMLFilename 115 | } 116 | if appCfg.MetadataYAMLFilename != "" { 117 | return appCfg.MetadataYAMLFilename 118 | } 119 | return "metadata.yaml" 120 | } 121 | -------------------------------------------------------------------------------- /pkg/utils/errors.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "net" 8 | "os" 9 | "strings" 10 | ) 11 | 12 | // --- Sentinel Errors for Categorization --- 13 | var ( 14 | ErrRetryFailed = errors.New("request failed after all retries") // Wraps the last underlying error 15 | ErrClientHTTPError = errors.New("client HTTP error (4xx)") // Wraps original error/status 16 | ErrServerHTTPError = errors.New("server HTTP error (5xx)") // Wraps original error/status 17 | ErrOtherHTTPError = errors.New("non-2xx HTTP status") // Wraps original error/status 18 | ErrRobotsDisallowed = errors.New("disallowed by robots.txt") 19 | ErrScopeViolation = errors.New("URL out of scope (domain/prefix/pattern)") 20 | ErrMaxDepthExceeded = errors.New("maximum crawl depth exceeded") 21 | ErrContentSelector = errors.New("content selector not found") 22 | ErrParsing = errors.New("parsing error") // Wraps specific parsing error (HTML, URL, JSON, XML) 23 | ErrFilesystem = errors.New("filesystem error") // Wraps os errors 24 | ErrDatabase = errors.New("database error") // Wraps badger errors 25 | ErrSemaphoreTimeout = errors.New("timeout acquiring semaphore") 26 | ErrRequestCreation = errors.New("failed to create HTTP request") 27 | ErrResponseBodyRead = errors.New("failed to read response body") 28 | ErrMarkdownConversion = errors.New("failed to convert HTML to markdown") 29 | ErrConfigValidation = errors.New("configuration validation error") 30 | ) 31 | 32 | // CategorizeError maps an error to a predefined category string for logging/metrics. 
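// For example, an error satisfying errors.Is(err, ErrRobotsDisallowed) maps to "Policy_Robots",
// and a timeout wrapped by ErrRetryFailed typically maps to "RetryFailed_NetworkTimeout".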
33 | func CategorizeError(err error) string { 34 | if err == nil { 35 | return "None" 36 | } 37 | 38 | // Check against sentinel errors first 39 | switch { 40 | case errors.Is(err, ErrRetryFailed): 41 | underlying := errors.Unwrap(err) 42 | if underlying != nil { 43 | if errors.Is(underlying, ErrServerHTTPError) { 44 | return "RetryFailed_HTTPServer" 45 | } 46 | if errors.Is(underlying, ErrClientHTTPError) { 47 | return "RetryFailed_HTTPClient" 48 | } 49 | 50 | // Check for common network error substrings if wrapped error isn't a known sentinel 51 | errMsg := underlying.Error() 52 | if strings.Contains(errMsg, "timeout") || strings.Contains(errMsg, "Timeout") || strings.Contains(errMsg, "deadline exceeded") { 53 | return "RetryFailed_NetworkTimeout" 54 | } 55 | if strings.Contains(errMsg, "connection refused") { 56 | return "RetryFailed_ConnectionRefused" 57 | } 58 | if strings.Contains(errMsg, "no such host") { 59 | return "RetryFailed_DNSLookup" 60 | } 61 | var netErr net.Error 62 | if errors.As(underlying, &netErr) { 63 | if netErr.Timeout() { 64 | return "RetryFailed_NetworkTimeout" 65 | } 66 | } 67 | return "RetryFailed_NetworkOther" // Catch-all for other network errors after retry 68 | } 69 | return "RetryFailed_Unknown" // Retry failed, but couldn't identify underlying cause 70 | case errors.Is(err, ErrClientHTTPError): 71 | // Could try to extract exact 4xx code if needed, but category is often enough 72 | errMsg := err.Error() 73 | if strings.Contains(errMsg, " 404 ") { 74 | return "HTTP_404" 75 | } 76 | if strings.Contains(errMsg, " 403 ") { 77 | return "HTTP_403" 78 | } 79 | if strings.Contains(errMsg, " 401 ") { 80 | return "HTTP_401" 81 | } 82 | if strings.Contains(errMsg, " 429 ") { 83 | return "HTTP_429" 84 | } 85 | return "HTTP_4xx" // Generic 4xx 86 | case errors.Is(err, ErrServerHTTPError): 87 | // Should only see this wrapped by ErrRetryFailed usually, but handle directly too 88 | return "HTTP_5xx" 89 | case errors.Is(err, ErrOtherHTTPError): 90 | return "HTTP_OtherStatus" 91 | case errors.Is(err, ErrRobotsDisallowed): 92 | return "Policy_Robots" 93 | case errors.Is(err, ErrScopeViolation): 94 | return "Policy_Scope" 95 | case errors.Is(err, ErrMaxDepthExceeded): 96 | return "Policy_MaxDepth" 97 | case errors.Is(err, ErrContentSelector): 98 | return "Content_SelectorNotFound" 99 | case errors.Is(err, ErrParsing): 100 | // Could check wrapped error for URL vs HTML vs JSON vs XML parsing if needed 101 | errMsg := err.Error() 102 | if strings.Contains(errMsg, "URL") { 103 | return "Content_ParsingURL" 104 | } 105 | if strings.Contains(errMsg, "HTML") { 106 | return "Content_ParsingHTML" 107 | } 108 | if strings.Contains(errMsg, "JSON") { 109 | return "Content_ParsingJSON" 110 | } 111 | if strings.Contains(errMsg, "XML") { 112 | return "Content_ParsingXML" 113 | } 114 | return "Content_ParsingOther" 115 | case errors.Is(err, ErrMarkdownConversion): 116 | return "Content_Markdown" 117 | case errors.Is(err, ErrFilesystem): 118 | if errors.Is(err, os.ErrPermission) { 119 | return "Filesystem_Permission" 120 | } 121 | if errors.Is(err, os.ErrNotExist) { 122 | return "Filesystem_NotExist" 123 | } 124 | if errors.Is(err, os.ErrExist) { 125 | return "Filesystem_Exist" 126 | } 127 | // Add checks for disk full? 
requires syscall or specific error strings/numbers per OS 128 | return "Filesystem_Other" 129 | case errors.Is(err, ErrDatabase): 130 | // Could check for specific Badger errors if necessary 131 | return "Database_Other" 132 | case errors.Is(err, ErrSemaphoreTimeout): 133 | return "Resource_SemaphoreTimeout" 134 | case errors.Is(err, ErrRequestCreation): 135 | return "Internal_RequestCreation" 136 | case errors.Is(err, ErrResponseBodyRead): 137 | return "Network_BodyRead" 138 | case errors.Is(err, ErrConfigValidation): 139 | return "Config_Validation" 140 | } 141 | 142 | // --- Fallback checks for common underlying error types/strings --- 143 | 144 | // Context errors 145 | if errors.Is(err, context.Canceled) { 146 | return "System_ContextCanceled" 147 | } 148 | if errors.Is(err, context.DeadlineExceeded) { 149 | // Check if it was semaphore timeout wrapped in context error 150 | if strings.Contains(err.Error(), "semaphore") { 151 | return "Resource_SemaphoreTimeout" 152 | } 153 | return "System_ContextDeadlineExceeded" 154 | } 155 | 156 | // Network errors (if not wrapped by custom sentinels) 157 | var netErr net.Error 158 | if errors.As(err, &netErr) { 159 | if netErr.Timeout() { 160 | return "Network_Timeout" 161 | } 162 | // Other net.Error checks 163 | } 164 | errMsg := err.Error() 165 | // Use lowercase for reliable substring checks 166 | lowerErrMsg := strings.ToLower(errMsg) 167 | if strings.Contains(lowerErrMsg, "timeout") { 168 | return "Network_TimeoutGeneric" 169 | } 170 | if strings.Contains(lowerErrMsg, "connection refused") { 171 | return "Network_ConnectionRefused" 172 | } 173 | if strings.Contains(lowerErrMsg, "no such host") { 174 | return "Network_DNSLookup" 175 | } 176 | if strings.Contains(lowerErrMsg, "tls") || strings.Contains(lowerErrMsg, "certificate") { 177 | return "Network_TLS" 178 | } 179 | if strings.Contains(lowerErrMsg, "reset by peer") { 180 | return "Network_ConnectionReset" 181 | } 182 | if strings.Contains(lowerErrMsg, "broken pipe") { 183 | return "Network_BrokenPipe" 184 | } 185 | 186 | return "Unknown" 187 | } 188 | 189 | // Helper function to wrap an error with context if it's not nil. 190 | func WrapErrorf(err error, format string, args ...interface{}) error { 191 | if err == nil { 192 | return nil 193 | } 194 | // Append the original error to the args for %w 195 | args = append(args, err) 196 | return fmt.Errorf(format+": %w", args...) 
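	// Example: WrapErrorf(ErrConfigValidation, "invalid regex pattern #%d", 3) returns an error
	// whose message is "invalid regex pattern #3: configuration validation error" and which
	// still satisfies errors.Is(err, ErrConfigValidation).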
197 | } 198 | -------------------------------------------------------------------------------- /pkg/process/links.go: -------------------------------------------------------------------------------- 1 | package process 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "regexp" 7 | "strings" 8 | "sync" 9 | 10 | "github.com/PuerkitoBio/goquery" 11 | "github.com/sirupsen/logrus" 12 | 13 | "doc-scraper/pkg/config" 14 | "doc-scraper/pkg/models" 15 | "doc-scraper/pkg/parse" 16 | "doc-scraper/pkg/queue" 17 | "doc-scraper/pkg/storage" 18 | "doc-scraper/pkg/utils" 19 | ) 20 | 21 | // LinkProcessor handles extracting and queueing links found on a page 22 | type LinkProcessor struct { 23 | store storage.VisitedStore // To check/mark visited status 24 | pq *queue.ThreadSafePriorityQueue // To queue new work items 25 | compiledDisallowedPatterns []*regexp.Regexp // Pre-compiled patterns 26 | log *logrus.Logger 27 | } 28 | 29 | // NewLinkProcessor creates a LinkProcessor 30 | func NewLinkProcessor( 31 | store storage.VisitedStore, 32 | pq *queue.ThreadSafePriorityQueue, 33 | compiledDisallowedPatterns []*regexp.Regexp, 34 | log *logrus.Logger, 35 | ) *LinkProcessor { 36 | return &LinkProcessor{ 37 | store: store, 38 | pq: pq, 39 | compiledDisallowedPatterns: compiledDisallowedPatterns, 40 | log: log, 41 | } 42 | } 43 | 44 | // ExtractAndQueueLinks finds crawlable links within the specified selectors of a document, filters them based on scope and rules, and adds new ones to the priority queue 45 | // It takes the *original* document to ensure all potential links are considered, before the content might be modified by Markdown conversion etc 46 | func (lp *LinkProcessor) ExtractAndQueueLinks( 47 | originalDoc *goquery.Document, // Use the original, unmodified document 48 | finalURL *url.URL, // The final URL of the page (after redirects) to use as base 49 | currentDepth int, // The depth of the current page 50 | siteCfg config.SiteConfig, // Need site config for rules (nofollow, selectors, scope) 51 | wg *sync.WaitGroup, // Need WaitGroup to increment for queued items 52 | taskLog *logrus.Entry, 53 | ) (queuedCount int, err error) { // Return non-fatal error for DB issues 54 | 55 | nextDepth := currentDepth + 1 56 | taskLog = taskLog.WithField("next_depth", nextDepth) // Add next depth to log context 57 | taskLog.Debug("Extracting and queueing links...") 58 | queuedCount = 0 59 | var firstDBError error = nil 60 | 61 | // Check Max Depth for *next* level before even starting extraction 62 | if siteCfg.MaxDepth > 0 && nextDepth > siteCfg.MaxDepth { 63 | taskLog.Debugf("Max depth (%d) reached/exceeded for next level (%d), skipping link extraction.", siteCfg.MaxDepth, nextDepth) 64 | return 0, nil // No error, just skip 65 | } 66 | 67 | // Use a map to store unique absolute normalized URLs found across all selectors for this page 68 | foundLinks := make(map[string]string) // Map normalized URL -> original URL (for queuing) 69 | 70 | // Determine which selectors to use for link extraction 71 | selectorsToSearch := siteCfg.LinkExtractionSelectors 72 | if len(selectorsToSearch) == 0 { 73 | // Default behavior: Search the whole document body 74 | selectorsToSearch = []string{"body"} 75 | taskLog.Debug("No link_extraction_selectors defined, defaulting to 'body'") 76 | } else { 77 | taskLog.Debugf("Using link_extraction_selectors: %v", selectorsToSearch) 78 | } 79 | 80 | // --- Loop through the specified selectors --- 81 | for _, selector := range selectorsToSearch { 82 | taskLog.Debugf("Searching for links within 
selector: '%s'", selector) 83 | // Find links within the specified selector in the *original* document 84 | originalDoc.Find(selector).Find("a[href]").Each(func(index int, element *goquery.Selection) { 85 | href, exists := element.Attr("href") 86 | if !exists || href == "" { 87 | return // Skip empty hrefs 88 | } 89 | 90 | // Check nofollow *before* resolving URL (slightly more efficient) 91 | if siteCfg.RespectNofollow { 92 | if rel, _ := element.Attr("rel"); strings.Contains(strings.ToLower(rel), "nofollow") { 93 | taskLog.Debugf("Skipping nofollow link: %s", href) 94 | return 95 | } 96 | } 97 | 98 | // Resolve URL relative to the page's final URL 99 | linkURL, parseErr := finalURL.Parse(href) 100 | if parseErr != nil { 101 | taskLog.Warnf("Skipping invalid link href '%s' in selector '%s': %v", href, selector, parseErr) 102 | return // Skip unparseable links 103 | } 104 | absoluteLinkURL := linkURL.String() // Get the absolute URL string 105 | 106 | // --- Apply Standard Filtering Logic --- 107 | // Scheme check 108 | if linkURL.Scheme != "http" && linkURL.Scheme != "https" { 109 | return // Skip non-http(s) links like mailto:, tel:, etc 110 | } 111 | // Basic skip patterns (fragments handled by normalization later) 112 | // TODO: Javascript check - more robust? Maybe check scheme directly 113 | 114 | // Scope: Domain 115 | if linkURL.Hostname() != siteCfg.AllowedDomain { 116 | return // Skip different domains 117 | } 118 | 119 | // Scope: Prefix 120 | // Normalize path for check (ensure leading slash) 121 | targetPath := linkURL.Path 122 | if targetPath == "" { 123 | targetPath = "/" 124 | } 125 | if !strings.HasPrefix(targetPath, siteCfg.AllowedPathPrefix) { 126 | return // Skip paths outside the allowed prefix 127 | } 128 | 129 | // Scope: Disallowed patterns (use pre-compiled regex from LinkProcessor) 130 | isDisallowed := false 131 | for _, pattern := range lp.compiledDisallowedPatterns { 132 | // Match against the path part of the URL 133 | if pattern.MatchString(linkURL.Path) { 134 | isDisallowed = true 135 | taskLog.Debugf("Link '%s' disallowed by pattern: %s", absoluteLinkURL, pattern.String()) 136 | break 137 | } 138 | } 139 | if isDisallowed { 140 | return // Skip disallowed paths 141 | } 142 | // --- End Filtering --- 143 | 144 | // Normalize the valid, in-scope URL 145 | normalizedLink, _, errNorm := parse.ParseAndNormalize(absoluteLinkURL) 146 | if errNorm != nil { 147 | taskLog.Warnf("Cannot normalize extracted link '%s': %v", absoluteLinkURL, errNorm) 148 | return // Skip if normalization fails 149 | } 150 | 151 | // Add to map if not already present (using normalized as key) 152 | if _, found := foundLinks[normalizedLink]; !found { 153 | foundLinks[normalizedLink] = absoluteLinkURL // Store original URL for queueing 154 | } 155 | }) 156 | } 157 | 158 | // --- Queue New Links (check DB before queueing) --- 159 | if len(foundLinks) > 0 { 160 | taskLog.Debugf("Found %d unique, valid, in-scope links across all specified selectors.", len(foundLinks)) 161 | for normalizedLink, originalLinkURL := range foundLinks { 162 | // Check DB: Use MarkPageVisited which handles adding if not found 163 | // This prevents queueing if the link was *already* successfully processed or is currently pending from another source (sitemap, resume) 164 | added, visitErr := lp.store.MarkPageVisited(normalizedLink) 165 | if visitErr != nil { 166 | dbErr := fmt.Errorf("%w: checking/marking link '%s' visited: %w", utils.ErrDatabase, normalizedLink, visitErr) 167 | taskLog.Error(dbErr) 168 | if 
firstDBError == nil { 169 | firstDBError = dbErr 170 | } // Collect first DB error encountered 171 | continue // Skip this link if DB error occurs 172 | } 173 | 174 | // Only queue if MarkPageVisited returned true (meaning it was newly added) 175 | if added { 176 | wg.Add(1) // Increment WaitGroup *before* adding to queue 177 | nextWorkItem := models.WorkItem{URL: originalLinkURL, Depth: nextDepth} // Queue the *original* URL 178 | lp.pq.Add(&nextWorkItem) 179 | queuedCount++ 180 | taskLog.Debugf("Queued new link: %s (Normalized: %s)", originalLinkURL, normalizedLink) 181 | } else { 182 | taskLog.Debugf("Link already visited/pending, skipping queue: %s", normalizedLink) 183 | } 184 | } 185 | } else { 186 | taskLog.Debug("No new valid links found to queue.") 187 | } 188 | 189 | taskLog.Infof("Finished link extraction. Queued %d NEW links.", queuedCount) 190 | return queuedCount, firstDBError // Return count and any non-fatal DB error encountered 191 | } 192 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # --- Global Application Settings --- 2 | 3 | # Default User-Agent string for HTTP requests (used if site doesn't specify). 4 | # # Also used for fetching robots.txt. 5 | # default_user_agent: "GoCrawler/1.0 (Your Bot Info)" 6 | 7 | # Default minimum time delay between requests to the same host. 8 | # Format: e.g., "2s", "500ms". "0s" means no default delay. 9 | default_delay_per_host: 500ms 10 | 11 | # Number of concurrent workers processing pages. 12 | num_workers: 6 13 | 14 | # Number of concurrent workers downloading images (optional, defaults to num_workers if omitted). 15 | num_image_workers: 6 16 | 17 | # Global limit for total concurrent outgoing HTTP requests across all workers. 18 | max_requests: 48 19 | 20 | # Limit for concurrent outgoing HTTP requests *per host*. 21 | max_requests_per_host: 4 22 | 23 | # Base directory for saving crawled output (Markdown files, images). 24 | # Each site gets a subdirectory named after its sanitized domain. 25 | output_base_dir: "./crawled_docs" 26 | 27 | # Directory for storing persistent state (e.g., visited URLs database). 28 | # Each site gets a subdirectory here. 29 | state_dir: "./crawler_state" 30 | 31 | # --- Retry Settings (optional) --- 32 | # Maximum number of retries for failed requests (network errors, 5xx). 33 | max_retries: 4 34 | # Initial delay before the first retry (e.g., "1s", "500ms"). 35 | initial_retry_delay: 1s 36 | # Maximum delay between retries (caps exponential backoff). 37 | max_retry_delay: 30s 38 | 39 | # --- Timeout Settings (optional) --- 40 | # Maximum time to wait to acquire a semaphore before timing out. 41 | semaphore_acquire_timeout: 30s 42 | # Overall maximum duration for the entire crawl run (e.g., "2h", "30m"). "0s" or omitted means no timeout. 43 | global_crawl_timeout: 0s 44 | 45 | # --- Global Image Handling Defaults (optional, can be overridden per site) --- 46 | # Globally skip downloading and processing images. 47 | skip_images: false 48 | # Globally limit maximum image size in bytes (0 means unlimited). 49 | max_image_size_bytes: 10485760 # Example: 10 MiB 50 | 51 | # Whether to generate and save the URL-to-file output mapping. 52 | # Helps in tracking and referencing downloaded content easily. 53 | enable_output_mapping: true 54 | 55 | # Filename for output URL-to-file mapping table. 56 | # Stores a map of original URLs to their corresponding local filenames. 
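# Illustrative example of one mapping row (hypothetical URL and path; the exact column
# layout depends on the mapping writer, but each row pairs a crawled URL with the local
# Markdown file generated for it):
#   https://example.com/docs/intro/    intro/index.md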
57 | output_mapping_filename: "url_to_file_map.tsv" 58 | 59 | # Whether to generate and save metadata (like crawl timestamp, site info, etc.) in a YAML file. 60 | # Useful for tracking crawl history and managing data provenance. 61 | enable_metadata_yaml: true 62 | 63 | # Filename for the generated metadata YAML file. 64 | # This file will contain crawl-related information (e.g., site name, crawl time, total pages/images, errors). 65 | metadata_yaml_filename: "crawl_meta.yaml" 66 | 67 | # --- HTTP Client Settings (optional section, reasonable defaults applied if omitted) --- 68 | http_client_settings: 69 | # Overall timeout for a single HTTP request. 70 | # Covers connection, TLS handshake, sending request, waiting for headers, AND reading the entire response body. 71 | # Default if omitted: 45s 72 | timeout: 45s 73 | 74 | # Max total idle connections across all hosts. 75 | # Limits the total number of reusable TCP connections kept open waiting for new requests to any server. 76 | # Default if omitted: 100 77 | max_idle_conns: 100 78 | 79 | # Max idle connections per host. 80 | # Limits the number of reusable TCP connections kept open waiting for new requests to a *specific* server (hostname:port). 81 | # Default if omitted: 5 (slightly higher than Go's default of 2) 82 | max_idle_conns_per_host: 6 83 | 84 | # Max time an idle connection is kept alive. 85 | # How long an unused connection can sit in the idle pool before being closed automatically. 86 | # Default if omitted: 90s 87 | idle_conn_timeout: 90s 88 | 89 | # Timeout for TLS handshake. 90 | # Time limit specifically for completing the secure connection setup (SSL/TLS). 91 | # Default if omitted: 10s 92 | tls_handshake_timeout: 10s 93 | 94 | # Timeout waiting for "100 Continue" response. 95 | # Relevant for requests like POST/PUT where the client might send headers first and wait for a "100 Continue" status before sending the body. 96 | # Default if omitted: 1s 97 | expect_continue_timeout: 1s 98 | 99 | # Explicitly attempt/disable HTTP/2 (true/false). null uses Go default. 100 | # Controls whether the client tries to negotiate HTTP/2 with servers that support it. HTTP/2 can offer performance benefits (e.g., multiplexing). 101 | # Default if omitted: null (i.e. use Go's default behavior of attempting HTTP/2) 102 | force_attempt_http2: null 103 | 104 | # Timeout for establishing TCP connection. 105 | # Time limit specifically for the underlying network dialer to connect to the server's IP address and port. 106 | # Default if omitted: 15s 107 | dialer_timeout: 15s 108 | 109 | # TCP keep-alive interval. 110 | # If non-zero, enables TCP keep-alives, sending small packets on idle connections to check if they are still active and prevent intermediate network devices (firewalls, NATs) from dropping them. 111 | # Default if omitted: 30s 112 | dialer_keep_alive: 30s 113 | 114 | # --- Site-Specific Configurations --- 115 | # Define one or more sites to crawl below, using a unique key for each. 116 | sites: 117 | # Replace 'example_site' with an identifier for your target site (e.g., 'example_docs') 118 | example_site: 119 | # List of starting URLs for this site. 120 | start_urls: 121 | - "https://example.com/docs/" 122 | # Domain that the crawler is restricted to for this site. (Required) 123 | allowed_domain: "example.com" 124 | # Path prefix the crawler is restricted to (e.g., "/docs"). Use "/" for the whole domain.
(Required) 125 | allowed_path_prefix: "/docs" 126 | # CSS selector for the main content area to extract. (Required) 127 | content_selector: "article.content" # Example: Change to your target selector 128 | 129 | # --- Optional Site Settings --- 130 | # CSS selectors to search for links within (defaults to 'body' if omitted). 131 | link_extraction_selectors: 132 | - "article.content" 133 | - "nav.sidebar" 134 | # Regex patterns for paths to exclude (paths matching these won't be crawled). 135 | disallowed_path_patterns: 136 | - "/api/.*" 137 | - "/private/" 138 | - "\\.pdf$" 139 | # Respect rel="nofollow" attribute on links. 140 | respect_nofollow: true 141 | # Override the default_user_agent for this site. 142 | user_agent: "MyCustomBot/1.0 (+http://mybot.info)" 143 | # Override the default_delay_per_host for this site. 144 | delay_per_host: "1500ms" 145 | # Maximum crawl depth relative to start URLs (0 means unlimited). 146 | max_depth: 5 147 | # Override global 'skip_images' setting (true/false). null uses global. 148 | skip_images: null 149 | # Override global 'max_image_size_bytes' (0 for unlimited). null uses global. 150 | # max_image_size_bytes: 5242880 # Example: 5 MiB override 151 | # Only download images from these domain patterns (*.example.com or specific.com). 152 | allowed_image_domains: 153 | - "cdn.example.com" 154 | - "*.static-example.net" 155 | # Never download images from these domain patterns. 156 | disallowed_image_domains: 157 | - "ads.example.com" 158 | enable_output_mapping: true 159 | # Override global name for this site 160 | output_mapping_filename: "example_site_manifest.txt" 161 | 162 | langchain_py: 163 | start_urls: 164 | - https://python.langchain.com/docs/introduction/ 165 | allowed_domain: python.langchain.com 166 | allowed_path_prefix: /docs/ 167 | content_selector: article 168 | max_depth: 0 # unlimited 169 | respect_nofollow: true 170 | disallowed_path_patterns: 171 | - ^/docs/experimental/.* 172 | - ^/docs/api/.* # Adjusted pattern 173 | output_mapping_filename: "langchain_py_manifest.txt" 174 | enable_metadata_yaml: true 175 | metadata_yaml_filename: "langchain_metadata.yaml" 176 | 177 | rust_docs: 178 | start_urls: 179 | - https://rust-cli.github.io/book/tutorial/ 180 | allowed_domain: rust-cli.github.io 181 | allowed_path_prefix: /book/ 182 | content_selector: div#content.content 183 | max_depth: 0 # unlimited 184 | respect_nofollow: true 185 | link_extraction_selectors: 186 | - nav.nav-wrapper 187 | - div#menu-bar.menu-bar 188 | # - a.nav-chapeters.previous 189 | enable_output_mapping: true 190 | # Override global name for this site 191 | output_mapping_filename: "rust_docs_manifest.txt" 192 | enable_metadata_yaml: true -------------------------------------------------------------------------------- /pkg/fetch/fetcher.go: -------------------------------------------------------------------------------- 1 | package fetch 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "math" 9 | "math/rand" 10 | "net/http" 11 | "net/url" 12 | "time" 13 | 14 | "github.com/sirupsen/logrus" 15 | 16 | "doc-scraper/pkg/config" 17 | "doc-scraper/pkg/utils" 18 | ) 19 | 20 | // Fetcher handles making HTTP requests with configured retry logic, using an underlying http.Client 21 | type Fetcher struct { 22 | client *http.Client // The configured HTTP client to use for requests 23 | cfg config.AppConfig // Application config, needed primarily for retry settings 24 | log *logrus.Logger 25 | } 26 | 27 | // NewFetcher creates a new Fetcher instance 28 | func 
NewFetcher(client *http.Client, cfg config.AppConfig, log *logrus.Logger) *Fetcher { 29 | return &Fetcher{ 30 | client: client, 31 | cfg: cfg, 32 | log: log, 33 | } 34 | } 35 | 36 | // FetchWithRetry performs an HTTP request associated with the provided context 37 | // It implements a retry mechanism with exponential backoff and jitter for transient network errors and specific HTTP status codes (5xx, 429) 38 | func (f *Fetcher) FetchWithRetry(req *http.Request, ctx context.Context) (*http.Response, error) { 39 | var lastErr error // Stores the error from the *last* failed attempt in the loop 40 | var currentResp *http.Response // Stores the response from the *current* attempt (potentially failed) 41 | 42 | reqLog := f.log.WithField("url", req.URL.String()) 43 | 44 | // Get retry settings from the application configuration 45 | maxRetries := f.cfg.MaxRetries 46 | initialRetryDelay := f.cfg.InitialRetryDelay 47 | maxRetryDelay := f.cfg.MaxRetryDelay 48 | 49 | // Retry loop: Try up to maxRetries+1 times (initial attempt + retries) 50 | for attempt := 0; attempt <= maxRetries; attempt++ { 51 | 52 | // --- Context Check --- 53 | // Check if the context has been cancelled *before* making the attempt or sleeping 54 | select { 55 | case <-ctx.Done(): 56 | reqLog.Warnf("Context cancelled before attempt %d: %v", attempt, ctx.Err()) 57 | // If context cancelled, wrap the last known error (if any) or return the context error 58 | if lastErr != nil { 59 | // Use utils.WrapErrorf if you want a consistent wrapping style 60 | return nil, fmt.Errorf("context cancelled (%v) during retry backoff after error: %w", ctx.Err(), lastErr) 61 | } 62 | return nil, fmt.Errorf("context cancelled before first attempt: %w", ctx.Err()) 63 | default: 64 | // Context is still active, proceed with the attempt 65 | } 66 | 67 | // --- Exponential Backoff Delay --- 68 | // Apply delay only *before* retry attempts (not before the first attempt) 69 | if attempt > 0 { 70 | // Calculate delay: initial * 2^(attempt-1), capped by maxRetryDelay 71 | backoff := float64(initialRetryDelay) * math.Pow(2, float64(attempt-1)) 72 | delay := time.Duration(backoff) 73 | if delay <= 0 || delay > maxRetryDelay { // Handle zero/negative initial delay or cap exceeding max 74 | delay = maxRetryDelay 75 | } 76 | 77 | // Add jitter: +/- 10% of the calculated delay to help avoid thundering herd 78 | // Ensure delay calculation doesn't lead to negative division in rand.Int63n 79 | var jitter time.Duration 80 | if delay > 0 { 81 | jitter = time.Duration(rand.Int63n(int64(delay)/5)) - (delay / 10) // +/- 10% range is delay/5 wide centered at 0 82 | } 83 | finalDelay := delay + jitter 84 | if finalDelay < 0 { // Ensure final delay isn't negative 85 | finalDelay = 0 86 | } 87 | 88 | reqLog.WithFields(logrus.Fields{"attempt": attempt, "max_retries": maxRetries, "delay": finalDelay}).Warn("Retrying request...") 89 | 90 | // Wait for the calculated delay, but respect context cancellation during the wait 91 | sleepCtx, sleepCancel := context.WithCancel(ctx) // Cancel-only child of ctx: its Done channel fires only if ctx itself is cancelled, not simply because finalDelay has elapsed 92 | select { 93 | case <-time.After(finalDelay): 94 | // Sleep completed normally 95 | case <-sleepCtx.Done(): 96 | // Context was cancelled *during* the sleep 97 | sleepCancel() // Clean up the sleep context timer 98 | reqLog.Warnf("Context cancelled during retry sleep: %v", sleepCtx.Err()) 99 | // Return the error from the *previous* attempt, wrapped with context info 100 | if lastErr != nil { 101 | return nil, fmt.Errorf("context cancelled (%v) during retry delay after error: %w",
sleepCtx.Err(), lastErr) 102 | } 103 | return nil, fmt.Errorf("context cancelled during retry delay: %w", sleepCtx.Err()) 104 | } 105 | sleepCancel() // Clean up sleep context if time.After completed 106 | } 107 | 108 | // --- Perform HTTP Request --- 109 | // Attach the current context to the request for this attempt 110 | reqWithCtx := req.WithContext(ctx) 111 | // Execute the request using the underlying HTTP client 112 | currentResp, lastErr = f.client.Do(reqWithCtx) 113 | 114 | // --- Handle Network-Level Errors --- 115 | // Errors occurring before getting an HTTP response (DNS, TCP, TLS errors etc.) 116 | if lastErr != nil { 117 | // Check specifically for context cancellation/timeout during the HTTP call itself 118 | if errors.Is(lastErr, context.Canceled) || errors.Is(lastErr, context.DeadlineExceeded) { 119 | reqLog.Warnf("Context cancelled/timed out during HTTP request execution: %v", lastErr) 120 | // Ensure response body (if partially received) is closed 121 | if currentResp != nil { 122 | io.Copy(io.Discard, currentResp.Body) 123 | currentResp.Body.Close() 124 | } 125 | // Do not retry context errors. Return the context error directly 126 | return nil, lastErr 127 | } 128 | 129 | // Check for URL errors - these are typically not retryable 130 | var urlErr *url.Error 131 | if errors.As(lastErr, &urlErr) { 132 | // Could add more specific checks here if needed, e.g., urlErr.Temporary() 133 | // For now, most url.Error types resulting from Do() are network-related, so we retry 134 | } 135 | 136 | // Log other network errors and proceed to retry 137 | reqLog.WithField("attempt", attempt).Errorf("Network error: %v", lastErr) 138 | // Ensure body is closed if a response object exists despite the error 139 | if currentResp != nil { 140 | io.Copy(io.Discard, currentResp.Body) 141 | currentResp.Body.Close() 142 | } 143 | continue // Go to the next retry attempt for network errors 144 | } 145 | 146 | // --- Handle HTTP Status Codes --- 147 | // If lastErr is nil, we received an HTTP response - Check its status code 148 | statusCode := currentResp.StatusCode 149 | resLog := reqLog.WithFields(logrus.Fields{"status_code": statusCode, "status": currentResp.Status, "attempt": attempt}) 150 | 151 | switch { 152 | case statusCode >= 200 && statusCode < 300: 153 | // Success (2xx)! Return the response immediately - Caller must close body 154 | resLog.Debug("Successfully fetched") 155 | return currentResp, nil 156 | 157 | case statusCode >= 500: 158 | // Server Error (5xx). 
These are potentially transient, so retry 159 | resLog.Warn("Server error, retrying...") 160 | // Store the error for this attempt, wrapped with a sentinel type 161 | lastErr = fmt.Errorf("%w: status %d %s", utils.ErrServerHTTPError, statusCode, currentResp.Status) 162 | // Must drain and close the body before the next retry attempt 163 | io.Copy(io.Discard, currentResp.Body) 164 | currentResp.Body.Close() 165 | continue // Go to the next retry attempt 166 | 167 | case statusCode == http.StatusTooManyRequests: // Specifically handle 429 168 | // Rate limited by the server; Retry according to policy 169 | // Future enhancement: Parse Retry-After header for smarter delay 170 | resLog.Warn("Received 429 Too Many Requests, retrying...") 171 | lastErr = fmt.Errorf("%w: status %d %s", utils.ErrClientHTTPError, statusCode, currentResp.Status) // Categorize as Client error for now 172 | io.Copy(io.Discard, currentResp.Body) 173 | currentResp.Body.Close() 174 | continue // Go to the next retry attempt 175 | 176 | case statusCode >= 400 && statusCode < 500: 177 | // Other Client Errors (4xx, excluding 429). These are generally not retryable (e.g., 404 Not Found, 403 Forbidden) 178 | resLog.Warn("Client error (4xx), not retrying") 179 | // Return the response object (caller might want to inspect headers/body) 180 | // along with a wrapped error indicating a non-retryable client error 181 | // *** Caller MUST close currentResp.Body in this case *** 182 | return currentResp, fmt.Errorf("%w: status %d %s", utils.ErrClientHTTPError, statusCode, currentResp.Status) 183 | 184 | default: 185 | // Other non-2xx statuses (e.g., 3xx if redirects were disabled, or other unexpected codes) 186 | // Treat these as non-retryable. 187 | resLog.Warnf("Non-retryable/unexpected status: %d", statusCode) 188 | // Return the response and a wrapped error 189 | // *** Caller MUST close currentResp.Body in this case *** 190 | return currentResp, fmt.Errorf("%w: status %d %s", utils.ErrOtherHTTPError, statusCode, currentResp.Status) 191 | } 192 | } 193 | 194 | // --- All Retries Failed --- 195 | // If the loop completes, all attempts (initial + retries) have failed 196 | reqLog.Errorf("All %d fetch retries failed. 
Last error: %v", maxRetries+1, lastErr) 197 | // Ensure the body of the *last* response (if any) is closed 198 | if currentResp != nil { 199 | io.Copy(io.Discard, currentResp.Body) 200 | currentResp.Body.Close() 201 | } 202 | 203 | // Wrap the *very last error* encountered (could be network error, 5xx, 429, or context error from last sleep) 204 | if lastErr != nil { 205 | // Check if the loop terminated because the context was cancelled during the *final* backoff sleep 206 | if errors.Is(lastErr, context.Canceled) || errors.Is(lastErr, context.DeadlineExceeded) { 207 | return nil, lastErr // Return the context error directly 208 | } 209 | // Otherwise, wrap the last HTTP/network error with the ErrRetryFailed sentinel 210 | return nil, fmt.Errorf("%w: %w", utils.ErrRetryFailed, lastErr) 211 | } 212 | 213 | // This case should be theoretically unreachable if maxRetries >= 0, but return a generic ErrRetryFailed if lastErr was somehow nil after the loop 214 | return nil, utils.ErrRetryFailed 215 | } 216 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2025 Sriram Periannan 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /pkg/process/content.go: -------------------------------------------------------------------------------- 1 | package process 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/url" 7 | "os" 8 | "path" 9 | "path/filepath" 10 | "strings" 11 | 12 | md "github.com/JohannesKaufmann/html-to-markdown" 13 | "github.com/PuerkitoBio/goquery" 14 | "github.com/sirupsen/logrus" 15 | 16 | "doc-scraper/pkg/config" 17 | "doc-scraper/pkg/utils" 18 | ) 19 | 20 | // ContentProcessor handles extracting, cleaning, processing (images, links), converting to Markdown, and saving of page content 21 | type ContentProcessor struct { 22 | imgProcessor *ImageProcessor 23 | log *logrus.Logger 24 | appCfg config.AppConfig 25 | } 26 | 27 | // NewContentProcessor creates a ContentProcessor 28 | func NewContentProcessor(imgProcessor *ImageProcessor, appCfg config.AppConfig, log *logrus.Logger) *ContentProcessor { 29 | return &ContentProcessor{ 30 | imgProcessor: imgProcessor, 31 | appCfg: appCfg, 32 | log: log, 33 | } 34 | } 35 | 36 | // ExtractProcessAndSaveContent extracts content using siteCfg.ContentSelector, processes images and internal links within that content, converts it to Markdown, and saves it to a path derived from finalURL and siteOutputDir 37 | // Returns the extracted page title and any critical error encountered during processing or saving 38 | func (cp *ContentProcessor) ExtractProcessAndSaveContent( 39 | doc *goquery.Document, // Parsed document of the fetched page 40 | finalURL *url.URL, // Final URL after redirects 41 | siteCfg config.SiteConfig, // Site-specific configuration 42 | siteOutputDir string, // Base output directory for this site 43 | taskLog *logrus.Entry, // Logger with task-specific context 44 | ctx context.Context, // Context for cancellation propagation 45 | ) (pageTitle string, savedFilePath string, err error) { 46 | taskLog.Debug("Extracting, processing, and saving content...") 47 | 48 | pageTitle = strings.TrimSpace(doc.Find("title").First().Text()) 49 | if pageTitle == "" { 50 | pageTitle = "Untitled Page" 51 | } 52 | taskLog = taskLog.WithField("page_title", pageTitle) 53 | 54 | mainContentSelection := doc.Find(siteCfg.ContentSelector) 55 | if mainContentSelection.Length() == 0 { 56 | err = fmt.Errorf("%w: selector '%s' not found on page '%s'", utils.ErrContentSelector, siteCfg.ContentSelector, finalURL.String()) 57 | taskLog.Warn(err.Error()) 58 | return pageTitle, "", err 59 | } 60 | // Clone selection to modify images/links without affecting original doc needed for link extraction 61 | mainContent := mainContentSelection.First().Clone() 62 | taskLog.Debugf("Found main content using selector '%s'", siteCfg.ContentSelector) 63 | 64 | currentPageFullOutputPath, pageInScope := cp.getOutputPathForURL(finalURL, siteCfg, siteOutputDir) 65 | if !pageInScope { 66 | err = fmt.Errorf("%w: output path calculation failed unexpectedly for in-scope URL '%s'", utils.ErrScopeViolation, finalURL.String()) 67 | taskLog.Error(err) 68 | return pageTitle, "", err 69 | } 70 | 71 | currentPageOutputDir := filepath.Dir(currentPageFullOutputPath) 72 | 73 | // ProcessImages finds images, attempts downloads/checks cache via workers, and sets 'data-crawl-status' attribute on mainContent tags 74 | imageMap, _ := cp.imgProcessor.ProcessImages(mainContent, finalURL, siteCfg, siteOutputDir, taskLog, ctx) 75 | // Ignore non-fatal image processing errors (already logged by ImageProcessor) 76 | 77 | // Rewrite or remove img 
tags based on the status set by ProcessImages 78 | imgRewriteCount, imgRemoveCount, imgSkippedCount := 0, 0, 0 79 | mainContent.Find("img").Each(func(i int, element *goquery.Selection) { 80 | status, _ := element.Attr("data-crawl-status") 81 | originalSrc, srcExists := element.Attr("src") 82 | element.RemoveAttr("data-crawl-status") // Cleanup attribute 83 | 84 | switch status { 85 | case "success", "pending-download": // Check map for actual result 86 | if !srcExists { // Should not happen if status was set 87 | element.Remove() 88 | imgRemoveCount++ 89 | taskLog.Warnf("Image status '%s' but missing src. Removing.", status) 90 | return 91 | } 92 | absImgURL, resolveErr := finalURL.Parse(originalSrc) 93 | if resolveErr != nil { // Should not happen if parsed before 94 | element.Remove() 95 | imgRemoveCount++ 96 | taskLog.Warnf("Could not re-parse original src '%s'. Removing tag. Error: %v", originalSrc, resolveErr) 97 | return 98 | } 99 | 100 | if imgData, ok := imageMap[absImgURL.String()]; ok && imgData.LocalPath != "" { 101 | 102 | // 1. Construct the absolute path to the saved image file 103 | absoluteImagePath := filepath.Join(siteOutputDir, imgData.LocalPath) 104 | // 2. Calculate the path relative from the current MD file's directory to the image file 105 | relativeImagePath, relErr := filepath.Rel(currentPageOutputDir, absoluteImagePath) 106 | if relErr != nil { 107 | taskLog.Warnf("Could not calculate relative image path from '%s' to '%s' for src '%s': %v. Removing image tag.", currentPageOutputDir, absoluteImagePath, originalSrc, relErr) 108 | element.Remove() 109 | imgRemoveCount++ 110 | return 111 | } 112 | 113 | // 3. Use the calculated relative path (ensure forward slashes for web/markdown) 114 | finalImageSrc := filepath.ToSlash(relativeImagePath) 115 | // Rewrite src to local path and update alt from caption 116 | element.SetAttr("src", finalImageSrc) 117 | if imgData.Caption != "" { 118 | element.SetAttr("alt", imgData.Caption) 119 | } else { 120 | element.RemoveAttr("alt") 121 | } 122 | imgRewriteCount++ 123 | } else { 124 | // Download/lookup failed 125 | element.Remove() 126 | imgRemoveCount++ 127 | taskLog.Debugf("Removing image tag for failed download/lookup: src='%s' (Status: %s)", originalSrc, status) 128 | } 129 | case "error-parse", "error-normalize", "error-db", "error-filesystem": 130 | // Fatal error during initial checks 131 | element.Remove() 132 | imgRemoveCount++ 133 | taskLog.Debugf("Removing image tag due to fatal error: src='%s' (Status: %s)", originalSrc, status) 134 | case "skipped-config", "skipped-empty-src", "skipped-data-uri", "skipped-scheme", "skipped-domain", "skipped-robots": 135 | // Non-fatal skip, leave original tag 136 | imgSkippedCount++ 137 | taskLog.Debugf("Leaving skipped image tag: src='%s' (Status: %s)", originalSrc, status) 138 | default: 139 | // Unknown or unexpected status, leave tag 140 | imgSkippedCount++ 141 | taskLog.Warnf("Image tag with unexpected status '%s': src='%s'. 
Leaving tag.", status, originalSrc) 142 | } 143 | }) 144 | taskLog.Debugf("Image handling complete: Rewrote %d, Removed %d, Left Skipped %d.", imgRewriteCount, imgRemoveCount, imgSkippedCount) 145 | 146 | // Rewrite internal links to relative markdown paths 147 | _, linkRewriteErr := cp.rewriteInternalLinks(mainContent, finalURL, currentPageFullOutputPath, siteCfg, siteOutputDir, taskLog) 148 | if linkRewriteErr != nil { 149 | // Log non-fatal link rewriting errors 150 | taskLog.Warnf("Non-fatal error during internal link rewriting: %v", linkRewriteErr) 151 | } 152 | 153 | // Convert the processed content to Markdown 154 | modifiedHTML, outerHtmlErr := goquery.OuterHtml(mainContent) 155 | if outerHtmlErr != nil { 156 | err = fmt.Errorf("failed getting modified HTML: %w", outerHtmlErr) 157 | taskLog.Error(err) 158 | return pageTitle, "", err 159 | } 160 | 161 | converter := md.NewConverter("", true, nil) 162 | markdownContent, convertErr := converter.ConvertString(modifiedHTML) 163 | if convertErr != nil { 164 | err = fmt.Errorf("%w: %w", utils.ErrMarkdownConversion, convertErr) 165 | taskLog.Error(err) 166 | return pageTitle, "", err 167 | } 168 | 169 | // Save the Markdown file 170 | outputDirForFile := filepath.Dir(currentPageFullOutputPath) 171 | if mkdirErr := os.MkdirAll(outputDirForFile, 0755); mkdirErr != nil { 172 | err = fmt.Errorf("%w: creating output directory '%s': %w", utils.ErrFilesystem, outputDirForFile, mkdirErr) 173 | taskLog.Error(err) 174 | return pageTitle, "", err 175 | } 176 | 177 | writeErr := os.WriteFile(currentPageFullOutputPath, []byte(markdownContent), 0644) 178 | if writeErr != nil { 179 | err = fmt.Errorf("%w: saving markdown '%s': %w", utils.ErrFilesystem, currentPageFullOutputPath, writeErr) 180 | taskLog.Error(err) 181 | return pageTitle, "", err 182 | } 183 | 184 | taskLog.Infof("Saved Markdown (%d bytes): %s", len(markdownContent), currentPageFullOutputPath) 185 | taskLog.Debug("Content extraction, processing, and saving complete.") 186 | return pageTitle, currentPageFullOutputPath, nil // Success 187 | } 188 | 189 | // getOutputPathForURL calculates the local filesystem path for a crawled URL, performing scope checks and mapping URLs to sanitized file/directory structures 190 | // Returns the absolute output path and true if the URL is in scope, otherwise empty path and false 191 | func (cp *ContentProcessor) getOutputPathForURL(targetURL *url.URL, siteCfg config.SiteConfig, siteOutputDir string) (string, bool) { 192 | // Scope checks: scheme, domain, path prefix 193 | if (targetURL.Scheme != "http" && targetURL.Scheme != "https") || 194 | targetURL.Hostname() != siteCfg.AllowedDomain { 195 | return "", false 196 | } 197 | targetPath := targetURL.Path 198 | if targetPath == "" { 199 | targetPath = "/" // Treat root URL as "/" 200 | } 201 | if !strings.HasPrefix(targetPath, siteCfg.AllowedPathPrefix) { 202 | return "", false 203 | } 204 | 205 | outputFilename := "index.md" // Default for directory-like URLs 206 | outputSubDir := siteOutputDir 207 | 208 | normalizedPath := strings.TrimSuffix(targetPath, "/") 209 | if normalizedPath == "" { 210 | normalizedPath = "/" 211 | } 212 | relativePath := strings.TrimPrefix(normalizedPath, siteCfg.AllowedPathPrefix) 213 | relativePath = strings.TrimPrefix(relativePath, "/") 214 | 215 | if relativePath != "" { 216 | baseName := path.Base(relativePath) 217 | dirPart := path.Dir(relativePath) 218 | ext := path.Ext(baseName) 219 | 220 | // Determine if path looks like a file (has extension) or directory 221 | if ext != 
"" && len(ext) > 1 { // File-like URL 222 | outputFilename = utils.SanitizeFilename(strings.TrimSuffix(baseName, ext)) + ".md" 223 | if dirPart != "" && dirPart != "." { 224 | var sanitizedDirParts []string 225 | for _, part := range strings.Split(dirPart, "/") { 226 | if part != "" { 227 | sanitizedDirParts = append(sanitizedDirParts, utils.SanitizeFilename(part)) 228 | } 229 | } 230 | if len(sanitizedDirParts) > 0 { 231 | outputSubDir = filepath.Join(siteOutputDir, filepath.Join(sanitizedDirParts...)) 232 | } 233 | } 234 | } else { // Directory-like URL 235 | // Use index.md; create subdirs based on full relative path 236 | var sanitizedDirParts []string 237 | for _, part := range strings.Split(relativePath, "/") { 238 | if part != "" { 239 | sanitizedDirParts = append(sanitizedDirParts, utils.SanitizeFilename(part)) 240 | } 241 | } 242 | if len(sanitizedDirParts) > 0 { 243 | outputSubDir = filepath.Join(siteOutputDir, filepath.Join(sanitizedDirParts...)) 244 | } 245 | } 246 | } 247 | 248 | fullPath := filepath.Join(outputSubDir, outputFilename) 249 | return fullPath, true 250 | } 251 | 252 | // rewriteInternalLinks modifies href attributes of anchor tags within mainContent 253 | // It converts links pointing within the crawl scope to relative filesystem paths 254 | // Returns the number of links rewritten and the first non-fatal error encountered 255 | func (cp *ContentProcessor) rewriteInternalLinks( 256 | mainContent *goquery.Selection, // The content selection to modify 257 | finalURL *url.URL, // Base URL for resolving relative hrefs 258 | currentPageFullOutputPath string, // Filesystem path of the current MD file 259 | siteCfg config.SiteConfig, // For scope checking linked URLs 260 | siteOutputDir string, // For calculating target MD paths 261 | taskLog *logrus.Entry, 262 | ) (rewriteCount int, err error) { 263 | taskLog.Debug("Rewriting internal links...") 264 | rewriteCount = 0 265 | var firstError error = nil 266 | 267 | currentPageOutputDir := filepath.Dir(currentPageFullOutputPath) 268 | 269 | mainContent.Find("a[href]").Each(func(index int, element *goquery.Selection) { 270 | href, exists := element.Attr("href") 271 | if !exists || href == "" { 272 | return 273 | } 274 | 275 | // Skip fragments, external links (mailto:, tel:, http:), protocol-relative (//), javascript: 276 | if strings.HasPrefix(href, "#") || strings.Contains(href, ":") || strings.HasPrefix(href, "//") { 277 | return 278 | } 279 | 280 | linkURL, parseErr := finalURL.Parse(href) 281 | if parseErr != nil { 282 | taskLog.Warnf("Skipping rewrite for unparseable link href '%s': %v", href, parseErr) 283 | if firstError == nil { 284 | firstError = parseErr 285 | } 286 | return 287 | } 288 | 289 | // Check if the linked URL is within crawl scope and get its potential output path 290 | targetOutputPath, isInScope := cp.getOutputPathForURL(linkURL, siteCfg, siteOutputDir) 291 | if !isInScope { 292 | return // Leave external or out-of-scope links unmodified 293 | } 294 | 295 | // Calculate the relative path from the current file's directory to the target file 296 | relativePath, relErr := filepath.Rel(currentPageOutputDir, targetOutputPath) 297 | if relErr != nil { 298 | taskLog.Warnf("Could not calculate relative path from '%s' to '%s' for link '%s': %v. 
Keeping original.", currentPageOutputDir, targetOutputPath, href, relErr) 299 | if firstError == nil { 300 | firstError = relErr 301 | } 302 | return 303 | } 304 | 305 | // Use forward slashes and preserve fragment 306 | relativePath = filepath.ToSlash(relativePath) 307 | if linkURL.Fragment != "" { 308 | relativePath += "#" + linkURL.Fragment 309 | } 310 | 311 | element.SetAttr("href", relativePath) 312 | rewriteCount++ 313 | }) 314 | 315 | taskLog.Debugf("Rewrote %d internal links.", rewriteCount) 316 | return rewriteCount, firstError 317 | } 318 | -------------------------------------------------------------------------------- /pkg/sitemap/processor.go: -------------------------------------------------------------------------------- 1 | package sitemap 2 | 3 | import ( 4 | "context" 5 | "encoding/xml" 6 | "errors" 7 | "io" 8 | "net/http" 9 | "net/url" 10 | "regexp" 11 | "runtime/debug" 12 | "strings" 13 | "sync" 14 | "time" 15 | 16 | "github.com/sirupsen/logrus" 17 | "golang.org/x/sync/semaphore" 18 | 19 | "doc-scraper/pkg/config" 20 | "doc-scraper/pkg/fetch" 21 | "doc-scraper/pkg/models" 22 | "doc-scraper/pkg/parse" 23 | "doc-scraper/pkg/queue" 24 | "doc-scraper/pkg/storage" 25 | ) 26 | 27 | // SitemapProcessor handles fetching, parsing, and processing sitemaps 28 | type SitemapProcessor struct { 29 | sitemapQueue chan string // Channel to receive sitemap URLs to process 30 | pq *queue.ThreadSafePriorityQueue // Main priority queue for page URLs 31 | store storage.VisitedStore // To mark pages visited 32 | fetcher *fetch.Fetcher // For fetching sitemaps 33 | rateLimiter *fetch.RateLimiter // For rate limiting fetches 34 | globalSemaphore *semaphore.Weighted // Global request limit 35 | compiledDisallowedPatterns []*regexp.Regexp // For filtering URLs found in sitemaps 36 | siteCfg config.SiteConfig // Need for scope checks 37 | appCfg config.AppConfig // Need for UA, timeouts 38 | log *logrus.Entry 39 | wg *sync.WaitGroup // Main crawler waitgroup 40 | sitemapsProcessed map[string]bool // Track sitemaps submitted to this processor 41 | sitemapsProcessedMu sync.Mutex // Mutex for the processed map 42 | } 43 | 44 | // NewSitemapProcessor creates a new SitemapProcessor 45 | func NewSitemapProcessor( 46 | sitemapQueue chan string, 47 | pq *queue.ThreadSafePriorityQueue, 48 | store storage.VisitedStore, 49 | fetcher *fetch.Fetcher, 50 | rateLimiter *fetch.RateLimiter, 51 | globalSemaphore *semaphore.Weighted, 52 | compiledDisallowedPatterns []*regexp.Regexp, 53 | siteCfg config.SiteConfig, 54 | appCfg config.AppConfig, 55 | log *logrus.Logger, 56 | wg *sync.WaitGroup, 57 | ) *SitemapProcessor { 58 | return &SitemapProcessor{ 59 | sitemapQueue: sitemapQueue, 60 | pq: pq, 61 | store: store, 62 | fetcher: fetcher, 63 | rateLimiter: rateLimiter, 64 | globalSemaphore: globalSemaphore, 65 | compiledDisallowedPatterns: compiledDisallowedPatterns, 66 | siteCfg: siteCfg, 67 | appCfg: appCfg, 68 | log: log.WithField("component", "sitemap_processor"), 69 | wg: wg, 70 | sitemapsProcessed: make(map[string]bool), // Initialize map 71 | } 72 | } 73 | 74 | // Start runs the sitemap processing loop in a goroutine 75 | func (sp *SitemapProcessor) Start(ctx context.Context) { 76 | sp.log.Info("Sitemap processing goroutine starting.") 77 | go sp.run(ctx) 78 | } 79 | 80 | // MarkSitemapProcessed records that a sitemap URL has been queued for processing 81 | // Returns true if it was newly marked, false if already marked 82 | func (sp *SitemapProcessor) MarkSitemapProcessed(sitemapURL string) bool { 83 | 
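// Intended call pattern, sketched from how run() uses this method below (variable
// names are illustrative): only the caller that sees `true` should account for the
// task and enqueue it, and it must undo both steps if the enqueue fails.
//
//	if sp.MarkSitemapProcessed(smURL) {
//	    sp.wg.Add(1)                  // account for the sitemap task before queueing
//	    // send smURL to sp.sitemapQueue; if the send is cancelled or times out,
//	    // delete the map entry again and call sp.wg.Done() to keep the WaitGroup balanced
//	}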
sp.sitemapsProcessedMu.Lock() 84 | defer sp.sitemapsProcessedMu.Unlock() 85 | if !sp.sitemapsProcessed[sitemapURL] { 86 | sp.sitemapsProcessed[sitemapURL] = true 87 | return true 88 | } 89 | return false 90 | } 91 | 92 | // run is the main processing loop 93 | func (sp *SitemapProcessor) run(ctx context.Context) { 94 | var sitemapProcessingWg sync.WaitGroup // Tracks active sitemap downloads/parses within this processor 95 | 96 | defer func() { 97 | sp.log.Info("Waiting for active sitemap processing tasks to finish before final exit...") 98 | sitemapProcessingWg.Wait() 99 | sp.log.Info("Sitemap processing goroutine finished waiting and exiting.") 100 | }() 101 | 102 | userAgent := sp.appCfg.DefaultUserAgent // Use default UA for sitemaps 103 | semTimeout := sp.appCfg.SemaphoreAcquireTimeout 104 | 105 | for { 106 | select { 107 | case <-ctx.Done(): // Check if the main crawl context has been cancelled 108 | sp.log.Warnf("Context cancelled, stopping sitemap processing: %v", ctx.Err()) 109 | return 110 | 111 | case sitemapURL, ok := <-sp.sitemapQueue: // Try to receive a URL from the channel 112 | if !ok { 113 | sp.log.Info("Sitemap queue channel closed.") 114 | return 115 | } 116 | 117 | // --- Received a URL, Launch a Goroutine to Process It --- 118 | sitemapProcessingWg.Add(1) // Increment local WaitGroup for the task we are about to launch 119 | go func(smURL string) { // Launch goroutine for concurrent processing 120 | // Ensure both WaitGroups are decremented when this goroutine finishes 121 | defer func() { 122 | sp.wg.Done() // Decrement the main crawler WaitGroup (incremented when queued) 123 | sitemapProcessingWg.Done() // Decrement this processor's WaitGroup 124 | }() 125 | 126 | // --- Panic Recovery for this sitemap task --- 127 | defer func() { 128 | if r := recover(); r != nil { 129 | stackTrace := string(debug.Stack()) 130 | sp.log.WithFields(logrus.Fields{ 131 | "sitemap_url": smURL, 132 | "panic_info": r, 133 | "stack_trace": stackTrace, 134 | }).Error("PANIC Recovered in sitemap processing goroutine") 135 | } 136 | }() 137 | 138 | sitemapLog := sp.log.WithField("sitemap_url", smURL) 139 | sitemapLog.Info("Processing sitemap") 140 | 141 | // --- Fetching Logic --- 142 | parsedSitemapURL, err := url.Parse(smURL) 143 | if err != nil { 144 | sitemapLog.Errorf("Failed parse URL: %v", err) 145 | return // Stop processing this invalid URL 146 | } 147 | sitemapHost := parsedSitemapURL.Hostname() 148 | 149 | // --- Acquire Global Semaphore (respecting context) --- 150 | ctxG, cancelG := context.WithTimeout(ctx, semTimeout) // Derive timeout from main context 151 | err = sp.globalSemaphore.Acquire(ctxG, 1) 152 | cancelG() // Release resources associated with the timed context 153 | if err != nil { 154 | // Check if the error was due to the main context being cancelled 155 | if errors.Is(err, context.DeadlineExceeded) && ctx.Err() != nil { 156 | sitemapLog.Warnf("Could not acquire GLOBAL semaphore due to main context cancellation: %v", ctx.Err()) 157 | } else if errors.Is(err, context.DeadlineExceeded) { 158 | sitemapLog.Errorf("Timeout acquiring GLOBAL semaphore: %v", err) 159 | } else { 160 | sitemapLog.Errorf("Error acquiring GLOBAL semaphore: %v", err) 161 | } 162 | return // Stop processing this sitemap if semaphore not acquired 163 | } 164 | defer sp.globalSemaphore.Release(1) // Ensure release on exit 165 | 166 | // --- Apply Rate Limit --- 167 | sp.rateLimiter.ApplyDelay(sitemapHost, sp.appCfg.DefaultDelayPerHost) 168 | 169 | // --- Create Request (with context) --- 170 | req, 
err := http.NewRequestWithContext(ctx, "GET", smURL, nil) 171 | if err != nil { 172 | sitemapLog.Errorf("Req Create error: %v", err) 173 | return 174 | } 175 | req.Header.Set("User-Agent", userAgent) 176 | 177 | // --- Fetch Request (with context and retries) --- 178 | resp, fetchErr := sp.fetcher.FetchWithRetry(req, ctx) 179 | sp.rateLimiter.UpdateLastRequestTime(sitemapHost) // Update after attempt 180 | 181 | if fetchErr != nil { 182 | sitemapLog.Errorf("Fetch failed: %v", fetchErr) 183 | if resp != nil { 184 | io.Copy(io.Discard, resp.Body) 185 | resp.Body.Close() 186 | } 187 | return 188 | } 189 | // If fetch succeeded, resp is non-nil, 2xx status 190 | defer resp.Body.Close() // Ensure body is closed eventually 191 | 192 | // --- Read & Parse XML --- 193 | sitemapBytes, readErr := io.ReadAll(resp.Body) 194 | if readErr != nil { 195 | sitemapLog.Errorf("Read body error: %v", readErr) 196 | return 197 | } 198 | 199 | // --- Try Parsing as Sitemap Index --- 200 | var index parse.XMLSitemapIndex 201 | errIndex := xml.Unmarshal(sitemapBytes, &index) 202 | if errIndex == nil && len(index.Sitemaps) > 0 { 203 | sitemapLog.Infof("Parsed as Sitemap Index, found %d references.", len(index.Sitemaps)) 204 | queuedCount := 0 205 | for _, sitemapEntry := range index.Sitemaps { 206 | nestedSmURL := sitemapEntry.Loc 207 | nestedSmLog := sitemapLog.WithField("nested_sitemap", nestedSmURL) 208 | _, nestedErr := url.ParseRequestURI(nestedSmURL) 209 | if nestedErr != nil { 210 | nestedSmLog.Warnf("Invalid nested sitemap URL: %v", nestedErr) 211 | continue // Skip invalid URLs 212 | } 213 | 214 | // --- Queue Nested Sitemap (check if already processed) --- 215 | // MarkSitemapProcessed is thread-safe and returns true if newly marked 216 | if sp.MarkSitemapProcessed(nestedSmURL) { 217 | // It's a new sitemap for the processor, increment WG for the eventual task 218 | sp.wg.Add(1) 219 | 220 | select { 221 | case sp.sitemapQueue <- nestedSmURL: // Attempt to send 222 | queuedCount++ 223 | nestedSmLog.Debug("Successfully queued nested sitemap.") 224 | // WG remains incremented, will be decremented when task completes 225 | 226 | case <-ctx.Done(): // Check if main context is cancelled 227 | nestedSmLog.Warnf("Context cancelled while trying to queue nested sitemap '%s': %v", nestedSmURL, ctx.Err()) 228 | // --- UNDO STATE --- 229 | sp.sitemapsProcessedMu.Lock() 230 | delete(sp.sitemapsProcessed, nestedSmURL) // Remove from processed map 231 | sp.sitemapsProcessedMu.Unlock() 232 | sp.wg.Done() // Decrement main WG as task won't be processed 233 | 234 | case <-time.After(5 * time.Second): // Timeout for queue send 235 | nestedSmLog.Error("Timeout sending nested sitemap. 
Undoing WG and processed state.") 236 | // --- UNDO STATE --- 237 | sp.sitemapsProcessedMu.Lock() 238 | delete(sp.sitemapsProcessed, nestedSmURL) // Remove from processed map 239 | sp.sitemapsProcessedMu.Unlock() 240 | sp.wg.Done() // Decrement main WG as task won't be processed 241 | } 242 | } else { 243 | nestedSmLog.Debugf("Nested sitemap already processed/queued: %s", nestedSmURL) 244 | // Already marked, so no WG increment/decrement needed here 245 | } 246 | } // End loop through nested sitemap entries 247 | sitemapLog.Infof("Queued %d nested sitemaps.", queuedCount) 248 | return // Return after processing index 249 | } 250 | 251 | // --- Try Parsing as URL Set --- 252 | var urlSet parse.XMLURLSet 253 | errURLSet := xml.Unmarshal(sitemapBytes, &urlSet) 254 | if errURLSet != nil { 255 | // Only log error if it wasn't successfully parsed as an index either 256 | if errIndex != nil { 257 | sitemapLog.Errorf("Failed parse XML (Index err=%v; URLSet err=%v)", errIndex, errURLSet) 258 | } else { 259 | sitemapLog.Warnf("Content was not a valid Sitemap Index or URL Set (URLSet err=%v)", errURLSet) 260 | } 261 | return 262 | } 263 | 264 | // --- Process URL Set --- 265 | sitemapLog.Infof("Parsed as URL Set, found %d URLs.", len(urlSet.URLs)) 266 | queuedCount := 0 267 | dbErrorCount := 0 268 | for _, urlEntry := range urlSet.URLs { 269 | pageURL := urlEntry.Loc 270 | pageLastMod := urlEntry.LastMod 271 | 272 | // Log the URL and its last modified date (if present) 273 | if pageLastMod != "" { 274 | sitemapLog.Debugf("Found URL: %s (LastMod: %s)", pageURL, pageLastMod) 275 | } else { 276 | sitemapLog.Debugf("Found URL: %s (No LastMod specified)", pageURL) 277 | } 278 | 279 | // --- Scope Check Logic --- 280 | parsedPageURL, err := url.Parse(pageURL) 281 | if err != nil { 282 | sitemapLog.Warnf("Sitemap URL parse error: %v", err) 283 | continue 284 | } 285 | if parsedPageURL.Scheme != "http" && parsedPageURL.Scheme != "https" { 286 | continue 287 | } 288 | if parsedPageURL.Hostname() != sp.siteCfg.AllowedDomain { 289 | continue 290 | } 291 | targetPath := parsedPageURL.Path 292 | if targetPath == "" { 293 | targetPath = "/" 294 | } 295 | if !strings.HasPrefix(targetPath, sp.siteCfg.AllowedPathPrefix) { 296 | continue 297 | } 298 | isDisallowed := false 299 | for _, pattern := range sp.compiledDisallowedPatterns { 300 | if pattern.MatchString(parsedPageURL.Path) { 301 | isDisallowed = true 302 | break 303 | } 304 | } 305 | if isDisallowed { 306 | continue 307 | } 308 | // --- End Scope Check --- 309 | 310 | // Normalize URL 311 | normalizedPageURL, _, errNorm := parse.ParseAndNormalize(pageURL) 312 | if errNorm != nil { 313 | sitemapLog.Warnf("Sitemap URL normalize error: %v", errNorm) 314 | continue 315 | } 316 | 317 | // --- Check/Add to DB and Queue if New --- 318 | // Use store's MarkPageVisited 319 | added, visitErr := sp.store.MarkPageVisited(normalizedPageURL) 320 | if visitErr != nil { 321 | sitemapLog.Errorf("Sitemap URL DB mark error: %v", visitErr) 322 | dbErrorCount++ 323 | continue // Skip this URL if DB error occurs 324 | } 325 | 326 | if added { // If markVisited added the URL (it was new) 327 | sp.wg.Add(1) // Increment main WaitGroup 328 | // Add with Depth 0 as sitemaps don't have inherent depth 329 | sitemapWorkItem := models.WorkItem{URL: pageURL, Depth: 0} 330 | sp.pq.Add(&sitemapWorkItem) // Add to the main priority queue 331 | queuedCount++ 332 | } 333 | } // End loop through URL entries 334 | 335 | if dbErrorCount > 0 { 336 | sitemapLog.Warnf("Finished URL Set. 
Queued %d new URLs, encountered %d DB errors.", queuedCount, dbErrorCount) 337 | } else { 338 | sitemapLog.Infof("Finished URL Set. Queued %d new URLs.", queuedCount) 339 | } 340 | 341 | }(sitemapURL) // End the anonymous goroutine for processing a single sitemap 342 | } 343 | } 344 | } 345 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k= 2 | github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ= 3 | github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= 4 | github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= 5 | github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= 6 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 7 | github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= 8 | github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= 9 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 10 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 11 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 12 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 13 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 14 | github.com/dgraph-io/badger/v4 v4.7.0 h1:Q+J8HApYAY7UMpL8d9owqiB+odzEc0zn/aqOD9jhc6Y= 15 | github.com/dgraph-io/badger/v4 v4.7.0/go.mod h1:He7TzG3YBy3j4f5baj5B7Zl2XyfNe5bl4Udl0aPemVA= 16 | github.com/dgraph-io/ristretto/v2 v2.2.0 h1:bkY3XzJcXoMuELV8F+vS8kzNgicwQFAaGINAEJdWGOM= 17 | github.com/dgraph-io/ristretto/v2 v2.2.0/go.mod h1:RZrm63UmcBAaYWC1DotLYBmTvgkrs0+XhBd7Npn7/zI= 18 | github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da h1:aIftn67I1fkbMa512G+w+Pxci9hJPB8oMnkcP3iZF38= 19 | github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= 20 | github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= 21 | github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= 22 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= 23 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 24 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 25 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 26 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= 27 | github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= 28 | github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 29 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 30 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 31 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 32 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 33 | 
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 34 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 35 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 36 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 37 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 38 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 39 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 40 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 41 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 42 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 43 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 44 | github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= 45 | github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 46 | github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= 47 | github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= 48 | github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= 49 | github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= 50 | github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= 51 | github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= 52 | github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= 53 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 54 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 55 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 56 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 57 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 58 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 59 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= 60 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= 61 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 62 | github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U= 63 | github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= 64 | go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= 65 | go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= 66 | go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= 67 | go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= 68 | go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= 69 | go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= 70 | go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= 71 | go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= 72 
| golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 73 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 74 | golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= 75 | golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 76 | golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= 77 | golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= 78 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= 79 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 80 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 81 | golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 82 | golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 83 | golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 84 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 85 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 86 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 87 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 88 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 89 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= 90 | golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= 91 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 92 | golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= 93 | golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= 94 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 95 | golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= 96 | golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= 97 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 98 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 99 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 100 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= 101 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 102 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 103 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 104 | golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= 105 | golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= 106 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 107 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 108 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 109 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 110 | golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 111 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 112 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 113 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 114 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 115 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 116 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 117 | golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 118 | golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 119 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 120 | golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= 121 | golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 122 | golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= 123 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 124 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 125 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 126 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 127 | golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= 128 | golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= 129 | golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= 130 | golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= 131 | golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= 132 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= 133 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 134 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 135 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 136 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 137 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 138 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 139 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 140 | golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 141 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 142 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 143 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 144 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 145 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 146 | golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= 147 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= 
148 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 149 | google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= 150 | google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= 151 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 152 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 153 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 154 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 155 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 156 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 157 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 158 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 159 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 160 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🕸️ LLM Documentation Scraper (`doc-scraper`) 2 | 3 | [![Go Version](https://img.shields.io/github/go-mod/go-version/Sriram-PR/doc-scraper)](https://golang.org/) 4 | 5 | > A configurable, concurrent, and resumable web crawler written in Go. Specifically designed to scrape technical documentation websites, extract core content, convert it cleanly to Markdown format suitable for ingestion by Large Language Models (LLMs), and save the results locally. 6 | 7 | ## 📖 Overview 8 | 9 | This project provides a powerful command-line tool to crawl documentation sites based on settings defined in a `config.yaml` file. It navigates the site structure, extracts content from specified HTML sections using CSS selectors, and converts it into clean Markdown files. 10 | 11 | ### Why Use This Tool? 12 | 13 | - **Built for LLM Training & RAG Systems** - Creates clean, consistent Markdown optimized for ingestion 14 | - **Preserves Documentation Structure** - Maintains the original site hierarchy for context preservation 15 | - **Production-Ready Features** - Offers resumable crawls, rate limiting, and graceful error handling 16 | - **High Performance** - Uses Go's concurrency model for efficient parallel processing 17 | 18 | ## 🎯 Goal: Preparing Documentation for LLMs 19 | 20 | The main objective of this tool is to automate the often tedious process of gathering and cleaning web-based documentation for use with Large Language Models. 
By converting structured web content into clean Markdown, it aims to provide a dataset that is: 21 | 22 | * **Text-Focused:** Prioritizes the textual content extracted via CSS selectors 23 | * **Structured:** Maintains the directory hierarchy of the original documentation site, preserving context 24 | * **Cleaned:** Converts HTML to Markdown, removing web-specific markup and clutter 25 | * **Locally Accessible:** Provides the content as local files for easier processing and pipeline integration 26 | 27 | ## ✨ Key Features 28 | 29 | | Feature | Description | 30 | |---------|-------------| 31 | | **Configurable Crawling** | Uses YAML for global and site-specific settings | 32 | | **Scope Control** | Limits crawling by domain, path prefix, and disallowed path patterns (regex) | 33 | | **Content Extraction** | Extracts main content using CSS selectors | 34 | | **HTML-to-Markdown** | Converts extracted HTML to clean Markdown | 35 | | **Image Handling** | Optional downloading and local rewriting of image links with domain and size filtering | 36 | | **Link Rewriting** | Rewrites internal links to relative paths for local structure | 37 | | **URL-to-File Mapping** | Optional TSV file logging saved file paths and their corresponding original URLs | 38 | | **YAML Metadata Output** | Optional detailed YAML file per site with crawl stats and per-page metadata (including content hashes) | 39 | | **Concurrency** | Configurable worker pools and semaphore-based request limits (global and per-host) | 40 | | **Rate Limiting** | Configurable per-host delays with jitter | 41 | | **Robots.txt & Sitemaps** | Respects `robots.txt` and processes discovered sitemaps | 42 | | **State Persistence** | Uses BadgerDB for state; supports resuming crawls via `-resume` flag | 43 | | **Graceful Shutdown** | Handles `SIGINT`/`SIGTERM` with proper cleanup | 44 | | **HTTP Retries** | Exponential backoff with jitter for transient errors | 45 | | **Observability** | Structured logging (`logrus`) and optional `pprof` endpoint | 46 | | **Modular Code** | Organized into packages for clarity and maintainability | 47 | 48 | ## 🚀 Getting Started 49 | 50 | ### Prerequisites 51 | 52 | * **Go:** Version 1.21 or later 53 | * **Git:** For cloning the repository 54 | * **Disk Space:** Sufficient for storing crawled content and state database 55 | 56 | ### Installation 57 | 58 | 1. **Clone the repository:** 59 | ```bash 60 | git clone https://github.com/Sriram-PR/doc-scraper.git 61 | cd doc-scraper 62 | ``` 63 | 64 | 2. **Install Dependencies:** 65 | ```bash 66 | go mod tidy 67 | ``` 68 | 69 | 3. **Build the Binary:** 70 | ```bash 71 | go build -o crawler ./cmd/crawler/... 72 | ``` 73 | This creates an executable file named `crawler` in the project root directory. 74 | 75 | ### Quick Start 76 | 77 | 1. Create a basic `config.yaml` file (see [Configuration](#-configuration-configyaml) section) 78 | 2. Run the crawler: 79 | ```bash 80 | ./crawler -site your_site_key -loglevel info 81 | ``` 82 | 3. Find your crawled documentation in the `./crawled_docs/` directory 83 | 84 | ## ⚙️ Configuration (`config.yaml`) 85 | 86 | A `config.yaml` file is **required** to run the crawler. Create this file in the project root or specify its path using the `-config` flag.
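For orientation, here is a minimal, illustrative site entry containing only the fields marked as required further below (`start_urls`, `allowed_domain`, `allowed_path_prefix`, `content_selector`). The site key `my_docs`, the domain, and the CSS selector are placeholders, not values from this repository; substitute the details of your target site. Settings not shown fall back to their defaults (see the option reference below).

```yaml
# A minimal, illustrative config.yaml — site key, URLs, and selector are placeholders
output_base_dir: "./crawled_docs"   # where converted Markdown is written
state_dir: "./crawler_state"        # where the BadgerDB crawl state lives

sites:
  my_docs:                          # run with: ./crawler -site my_docs
    start_urls:
      - "https://docs.example.com/guide/"
    allowed_domain: "docs.example.com"
    allowed_path_prefix: "/guide/"
    content_selector: "main.article-body"   # CSS selector wrapping the main page content
```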
87 | 88 | ### Key Settings for LLM Use 89 | 90 | When configuring for LLM documentation processing, pay special attention to these settings: 91 | 92 | - `sites.<site_key>.content_selector`: Define precisely to capture only relevant text 93 | - `sites.<site_key>.allowed_domain` / `allowed_path_prefix`: Define scope accurately 94 | - `skip_images`: Set to `true` globally or per-site if images aren't needed for the LLM 95 | - Adjust concurrency/delay settings based on the target site and your resources 96 | 97 | ### Example Configuration 98 | 99 | ```yaml 100 | # Global settings (applied if not overridden by site) 101 | default_delay_per_host: 500ms 102 | num_workers: 8 103 | num_image_workers: 8 104 | max_requests: 48 105 | max_requests_per_host: 4 106 | output_base_dir: "./crawled_docs" 107 | state_dir: "./crawler_state" 108 | max_retries: 4 109 | initial_retry_delay: 1s 110 | max_retry_delay: 30s 111 | semaphore_acquire_timeout: 30s 112 | global_crawl_timeout: 0s 113 | skip_images: false # Set to true to skip images globally 114 | max_image_size_bytes: 10485760 # 10 MiB 115 | enable_output_mapping: true 116 | output_mapping_filename: "global_url_map.tsv" 117 | enable_metadata_yaml: true 118 | metadata_yaml_filename: "crawl_meta.yaml" 119 | 120 | # HTTP Client Settings 121 | http_client_settings: 122 | timeout: 45s 123 | max_idle_conns_per_host: 6 124 | 125 | # Site-specific configurations 126 | sites: 127 | # Key used with -site flag 128 | pytorch_docs: 129 | start_urls: 130 | - "https://pytorch.org/docs/stable/" 131 | allowed_domain: "pytorch.org" 132 | allowed_path_prefix: "/docs/stable/" 133 | content_selector: "article.pytorch-article .body" 134 | max_depth: 0 # 0 for unlimited depth 135 | skip_images: false 136 | # Override global mapping filename for this site 137 | output_mapping_filename: "pytorch_docs_map.txt" 138 | metadata_yaml_filename: "pytorch_metadata_output.yaml" 139 | disallowed_path_patterns: 140 | - "/docs/stable/.*/_modules/.*" 141 | - '/docs/stable/.*\.html#.*' 142 | 143 | tensorflow_docs: 144 | start_urls: 145 | - "https://www.tensorflow.org/guide" 146 | - "https://www.tensorflow.org/tutorials" 147 | allowed_domain: "www.tensorflow.org" 148 | allowed_path_prefix: "/" 149 | content_selector: ".devsite-article-body" 150 | max_depth: 0 151 | delay_per_host: 1s # Site-specific override 152 | # Disable mapping for this site, overriding global 153 | enable_output_mapping: false 154 | enable_metadata_yaml: false 155 | disallowed_path_patterns: 156 | - "/install/.*" 157 | - "/js/.*" 158 | ``` 159 | 160 | ### Full Configuration Options 161 | 162 | | Option | Type | Description | Default | 163 | |--------|------|-------------|---------| 164 | | `default_delay_per_host` | Duration | Time to wait between requests to the same host | `500ms` | 165 | | `num_workers` | Integer | Number of concurrent crawl workers | `8` | 166 | | `num_image_workers` | Integer | Number of concurrent image download workers | `8` | 167 | | `max_requests` | Integer | Maximum concurrent requests (global) | `48` | 168 | | `max_requests_per_host` | Integer | Maximum concurrent requests per host | `4` | 169 | | `output_base_dir` | String | Base directory for crawled content | `"./crawled_docs"` | 170 | | `state_dir` | String | Directory for BadgerDB state data | `"./crawler_state"` | 171 | | `max_retries` | Integer | Maximum retry attempts for HTTP requests | `4` | 172 | | `initial_retry_delay` | Duration | Initial delay for retry backoff | `1s` | 173 | | `max_retry_delay` | Duration | Maximum delay for retry backoff | `30s` |
174 | | `skip_images` | Boolean | Whether to skip downloading images | `false` | 175 | | `max_image_size_bytes` | Integer | Maximum allowed image size | `10485760` (10 MiB) | 176 | | `enable_output_mapping` | Boolean | Enable URL-to-file mapping log | `false` | 177 | | `output_mapping_filename` | String | Filename for the URL-to-file mapping log | `"url_to_file_map.tsv"` (if enabled and not set) | 178 | | `enable_metadata_yaml` | Boolean | Enable detailed YAML metadata output file | `false` | 179 | | `metadata_yaml_filename` | String | Filename for the YAML metadata output file | `"metadata.yaml"` (if enabled & not set) | 180 | | `http_client_settings` | Object | HTTP client configuration | *(see below)* | 181 | | `sites` | Map | Site-specific configurations | *(required)* | 182 | 183 | **HTTP Client Settings:** 184 | *(These are global and cannot be overridden per site in the current structure)* 185 | - `timeout`: Overall request timeout (Default in code: `45s`) 186 | - `max_idle_conns`: Total idle connections (Default in code: `100`) 187 | - `max_idle_conns_per_host`: Idle connections per host (Default in code: `2`) 188 | - `idle_conn_timeout`: Timeout for idle connections (Default in code: `90s`) 189 | - `tls_handshake_timeout`: TLS handshake timeout (Default in code: `10s`) 190 | - `expect_continue_timeout`: "100 Continue" timeout (Default in code: `1s`) 191 | - `force_attempt_http2`: `null` (use Go default), `true`, or `false`. (Default in code: `null`) 192 | - `dialer_timeout`: TCP connection timeout (Default in code: `15s`) 193 | - `dialer_keep_alive`: TCP keep-alive interval (Default in code: `30s`) 194 | 195 | **Site-Specific Configuration Options:** 196 | - `start_urls`: Array of starting URLs for crawling (Required) 197 | - `allowed_domain`: Restrict crawling to this domain (Required) 198 | - `allowed_path_prefix`: Further restrict crawling to URLs with this prefix (Required) 199 | - `content_selector`: CSS selector for main content extraction (Required) 200 | - `max_depth`: Maximum crawl depth from start URLs (0 = unlimited) 201 | - `delay_per_host`: Override global delay setting for this site 202 | - `disallowed_path_patterns`: Array of regex patterns for URLs to skip 203 | - `skip_images`: Override global image setting for this site 204 | - `allowed_image_domains`: Array of domains from which to download images 205 | - `enable_output_mapping`: `true` or `false`. Override global URL-to-file mapping enablement for this site. 206 | - `output_mapping_filename`: String. Override global URL-to-file mapping filename for this site. 207 | - `enable_metadata_yaml`: `true` or `false`. Override global YAML metadata output enablement for this site. 208 | - `metadata_yaml_filename`: String. Override global YAML metadata filename for this site. 209 | 210 | ## 🛠️ Usage 211 | 212 | Execute the compiled binary from the project root directory: 213 | 214 | ```bash 215 | ./crawler -config <path_to_config> -site <site_key> [flags...]
216 | ``` 217 | 218 | ### Command-Line Flags 219 | 220 | | Flag | Description | Default | 221 | |------|-------------|---------| 222 | | `-config <path>` | Path to config file | `config.yaml` | 223 | | `-site <site_key>` | **Required.** Key identifying the site config entry | - | 224 | | `-loglevel <level>` | Logging level (`trace`, `debug`, `info`, `warn`, `error`, `fatal`) | `info` | 225 | | `-resume` | Attempt to resume using the existing state database | `false` | 226 | | `-write-visited-log` | Output a list of visited URLs after the crawl | `false` | 227 | | `-pprof <addr>` | Enable pprof endpoint (empty to disable) | `localhost:6060` | 228 | 229 | ### Example Usage Scenarios 230 | 231 | **Basic Crawl:** 232 | ```bash 233 | ./crawler -site tensorflow_docs -loglevel info 234 | ``` 235 | 236 | **Resume a Large Crawl:** 237 | ```bash 238 | ./crawler -site pytorch_docs -resume -loglevel info 239 | ``` 240 | 241 | **High Performance Crawl with Profiling:** 242 | ```bash 243 | ./crawler -site small_docs -loglevel warn -pprof localhost:6060 244 | ``` 245 | 246 | **Debug Mode for Troubleshooting:** 247 | ```bash 248 | ./crawler -site test_site -loglevel debug 249 | ``` 250 | 251 | ## 📁 Output Structure 252 | 253 | Crawled content is saved under the `output_base_dir` defined in the config, organized by domain and preserving the site structure: 254 | 255 | ``` 256 | <output_base_dir>/ 257 | └── <allowed_domain>/ # e.g., docs.example.com 258 | ├── images/ # Only present if skip_images: false 259 | │ ├── image1.png 260 | │ └── image2.jpg 261 | ├── index.md # Markdown for the root path 262 | ├── <page_one>.md 263 | ├── <page_two>.md 264 | ├── topic_one/ 265 | │ ├── index.md 266 | │ └── subtopic_a.md 267 | └── topic_two.md 268 | └── ... (files/dirs mirroring site structure) 269 | ``` 270 | 271 | ### Output Format 272 | 273 | Each generated Markdown file contains: 274 | - Original page title as level-1 heading 275 | - Clean content converted from HTML to Markdown 276 | - Relative links to other pages (when within the allowed domain) 277 | - Local image references (if images are enabled) 278 | - A footer with metadata including source URL and crawl timestamp 279 | 280 | ## 🔍 Directory Structure Output 281 | 282 | After a successful crawl for a specific site, the crawler automatically generates a text file named `<sanitized_domain>_structure.txt` within the global `output_base_dir` (alongside the site's content folder). This file contains a visual tree representation of the generated directory structure for the crawled site, which can be helpful for verification and analysis. 283 | 284 | **Example Location:** 285 | If `output_base_dir` is `./crawled_docs` and you crawled `docs.example.com`, the structure file will be: 286 | `./crawled_docs/docs.example.com_structure.txt` 287 | 288 | ## 🗺️ URL-to-File Mapping Output (URL Map) 289 | 290 | When enabled via configuration, the crawler generates a mapping file (typically a `.tsv` or `.txt` file) for each crawled site. This file logs each successfully processed page's final absolute URL and the corresponding local filesystem path where its content was saved. 291 | 292 | **Format:** 293 | Each line in the file typically follows a tab-separated format: 294 | `<final_page_url>\t<local_file_path>` 295 | 296 | This feature is controlled by the `enable_output_mapping` and `output_mapping_filename` settings in `config.yaml`. 297 | 298 | ## 📋 YAML Metadata Output 299 | 300 | In addition to (or instead of) the simple TSV mapping, the crawler can generate a comprehensive YAML file for each crawled site.
This file (`metadata.yaml` by default, configurable) contains overall crawl statistics and detailed metadata for every successfully processed page. 301 | 302 | The filename can be configured globally and overridden per site using `enable_metadata_yaml` and `metadata_yaml_filename` in `config.yaml`. 303 | 304 | ## 🤝 Contributing 305 | 306 | Contributions are welcome! Please feel free to open an issue to discuss bugs, suggest features, or propose changes. 307 | 308 | **Pull Request Process:** 309 | 1. Fork the repository 310 | 2. Create a feature branch (`git checkout -b feature/amazing-feature`) 311 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`) 312 | 4. Push to the branch (`git push origin feature/amazing-feature`) 313 | 5. Open a Pull Request 314 | 315 | Please ensure code adheres to Go best practices and includes appropriate documentation. 316 | 317 | ## 📝 License 318 | 319 | This project is licensed under the [Apache-2.0 License](https://github.com/Sriram-PR/doc-scraper/blob/main/LICENSE.txt). 320 | 321 | ## 🙏 Acknowledgements 322 | 323 | - [GoQuery](https://github.com/PuerkitoBio/goquery) for HTML parsing 324 | - [html-to-markdown](https://github.com/JohannesKaufmann/html-to-markdown) for conversion 325 | - [BadgerDB](https://github.com/dgraph-io/badger) for state persistence 326 | - [Logrus](https://github.com/sirupsen/logrus) for structured logging 327 | 328 | --- 329 | 330 | *Made with ❤️ for the LLM and Machine Learning community* -------------------------------------------------------------------------------- /cmd/crawler/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "flag" 7 | "fmt" 8 | "net/http" 9 | _ "net/http/pprof" 10 | "os" 11 | "os/signal" 12 | "path/filepath" 13 | "runtime" 14 | "strings" 15 | "syscall" 16 | "time" 17 | 18 | "github.com/sirupsen/logrus" 19 | "gopkg.in/yaml.v3" 20 | 21 | "doc-scraper/pkg/config" 22 | "doc-scraper/pkg/crawler" 23 | "doc-scraper/pkg/fetch" 24 | "doc-scraper/pkg/storage" 25 | "doc-scraper/pkg/utils" 26 | ) 27 | 28 | func main() { 29 | // --- Set profiling rates ---= 30 | runtime.SetBlockProfileRate(1000) 31 | runtime.SetMutexProfileFraction(1000) 32 | 33 | // --- Early Initialization & Flags --- 34 | log := logrus.New() 35 | log.SetFormatter(&logrus.TextFormatter{FullTimestamp: true, TimestampFormat: "15:04:05.000"}) 36 | log.SetLevel(logrus.InfoLevel) // Default level 37 | 38 | configFileFlag := flag.String("config", "config.yaml", "Path to YAML config file") 39 | siteKeyFlag := flag.String("site", "", "Site key from config file (required)") 40 | logLevelFlag := flag.String("loglevel", "info", "Log level (debug, info, warn, error, fatal)") 41 | resumeFlag := flag.Bool("resume", false, "Resume crawl using existing state DB") 42 | writeVisitedLogFlag := flag.Bool("write-visited-log", false, "Write a final log file of all visited URLs from DB") 43 | pprofAddr := flag.String("pprof", "localhost:6060", "Address for pprof HTTP server (e.g., ':6060', empty to disable)") 44 | flag.Parse() 45 | 46 | // --- Logger Configuration --- 47 | level, err := logrus.ParseLevel(*logLevelFlag) 48 | if err != nil { 49 | log.Warnf("Invalid log level '%s', using default 'info'. 
Error: %v", *logLevelFlag, err) 50 | } else { 51 | log.SetLevel(level) 52 | log.Infof("Setting log level to: %s", level.String()) 53 | } 54 | 55 | // --- Load Application Configuration --- 56 | log.Infof("Loading configuration from %s", *configFileFlag) 57 | yamlFile, err := os.ReadFile(*configFileFlag) 58 | if err != nil { 59 | log.Fatalf("Read config file '%s' error: %v", *configFileFlag, err) 60 | } 61 | var appCfg config.AppConfig 62 | err = yaml.Unmarshal(yamlFile, &appCfg) 63 | if err != nil { 64 | log.Fatalf("Parse config file '%s' error: %v", *configFileFlag, err) 65 | } 66 | 67 | // --- Validate Global App Configuration --- 68 | validateAppConfig(&appCfg, log) // Pass by pointer to modify defaults 69 | 70 | // Log effective global config 71 | logAppConfig(&appCfg, log) 72 | 73 | // --- Select and Validate Site-Specific Configuration --- 74 | if *siteKeyFlag == "" { 75 | log.Fatalf("Error: -site flag is required.") 76 | } 77 | siteCfg, ok := appCfg.Sites[*siteKeyFlag] 78 | if !ok { 79 | log.Fatalf("Error: Site key '%s' not found in config file '%s'", *siteKeyFlag, *configFileFlag) 80 | } 81 | // Validate Site Config (Keep validation here or move to config package) 82 | if err := validateSiteConfig(&siteCfg, log); err != nil { 83 | log.Fatalf("Site '%s' configuration error: %v", *siteKeyFlag, err) 84 | } 85 | log.Infof("Site Config for '%s': Domain: %s, Prefix: %s, ContentSel: '%s', ...", 86 | *siteKeyFlag, siteCfg.AllowedDomain, siteCfg.AllowedPathPrefix, siteCfg.ContentSelector) 87 | 88 | // --- Start pprof HTTP Server (Optional) --- 89 | if *pprofAddr != "" { 90 | go func() { 91 | log.Infof("Starting pprof HTTP server on: http://%s/debug/pprof/", *pprofAddr) 92 | if err := http.ListenAndServe(*pprofAddr, nil); err != nil { 93 | log.Errorf("Pprof server failed to start on %s: %v", *pprofAddr, err) 94 | } 95 | }() 96 | } else { 97 | log.Info("Pprof server disabled (no -pprof address provided).") 98 | } 99 | 100 | // =========================================================== 101 | // == Setup Global Context & Signal Handling == 102 | // =========================================================== 103 | var crawlCtx context.Context 104 | var cancelCrawl context.CancelFunc 105 | 106 | if appCfg.GlobalCrawlTimeout > 0 { 107 | log.Infof("Setting global crawl timeout: %v", appCfg.GlobalCrawlTimeout) 108 | crawlCtx, cancelCrawl = context.WithTimeout(context.Background(), appCfg.GlobalCrawlTimeout) 109 | } else { 110 | log.Info("No global crawl timeout set.") 111 | crawlCtx, cancelCrawl = context.WithCancel(context.Background()) 112 | } 113 | // Defer cancel() *very early* to ensure it's called on any exit path 114 | defer cancelCrawl() 115 | 116 | // Channel to listen for OS signals for graceful shutdown 117 | sigChan := make(chan os.Signal, 1) 118 | signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) 119 | 120 | // Goroutine to handle signals -> cancel context -> force exit on second signal 121 | go func() { 122 | sig := <-sigChan 123 | log.Warnf("Received signal: %v. Initiating graceful shutdown...", sig) 124 | cancelCrawl() // Trigger shutdown via context cancellation 125 | 126 | // Allow force exit on second signal or timeout 127 | select { 128 | case sig = <-sigChan: 129 | log.Warnf("Received second signal: %v. Forcing exit.", sig) 130 | os.Exit(1) 131 | case <-time.After(30 * time.Second): // Graceful shutdown timeout 132 | log.Warn("Graceful shutdown period exceeded after signal. 
Forcing exit.") 133 | os.Exit(1) 134 | } 135 | }() 136 | // Stop signal handling explicitly *before* main exits normally 137 | defer signal.Stop(sigChan) 138 | 139 | // =========================================================== 140 | // == Initialize Components == 141 | // =========================================================== 142 | log.Info("Initializing components...") 143 | 144 | // --- Storage --- 145 | store, err := storage.NewBadgerStore(crawlCtx, appCfg.StateDir, siteCfg.AllowedDomain, *resumeFlag, log) 146 | if err != nil { 147 | log.Fatalf("Failed to initialize visited DB: %v", err) 148 | } 149 | defer store.Close() // Ensure DB is closed on exit 150 | 151 | // Start DB GC goroutine 152 | go store.RunGC(crawlCtx, 10*time.Minute) // Pass context for cancellation 153 | 154 | // --- HTTP Fetching Components --- 155 | httpClient := fetch.NewClient(appCfg.HTTPClientSettings, log) 156 | fetcher := fetch.NewFetcher(httpClient, appCfg, log) 157 | rateLimiter := fetch.NewRateLimiter(appCfg.DefaultDelayPerHost, log) 158 | 159 | // --- Crawler Instance --- 160 | crawlerInstance, err := crawler.NewCrawler( 161 | appCfg, 162 | siteCfg, 163 | *siteKeyFlag, 164 | log, 165 | store, // Inject store 166 | fetcher, // Inject fetcher 167 | rateLimiter, // Inject rate limiter 168 | crawlCtx, // Pass context 169 | cancelCrawl, // Pass cancel func 170 | *resumeFlag, 171 | ) 172 | if err != nil { 173 | log.Fatalf("Failed to initialize crawler: %v", err) 174 | } 175 | 176 | // =========================================================== 177 | // == Start Crawler Execution == 178 | // =========================================================== 179 | err = crawlerInstance.Run(*resumeFlag) // Run the crawl 180 | 181 | // =========================================================== 182 | // == Post-Crawl Actions == 183 | // =========================================================== 184 | 185 | // --- Generate Directory Structure File (Only on Success) --- 186 | if err == nil { 187 | log.Info("Crawl completed successfully, generating directory structure file...") 188 | targetDir := filepath.Join(appCfg.OutputBaseDir, utils.SanitizeFilename(siteCfg.AllowedDomain)) 189 | outputFileName := fmt.Sprintf("%s_structure.txt", utils.SanitizeFilename(siteCfg.AllowedDomain)) 190 | outputFilePath := filepath.Join(appCfg.OutputBaseDir, outputFileName) 191 | 192 | // Call the utility function, passing the logger 193 | if treeErr := utils.GenerateAndSaveTreeStructure(targetDir, outputFilePath, log); treeErr != nil { 194 | log.Errorf("Failed to generate or save directory structure: %v", treeErr) 195 | } else { 196 | log.Infof("Successfully saved directory structure to %s", outputFilePath) 197 | } 198 | } else { 199 | log.Warnf("Skipping directory structure generation due to crawl error: %v", err) 200 | } 201 | 202 | // --- Final Visited Log File Generation (Optional) --- 203 | // Needs to run *after* crawler finishes but *before* DB is closed by defer 204 | // Check context error first, might not make sense to write log if cancelled early 205 | if crawlCtx.Err() != nil { 206 | log.Warnf("Skipping final visited log due to crawl context error: %v", crawlCtx.Err()) 207 | } else if *writeVisitedLogFlag { 208 | visitedFilename := fmt.Sprintf("%s-visited.txt", utils.SanitizeFilename(siteCfg.AllowedDomain)) 209 | visitedFilePath := filepath.Join(appCfg.OutputBaseDir, visitedFilename) 210 | if writeErr := store.WriteVisitedLog(visitedFilePath); writeErr != nil { 211 | log.Errorf("Error writing final visited log: %v", writeErr) 
212 | } 213 | } else { 214 | log.Info("Skipping final visited URL log file generation.") 215 | } 216 | 217 | // --- Exit --- 218 | // Check the error returned by the crawler run 219 | if err != nil { 220 | if errors.Is(err, context.Canceled) { 221 | log.Warn("Crawl cancelled gracefully.") 222 | // os.Exit(1) 223 | os.Exit(0) 224 | } else if errors.Is(err, context.DeadlineExceeded) { 225 | log.Error("Crawl timed out (global timeout).") 226 | os.Exit(1) 227 | } else { 228 | log.Errorf("Crawl finished with error: %v", err) 229 | os.Exit(1) 230 | } 231 | } 232 | 233 | log.Info("Crawl completed successfully.") 234 | os.Exit(0) 235 | } 236 | 237 | // validateAppConfig checks global config values and sets defaults 238 | // Operates on a pointer to modify the struct directly 239 | func validateAppConfig(appCfg *config.AppConfig, log *logrus.Logger) { 240 | if appCfg.NumWorkers <= 0 { 241 | log.Warnf("num_workers should be > 0, defaulting to 4") 242 | appCfg.NumWorkers = 4 243 | } 244 | if appCfg.NumImageWorkers <= 0 { 245 | log.Warnf("num_image_workers not specified or invalid, defaulting to num_workers (%d)", appCfg.NumWorkers) 246 | appCfg.NumImageWorkers = appCfg.NumWorkers 247 | } 248 | if appCfg.MaxRequests <= 0 { 249 | log.Warnf("max_requests should be > 0, defaulting to 10") 250 | appCfg.MaxRequests = 10 251 | } 252 | if appCfg.MaxRequestsPerHost <= 0 { 253 | log.Warnf("max_requests_per_host should be > 0, defaulting to 2") 254 | appCfg.MaxRequestsPerHost = 2 255 | } 256 | if appCfg.OutputBaseDir == "" { 257 | log.Warn("output_base_dir is empty, defaulting to './crawled_docs'") 258 | appCfg.OutputBaseDir = "./crawled_docs" 259 | } 260 | if appCfg.StateDir == "" { 261 | log.Warn("state_dir is empty, defaulting to './crawler_state'") 262 | appCfg.StateDir = "./crawler_state" 263 | } 264 | if appCfg.MaxRetries < 0 { // Allow 0 retries 265 | log.Warnf("max_retries cannot be negative, setting to 0") 266 | appCfg.MaxRetries = 0 267 | } else if appCfg.MaxRetries == 0 && appCfg.InitialRetryDelay == 0 { 268 | // Set default only if not explicitly set 269 | appCfg.MaxRetries = 3 // Default from original 270 | } 271 | 272 | if appCfg.InitialRetryDelay <= 0 { 273 | if appCfg.MaxRetries > 0 { // Only default delay if retries are enabled 274 | appCfg.InitialRetryDelay = 1 * time.Second 275 | } 276 | } 277 | if appCfg.MaxRetryDelay <= 0 { 278 | if appCfg.MaxRetries > 0 { 279 | appCfg.MaxRetryDelay = 30 * time.Second 280 | } 281 | } 282 | if appCfg.InitialRetryDelay > appCfg.MaxRetryDelay && appCfg.MaxRetryDelay > 0 { 283 | log.Warnf("initial_retry_delay (%v) > max_retry_delay (%v), using max_retry_delay for initial", appCfg.InitialRetryDelay, appCfg.MaxRetryDelay) 284 | appCfg.InitialRetryDelay = appCfg.MaxRetryDelay 285 | } 286 | if appCfg.SemaphoreAcquireTimeout <= 0 { 287 | appCfg.SemaphoreAcquireTimeout = 30 * time.Second 288 | } 289 | if appCfg.GlobalCrawlTimeout < 0 { 290 | log.Warnf("global_crawl_timeout cannot be negative, disabling timeout") 291 | appCfg.GlobalCrawlTimeout = 0 // 0 means disabled 292 | } 293 | if appCfg.MaxImageSizeBytes < 0 { 294 | log.Warnf("max_image_size_bytes cannot be negative, setting to 0 (unlimited)") 295 | appCfg.MaxImageSizeBytes = 0 // 0 means unlimited 296 | } 297 | 298 | // Validate HTTP Client Settings 299 | if appCfg.HTTPClientSettings.Timeout <= 0 { 300 | appCfg.HTTPClientSettings.Timeout = 45 * time.Second 301 | } 302 | if appCfg.HTTPClientSettings.MaxIdleConns <= 0 { 303 | appCfg.HTTPClientSettings.MaxIdleConns = 100 304 | } 305 | if 
appCfg.HTTPClientSettings.MaxIdleConnsPerHost <= 0 { 306 | appCfg.HTTPClientSettings.MaxIdleConnsPerHost = 2 307 | } 308 | if appCfg.HTTPClientSettings.IdleConnTimeout <= 0 { 309 | appCfg.HTTPClientSettings.IdleConnTimeout = 90 * time.Second 310 | } 311 | if appCfg.HTTPClientSettings.TLSHandshakeTimeout <= 0 { 312 | appCfg.HTTPClientSettings.TLSHandshakeTimeout = 10 * time.Second 313 | } 314 | if appCfg.HTTPClientSettings.ExpectContinueTimeout <= 0 { 315 | appCfg.HTTPClientSettings.ExpectContinueTimeout = 1 * time.Second 316 | } 317 | if appCfg.HTTPClientSettings.DialerTimeout <= 0 { 318 | appCfg.HTTPClientSettings.DialerTimeout = 15 * time.Second 319 | } 320 | if appCfg.HTTPClientSettings.DialerKeepAlive <= 0 { 321 | appCfg.HTTPClientSettings.DialerKeepAlive = 30 * time.Second 322 | } 323 | 324 | if appCfg.EnableOutputMapping && appCfg.OutputMappingFilename == "" { 325 | log.Warnf("Global 'enable_output_mapping' is true but global 'output_mapping_filename' is empty. Defaulting global filename to 'url_to_file_map.tsv'") 326 | appCfg.OutputMappingFilename = "url_to_file_map.tsv" 327 | } 328 | if appCfg.EnableMetadataYAML && appCfg.MetadataYAMLFilename == "" { 329 | log.Warnf("Global 'enable_metadata_yaml' is true but global 'metadata_yaml_filename' is empty. Defaulting to 'metadata.yaml'") 330 | appCfg.MetadataYAMLFilename = "metadata.yaml" 331 | } 332 | } 333 | 334 | // logAppConfig logs the effective global configuration 335 | func logAppConfig(appCfg *config.AppConfig, log *logrus.Logger) { 336 | log.Infof("Global Config: Workers:%d, ImageWorkers:%d, MaxReqs:%d, MaxReqPerHost:%d", 337 | appCfg.NumWorkers, appCfg.NumImageWorkers, appCfg.MaxRequests, appCfg.MaxRequestsPerHost) 338 | log.Infof("Global Config: DefaultDelay:%v, StateDir:%s, OutputDir:%s", 339 | appCfg.DefaultDelayPerHost, appCfg.StateDir, appCfg.OutputBaseDir) 340 | log.Infof("Global Config Retries: Max:%d, InitialDelay:%v, MaxDelay:%v", 341 | appCfg.MaxRetries, appCfg.InitialRetryDelay, appCfg.MaxRetryDelay) 342 | log.Infof("Global Config Timeouts: SemaphoreAcquire:%v, GlobalCrawl:%v", 343 | appCfg.SemaphoreAcquireTimeout, appCfg.GlobalCrawlTimeout) 344 | log.Infof("Global Config Images: Skip:%t, MaxSize:%d bytes", 345 | appCfg.SkipImages, appCfg.MaxImageSizeBytes) 346 | log.Infof("Global Config HTTP Client: Timeout:%v, MaxIdle:%d, MaxIdlePerHost:%d, IdleTimeout:%v, TLSTimeout:%v, DialerTimeout:%v", 347 | appCfg.HTTPClientSettings.Timeout, appCfg.HTTPClientSettings.MaxIdleConns, appCfg.HTTPClientSettings.MaxIdleConnsPerHost, 348 | appCfg.HTTPClientSettings.IdleConnTimeout, appCfg.HTTPClientSettings.TLSHandshakeTimeout, appCfg.HTTPClientSettings.DialerTimeout) 349 | log.Infof("Global Config Output Mapping: Enabled Globally:%t, Default Global Filename:'%s'", 350 | appCfg.EnableOutputMapping, appCfg.OutputMappingFilename) 351 | log.Infof("Global Config YAML Metadata: Enabled Globally:%t, Default Global Filename:'%s'", 352 | appCfg.EnableMetadataYAML, appCfg.MetadataYAMLFilename) 353 | } 354 | 355 | // validateSiteConfig checks site-specific config - Operates on pointer to modify prefix 356 | func validateSiteConfig(siteCfg *config.SiteConfig, log *logrus.Logger) error { 357 | if len(siteCfg.StartURLs) == 0 { 358 | return fmt.Errorf("%w: site has no start_urls", utils.ErrConfigValidation) 359 | } 360 | if siteCfg.AllowedDomain == "" { 361 | return fmt.Errorf("%w: site needs allowed_domain", utils.ErrConfigValidation) 362 | } 363 | // Normalize path prefix 364 | if siteCfg.AllowedPathPrefix == "" { 365 | siteCfg.AllowedPathPrefix 
= "/" 366 | } else if !strings.HasPrefix(siteCfg.AllowedPathPrefix, "/") { 367 | siteCfg.AllowedPathPrefix = "/" + siteCfg.AllowedPathPrefix 368 | } 369 | if siteCfg.ContentSelector == "" { 370 | return fmt.Errorf("%w: site needs content_selector", utils.ErrConfigValidation) 371 | } 372 | // Check integer/duration fields for sanity if needed 373 | if siteCfg.MaxDepth < 0 { 374 | log.Warnf("Site MaxDepth cannot be negative, setting to 0 (unlimited)") 375 | siteCfg.MaxDepth = 0 376 | } 377 | if siteCfg.MaxImageSizeBytes != nil && *siteCfg.MaxImageSizeBytes < 0 { 378 | log.Warnf("Site MaxImageSizeBytes cannot be negative, setting to 0 (unlimited override)") 379 | *siteCfg.MaxImageSizeBytes = 0 380 | } 381 | 382 | // Note: Start URL validation (format, scope) happens in Crawler.Run 383 | return nil 384 | } 385 | -------------------------------------------------------------------------------- /pkg/storage/badger_store.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "context" 7 | "encoding/json" 8 | "errors" 9 | "fmt" 10 | "os" 11 | "path/filepath" 12 | "runtime" 13 | "time" 14 | 15 | badger "github.com/dgraph-io/badger/v4" 16 | "github.com/sirupsen/logrus" 17 | 18 | "doc-scraper/pkg/log" 19 | "doc-scraper/pkg/models" 20 | "doc-scraper/pkg/utils" 21 | ) 22 | 23 | const ( 24 | pageKeyPrefix = "page:" // Prefix for page URL keys in DB 25 | imageKeyPrefix = "img:" // Prefix for image URL keys in DB 26 | visitedDBDir = "visited_db" // Subdirectory name within stateDir for Badger DB files 27 | ) 28 | 29 | // BadgerStore implements the VisitedStore interface using BadgerDB 30 | type BadgerStore struct { 31 | db *badger.DB 32 | log *logrus.Logger 33 | ctx context.Context // Parent context 34 | } 35 | 36 | // NewBadgerStore initializes and returns a new BadgerStore 37 | func NewBadgerStore(ctx context.Context, stateDir, siteDomain string, resume bool, logger *logrus.Logger) (*BadgerStore, error) { 38 | store := &BadgerStore{ 39 | log: logger, 40 | ctx: ctx, 41 | } 42 | 43 | // Create a unique directory path for this site's DB within the base state directory 44 | dbDirName := utils.SanitizeFilename(siteDomain) + "_" + visitedDBDir // Use sanitize func 45 | dbPath := filepath.Join(stateDir, dbDirName) 46 | 47 | if !resume { 48 | logger.Warnf("Resume flag is false. REMOVING existing state directory: %s", dbPath) 49 | if err := os.RemoveAll(dbPath); err != nil { 50 | // Log error but attempt to continue; Badger might recover or create new files 51 | logger.Errorf("Failed to remove existing state directory %s: %v", dbPath, err) 52 | } 53 | } 54 | 55 | logger.Infof("Initializing visited URL database at: %s (Resume: %v)", dbPath, resume) 56 | 57 | if err := os.MkdirAll(dbPath, 0755); err != nil { 58 | return nil, fmt.Errorf("cannot create state directory %s: %w", dbPath, err) 59 | } 60 | 61 | // Configure Badger options 62 | badgerLogger := log.NewBadgerLogrusAdapter(logger.WithField("component", "badgerdb")) 63 | opts := badger.DefaultOptions(dbPath). 64 | WithLogger(badgerLogger). 
// Use custom logrus adapter 65 | WithNumVersionsToKeep(1) // Only keep the latest state (visited or not) 66 | 67 | // Open the database 68 | var err error 69 | store.db, err = badger.Open(opts) 70 | if err != nil { 71 | return nil, fmt.Errorf("failed to open badger database at %s: %w", dbPath, err) 72 | } 73 | 74 | logger.Info("Visited URL database initialized successfully.") 75 | return store, nil 76 | } 77 | 78 | // MarkPageVisited implements the VisitedStore interface 79 | func (s *BadgerStore) MarkPageVisited(normalizedPageURL string) (bool, error) { 80 | if s.db == nil { 81 | return false, errors.New("visitedDB not initialized") 82 | } 83 | added := false 84 | key := []byte(pageKeyPrefix + normalizedPageURL) 85 | 86 | err := s.db.Update(func(txn *badger.Txn) error { 87 | _, errGet := txn.Get(key) 88 | if errors.Is(errGet, badger.ErrKeyNotFound) { 89 | // Key doesn't exist, add it with an empty value. 90 | e := badger.NewEntry(key, []byte{}) 91 | errSet := txn.SetEntry(e) 92 | if errSet == nil { 93 | added = true 94 | } 95 | return errSet 96 | } 97 | // Key already exists or another error occurred 98 | return errGet // Return the original error (could be nil if key exists) 99 | }) 100 | 101 | if err != nil { 102 | s.log.WithField("key", string(key)).Errorf("DB Update error in MarkPageVisited: %v", err) 103 | return false, fmt.Errorf("%w: marking page key '%s': %w", utils.ErrDatabase, string(key), err) 104 | } 105 | 106 | return added, nil 107 | } 108 | 109 | // CheckPageStatus implements the VisitedStore interface 110 | func (s *BadgerStore) CheckPageStatus(normalizedPageURL string) (string, *models.PageDBEntry, error) { 111 | status := "not_found" 112 | var entry *models.PageDBEntry = nil 113 | key := []byte(pageKeyPrefix + normalizedPageURL) 114 | 115 | errView := s.db.View(func(txn *badger.Txn) error { 116 | item, errGet := txn.Get(key) 117 | if errors.Is(errGet, badger.ErrKeyNotFound) { 118 | status = "not_found" // Explicitly set status 119 | return nil // Key not found is not an error for this function's purpose 120 | } 121 | if errGet != nil { 122 | return fmt.Errorf("%w: failed getting page key '%s': %w", utils.ErrDatabase, string(key), errGet) 123 | } 124 | 125 | // Key found, now get the value 126 | return item.Value(func(val []byte) error { 127 | if len(val) == 0 { 128 | status = "pending" // Key exists but has no data yet 129 | // s.log.Debugf("Page key '%s' found with empty value, status: pending", string(key)) 130 | return nil 131 | } 132 | 133 | // Value is not empty, try to decode 134 | var decodedEntry models.PageDBEntry 135 | if errJson := json.Unmarshal(val, &decodedEntry); errJson != nil { 136 | s.log.Warnf("Failed to unmarshal PageDBEntry for key '%s': %v. Treating as 'pending'.", string(key), errJson) 137 | status = "pending" // Treat unmarshal error as pending state? Or db_error? Let's stick to pending. 
138 | return nil // Return nil to continue View, status is set 139 | } 140 | 141 | // Successfully decoded 142 | entry = &decodedEntry 143 | status = decodedEntry.Status 144 | s.log.Debugf("Page key '%s' found, decoded status: %s", string(key), status) 145 | return nil 146 | }) 147 | }) 148 | 149 | if errView != nil { 150 | s.log.Errorf("DB View error in CheckPageStatus for key '%s': %v", string(key), errView) 151 | status = "db_error" // Set status to indicate DB error 152 | return status, nil, errView // Return the DB error 153 | } 154 | 155 | // No DB error occurred during View/Get/Value 156 | return status, entry, nil 157 | } 158 | 159 | // UpdatePageStatus implements the VisitedStore interface 160 | func (s *BadgerStore) UpdatePageStatus(normalizedPageURL string, entry *models.PageDBEntry) error { 161 | if s.db == nil { 162 | return errors.New("visitedDB not initialized") 163 | } 164 | key := []byte(pageKeyPrefix + normalizedPageURL) 165 | 166 | entryBytes, errJson := json.Marshal(entry) 167 | if errJson != nil { 168 | wrappedErr := fmt.Errorf("%w: failed to marshal PageDBEntry for key '%s': %w", utils.ErrParsing, string(key), errJson) 169 | s.log.Error(wrappedErr) 170 | return wrappedErr 171 | } 172 | 173 | err := s.db.Update(func(txn *badger.Txn) error { 174 | e := badger.NewEntry(key, entryBytes) 175 | return txn.SetEntry(e) 176 | }) 177 | 178 | if err != nil { 179 | s.log.WithField("key", string(key)).Errorf("DB Update error in UpdatePageStatus: %v", err) 180 | return fmt.Errorf("%w: failed setting page status for key '%s': %w", utils.ErrDatabase, string(key), err) 181 | } 182 | 183 | s.log.Debugf("Successfully updated page status for key '%s' to '%s'", string(key), entry.Status) 184 | return nil 185 | } 186 | 187 | // CheckImageStatus implements the VisitedStore interface 188 | func (s *BadgerStore) CheckImageStatus(normalizedImgURL string) (string, *models.ImageDBEntry, error) { 189 | status := "not_found" 190 | var entry *models.ImageDBEntry = nil 191 | key := []byte(imageKeyPrefix + normalizedImgURL) 192 | 193 | errView := s.db.View(func(txn *badger.Txn) error { 194 | item, errGet := txn.Get(key) 195 | if errors.Is(errGet, badger.ErrKeyNotFound) { 196 | status = "not_found" 197 | return nil 198 | } 199 | if errGet != nil { 200 | return fmt.Errorf("%w: failed getting image key '%s': %w", utils.ErrDatabase, string(key), errGet) 201 | } 202 | 203 | return item.Value(func(val []byte) error { 204 | // Image entries should never be empty if written correctly 205 | if len(val) == 0 { 206 | s.log.Warnf("Image key '%s' found with empty value, invalid state. Treating as 'not_found'.", string(key)) 207 | status = "not_found" 208 | return nil 209 | } 210 | 211 | var decodedEntry models.ImageDBEntry 212 | if errJson := json.Unmarshal(val, &decodedEntry); errJson != nil { 213 | s.log.Warnf("Failed to unmarshal ImageDBEntry for key '%s': %v. 
Treating as 'not_found'.", string(key), errJson) 214 | status = "not_found" 215 | return nil 216 | } 217 | 218 | entry = &decodedEntry 219 | status = decodedEntry.Status 220 | return nil 221 | }) 222 | }) 223 | 224 | if errView != nil { 225 | s.log.Errorf("DB View error in CheckImageStatus for key '%s': %v", string(key), errView) 226 | status = "db_error" 227 | return status, nil, errView 228 | } 229 | 230 | return status, entry, nil 231 | } 232 | 233 | // UpdateImageStatus implements the VisitedStore interface 234 | func (s *BadgerStore) UpdateImageStatus(normalizedImgURL string, entry *models.ImageDBEntry) error { 235 | if s.db == nil { 236 | return errors.New("visitedDB not initialized") 237 | } 238 | key := []byte(imageKeyPrefix + normalizedImgURL) 239 | 240 | entryBytes, errJson := json.Marshal(entry) 241 | if errJson != nil { 242 | wrappedErr := fmt.Errorf("%w: failed to marshal ImageDBEntry for key '%s': %w", utils.ErrParsing, string(key), errJson) 243 | s.log.Error(wrappedErr) 244 | return wrappedErr 245 | } 246 | 247 | err := s.db.Update(func(txn *badger.Txn) error { 248 | e := badger.NewEntry(key, entryBytes) 249 | return txn.SetEntry(e) 250 | }) 251 | 252 | if err != nil { 253 | s.log.WithField("key", string(key)).Errorf("DB Update error in UpdateImageStatus: %v", err) 254 | return fmt.Errorf("%w: failed setting image status for key '%s': %w", utils.ErrDatabase, string(key), err) 255 | } 256 | 257 | return nil 258 | } 259 | 260 | // GetVisitedCount implements the VisitedStore interface 261 | func (s *BadgerStore) GetVisitedCount() (int, error) { 262 | if s.db == nil || s.db.IsClosed() { 263 | return 0, errors.New("DB not initialized or closed") 264 | } 265 | count := 0 266 | err := s.db.View(func(txn *badger.Txn) error { 267 | opts := badger.DefaultIteratorOptions 268 | opts.PrefetchValues = false // We only need to count keys 269 | opts.PrefetchSize = 100 // Default prefetch size 270 | it := txn.NewIterator(opts) 271 | defer it.Close() 272 | for it.Rewind(); it.Valid(); it.Next() { 273 | count++ 274 | } 275 | return nil 276 | }) 277 | if err != nil { 278 | s.log.Errorf("Error counting visited items: %v", err) 279 | return -1, fmt.Errorf("counting items: %w", err) // Indicate error 280 | } 281 | return count, nil 282 | } 283 | 284 | // RunGC runs BadgerDB's garbage collection periodically 285 | func (s *BadgerStore) RunGC(ctx context.Context, interval time.Duration) { 286 | if interval <= 0 { 287 | interval = 10 * time.Minute // Default interval 288 | } 289 | ticker := time.NewTicker(interval) 290 | defer ticker.Stop() 291 | 292 | s.log.Info("BadgerDB GC goroutine started.") 293 | 294 | for { 295 | select { 296 | case <-ticker.C: 297 | // Check if DB is valid before running GC 298 | if s.db == nil || s.db.IsClosed() { 299 | s.log.Info("DB GC: Database is nil or closed, skipping GC cycle.") 300 | continue 301 | } 302 | 303 | s.log.Info("Running BadgerDB value log garbage collection...") 304 | var err error 305 | // Loop GC until it returns ErrNoRewrite or another error 306 | for { 307 | // Run GC if log is at least 50% reclaimable space 308 | err = s.db.RunValueLogGC(0.5) 309 | if err == nil { 310 | s.log.Info("BadgerDB GC cycle completed.") 311 | } else { 312 | break // Exit loop if GC finished (ErrNoRewrite) or encountered an error 313 | } 314 | } 315 | 316 | // Log outcome 317 | if errors.Is(err, badger.ErrNoRewrite) { 318 | s.log.Info("BadgerDB GC finished (no rewrite needed).") 319 | } else { 320 | s.log.Errorf("BadgerDB GC error: %v", err) 321 | } 322 | 323 | // Hint to 
Go's runtime GC after Badger finishes, potentially freeing more memory 324 | runtime.GC() 325 | 326 | case <-ctx.Done(): // Check if stop signal received via context cancellation 327 | s.log.Infof("Stopping BadgerDB garbage collection goroutine due to context cancellation: %v", ctx.Err()) 328 | return 329 | } 330 | } 331 | } 332 | 333 | // RequeueIncomplete implements the VisitedStore interface 334 | func (s *BadgerStore) RequeueIncomplete(ctx context.Context, workChan chan<- models.WorkItem) (int, int, error) { 335 | s.log.Info("Resume Mode: Scanning database for incomplete tasks to requeue...") 336 | requeuedCount := 0 337 | scanErrors := 0 338 | scanStartTime := time.Now() 339 | 340 | scanErr := s.db.View(func(txn *badger.Txn) error { 341 | opts := badger.DefaultIteratorOptions 342 | opts.PrefetchValues = true // Need values to check status 343 | it := txn.NewIterator(opts) 344 | defer it.Close() 345 | 346 | keyPrefixBytes := []byte(pageKeyPrefix) 347 | 348 | for it.Seek(keyPrefixBytes); it.ValidForPrefix(keyPrefixBytes); it.Next() { 349 | // Check context cancellation within the loop 350 | select { 351 | case <-ctx.Done(): 352 | s.log.Warnf("Resume scan interrupted by context cancellation: %v", ctx.Err()) 353 | return ctx.Err() // Stop iteration 354 | default: 355 | // Continue processing item 356 | } 357 | 358 | item := it.Item() 359 | keyBytesWithPrefix := item.KeyCopy(nil) 360 | keyBytes := keyBytesWithPrefix[len(keyPrefixBytes):] // Strip prefix 361 | urlToRequeue := string(keyBytes) 362 | 363 | errGetValue := item.Value(func(valBytes []byte) error { 364 | valCopy := make([]byte, len(valBytes)) 365 | copy(valCopy, valBytes) 366 | shouldRequeue := false 367 | requeueDepth := 0 368 | 369 | if len(valCopy) == 0 { // Case 1: Empty value (implicitly pending) 370 | s.log.Debugf("Resume Scan: Found empty value for '%s'. Requeueing (Depth 0).", urlToRequeue) 371 | shouldRequeue = true 372 | requeueDepth = 0 // Fallback depth 373 | } else { // Case 2: Decode PageDBEntry 374 | var entry models.PageDBEntry 375 | if errJson := json.Unmarshal(valCopy, &entry); errJson != nil { 376 | s.log.Errorf("Resume Scan: Failed unmarshal PageDBEntry for '%s': %v. 
Skipping.", urlToRequeue, errJson) 377 | scanErrors++ 378 | return nil // Continue iteration 379 | } 380 | // Case 3: Check status 381 | if entry.Status == "failure" || entry.Status == "pending" { 382 | s.log.Debugf("Resume Scan: Requeueing '%s' (Status: %s, Depth: %d)", urlToRequeue, entry.Status, entry.Depth) 383 | shouldRequeue = true 384 | requeueDepth = entry.Depth // Use stored depth 385 | } 386 | } 387 | 388 | if shouldRequeue { 389 | // Send to channel, respecting context cancellation 390 | select { 391 | case workChan <- models.WorkItem{URL: urlToRequeue, Depth: requeueDepth}: 392 | requeuedCount++ 393 | case <-ctx.Done(): 394 | s.log.Warnf("Resume scan interrupted while sending '%s' to queue: %v", urlToRequeue, ctx.Err()) 395 | return ctx.Err() // Stop iteration 396 | } 397 | } 398 | return nil 399 | }) 400 | 401 | if errGetValue != nil { 402 | // Check if the error was context cancellation propagated from Value func 403 | if errors.Is(errGetValue, context.Canceled) || errors.Is(errGetValue, context.DeadlineExceeded) { 404 | return errGetValue // Propagate context error to stop iteration 405 | } 406 | s.log.Errorf("Resume Scan: Error getting value for key '%s': %v", urlToRequeue, errGetValue) 407 | scanErrors++ 408 | // Decide whether to continue or stop on other value errors 409 | // return errGetValue // Optionally stop iteration on error 410 | } 411 | } 412 | return nil 413 | }) 414 | 415 | durationScan := time.Since(scanStartTime) 416 | if scanErr != nil && !(errors.Is(scanErr, context.Canceled) || errors.Is(scanErr, context.DeadlineExceeded)) { 417 | // Log scan error only if it wasn't a context cancellation 418 | s.log.Errorf("Error during DB scan for resume: %v.", scanErr) 419 | } 420 | s.log.Infof("Resume Scan Complete: Requeued %d tasks in %v. Errors: %d.", requeuedCount, durationScan, scanErrors) 421 | 422 | // Check if the error was cancellation, otherwise return the scan error 423 | if errors.Is(scanErr, context.Canceled) || errors.Is(scanErr, context.DeadlineExceeded) { 424 | return requeuedCount, scanErrors, scanErr // Return context error 425 | } 426 | return requeuedCount, scanErrors, scanErr // Return potential DB error 427 | } 428 | 429 | // WriteVisitedLog implements the VisitedStore interface. 
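// The log contains one normalized URL per line, covering both pages and images; the
// internal "page:" and "img:" key prefixes are stripped before writing. The writer is
// flushed every 5000 entries and the scan stops early if the store's context is cancelled.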
430 | func (s *BadgerStore) WriteVisitedLog(filePath string) error { 431 | s.log.Info("Writing list of visited page and image URLs (from DB)...") 432 | file, err := os.Create(filePath) 433 | if err != nil { 434 | s.log.Errorf("Failed create visited log '%s': %v", filePath, err) 435 | return fmt.Errorf("create visited log '%s': %w", filePath, err) 436 | } 437 | defer file.Close() // Ensure file is closed 438 | 439 | writer := bufio.NewWriter(file) 440 | s.log.Info("Iterating visited DB to write log file...") 441 | var dbErr error 442 | writtenCount := 0 443 | 444 | iterErr := s.db.View(func(txn *badger.Txn) error { 445 | opts := badger.DefaultIteratorOptions 446 | opts.PrefetchValues = false 447 | it := txn.NewIterator(opts) 448 | defer it.Close() 449 | pagePrefixBytes := []byte(pageKeyPrefix) 450 | imgPrefixBytes := []byte(imageKeyPrefix) 451 | 452 | for it.Rewind(); it.Valid(); it.Next() { 453 | // Check context cancellation within the loop 454 | select { 455 | case <-s.ctx.Done(): 456 | s.log.Warnf("WriteVisitedLog scan interrupted by context cancellation: %v", s.ctx.Err()) 457 | return s.ctx.Err() // Stop iteration 458 | default: 459 | // Continue processing item 460 | } 461 | 462 | item := it.Item() 463 | keyBytesWithPrefix := item.KeyCopy(nil) // Copy key with prefix 464 | var keyToWrite string 465 | // Check prefix and strip 466 | if bytes.HasPrefix(keyBytesWithPrefix, pagePrefixBytes) { 467 | keyToWrite = string(keyBytesWithPrefix[len(pagePrefixBytes):]) 468 | } else if bytes.HasPrefix(keyBytesWithPrefix, imgPrefixBytes) { 469 | keyToWrite = string(keyBytesWithPrefix[len(imgPrefixBytes):]) 470 | } else { 471 | s.log.Warnf("Skipping unexpected key in DB (no page/img prefix): %s", string(keyBytesWithPrefix)) 472 | continue // Skip keys without expected prefixes 473 | } 474 | 475 | _, writeErr := writer.WriteString(keyToWrite + "\n") // Write stripped key 476 | if writeErr != nil { 477 | if dbErr == nil { // Store first write error 478 | dbErr = writeErr 479 | } 480 | s.log.Errorf("Error writing URL '%s' to visited log: %v", keyToWrite, writeErr) 481 | // Continue writing other URLs if possible 482 | } 483 | writtenCount++ 484 | if writtenCount%5000 == 0 { 485 | s.log.Debugf("Flushing visited writer after %d entries...", writtenCount) 486 | if flushErr := writer.Flush(); flushErr != nil { 487 | if dbErr == nil { // Store first flush error 488 | dbErr = flushErr 489 | } 490 | s.log.Errorf("Error flushing visited writer: %v", flushErr) 491 | // Continue if possible 492 | } 493 | } 494 | } 495 | return nil 496 | }) 497 | 498 | // Handle errors after iteration 499 | if iterErr != nil && !(errors.Is(iterErr, context.Canceled) || errors.Is(iterErr, context.DeadlineExceeded)) { 500 | s.log.Errorf("Error during visited DB iteration for log: %v", iterErr) 501 | if dbErr == nil { 502 | dbErr = iterErr 503 | } 504 | } 505 | 506 | // Final flush 507 | if flushErr := writer.Flush(); flushErr != nil { 508 | s.log.Errorf("Failed final flush for visited log '%s': %v", filePath, flushErr) 509 | if dbErr == nil { 510 | dbErr = flushErr 511 | } 512 | } 513 | 514 | // Close error check is handled by defer 515 | 516 | if iterErr == nil && dbErr == nil { 517 | s.log.Infof("Finished writing %d URLs to visited log: %s", writtenCount, filePath) 518 | } else { 519 | s.log.Warnf("Finished writing visited log with errors. 
Wrote ~%d URLs to %s", writtenCount, filePath) 520 | } 521 | 522 | // Return context error if iteration was cancelled, otherwise return first IO/DB error 523 | if errors.Is(iterErr, context.Canceled) || errors.Is(iterErr, context.DeadlineExceeded) { 524 | return iterErr 525 | } 526 | return dbErr 527 | } 528 | 529 | // Close implements the VisitedStore interface 530 | func (s *BadgerStore) Close() error { 531 | if s.db != nil && !s.db.IsClosed() { 532 | s.log.Info("Closing visited DB...") 533 | err := s.db.Close() 534 | if err != nil { 535 | s.log.Errorf("Error closing visited DB: %v", err) 536 | return err 537 | } 538 | s.log.Info("Visited DB closed.") 539 | return nil 540 | } 541 | s.log.Info("Visited DB already closed or was not initialized.") 542 | return nil 543 | } 544 | -------------------------------------------------------------------------------- /pkg/process/image.go: -------------------------------------------------------------------------------- 1 | package process 2 | 3 | import ( 4 | "context" 5 | "crypto/md5" 6 | "errors" 7 | "fmt" 8 | "io" 9 | "mime" 10 | "net/http" 11 | "net/url" 12 | "os" 13 | "path" 14 | "path/filepath" 15 | "runtime/debug" 16 | "strconv" 17 | "strings" 18 | "sync" 19 | "time" 20 | 21 | "github.com/PuerkitoBio/goquery" 22 | "github.com/sirupsen/logrus" 23 | 24 | "doc-scraper/pkg/config" 25 | "doc-scraper/pkg/fetch" 26 | "doc-scraper/pkg/models" 27 | "doc-scraper/pkg/parse" 28 | "doc-scraper/pkg/storage" 29 | "doc-scraper/pkg/utils" 30 | 31 | "golang.org/x/sync/semaphore" 32 | ) 33 | 34 | const ( 35 | ImageDir = "images" // Subdirectory name within siteOutputDir for images 36 | ) 37 | 38 | // ImageDownloadTask holds information needed for an image worker to process one image 39 | type ImageDownloadTask struct { 40 | AbsImgURL string 41 | NormImgURL string 42 | BaseImgURL *url.URL // Parsed absolute URL 43 | ImgHost string 44 | ExtractedCaption string 45 | ImgLogEntry *logrus.Entry // Logger with image-specific context 46 | Ctx context.Context // Context for this specific task 47 | } 48 | 49 | // ImageProcessor handles the orchestration of image downloading and processing 50 | type ImageProcessor struct { 51 | store storage.VisitedStore // DB interaction 52 | fetcher *fetch.Fetcher // HTTP fetching 53 | robotsHandler *fetch.RobotsHandler // Robots checks 54 | rateLimiter *fetch.RateLimiter // Rate limiting 55 | globalSemaphore *semaphore.Weighted // Global concurrency limit 56 | hostSemaphores map[string]*semaphore.Weighted // Per-host limits 57 | hostSemaphoresMu sync.Mutex // Mutex for hostSemaphores map 58 | appCfg config.AppConfig // Global config 59 | log *logrus.Logger 60 | } 61 | 62 | // NewImageProcessor creates a new ImageProcessor 63 | func NewImageProcessor( 64 | store storage.VisitedStore, 65 | fetcher *fetch.Fetcher, 66 | robotsHandler *fetch.RobotsHandler, 67 | rateLimiter *fetch.RateLimiter, 68 | globalSemaphore *semaphore.Weighted, 69 | appCfg config.AppConfig, 70 | log *logrus.Logger, 71 | ) *ImageProcessor { 72 | return &ImageProcessor{ 73 | store: store, 74 | fetcher: fetcher, 75 | robotsHandler: robotsHandler, 76 | rateLimiter: rateLimiter, 77 | globalSemaphore: globalSemaphore, 78 | hostSemaphores: make(map[string]*semaphore.Weighted), 79 | appCfg: appCfg, 80 | log: log, 81 | } 82 | } 83 | 84 | // getHostSemaphore retrieves or creates a semaphore for rate limiting requests to a specific image host 85 | // This is specific to the image processor as it uses its own map 86 | func (ip *ImageProcessor) getHostSemaphore(host string) 
*semaphore.Weighted { 87 | ip.hostSemaphoresMu.Lock() 88 | defer ip.hostSemaphoresMu.Unlock() 89 | 90 | sem, exists := ip.hostSemaphores[host] 91 | if !exists { 92 | // Get limit from config, use a default if invalid 93 | limit := int64(ip.appCfg.MaxRequestsPerHost) 94 | if limit <= 0 { 95 | limit = 2 // Sensible default if config is 0 or negative 96 | ip.log.Warnf("max_requests_per_host invalid or zero for image host %s, defaulting to %d", host, limit) 97 | } 98 | sem = semaphore.NewWeighted(limit) 99 | ip.hostSemaphores[host] = sem 100 | ip.log.WithFields(logrus.Fields{"host": host, "limit": limit}).Debug("Created new image host semaphore") 101 | } 102 | return sem 103 | } 104 | 105 | // ProcessImages finds images within the main content, checks status, dispatches downloads to a worker pool, and returns a map of successfully processed images and any errors 106 | // It modifies the 'data-crawl-status' attribute on img tags in the selection 107 | func (ip *ImageProcessor) ProcessImages( 108 | mainContent *goquery.Selection, // Operate on the selection 109 | finalURL *url.URL, // Base URL of the page containing the images 110 | siteCfg config.SiteConfig, // Need site-specific image settings 111 | siteOutputDir string, // Need for calculating local paths 112 | taskLog *logrus.Entry, // Logger for the parent page task 113 | ctx context.Context, // Parent context 114 | ) (imageMap map[string]models.ImageData, imageErrs []error) { 115 | taskLog.Debug("Processing images...") 116 | imageMap = make(map[string]models.ImageData) 117 | imageErrs = make([]error, 0) // Collect non-fatal errors here 118 | 119 | // --- Determine Effective Image Handling Settings --- 120 | skipImages := config.GetEffectiveSkipImages(siteCfg, ip.appCfg) 121 | allowedDomains := siteCfg.AllowedImageDomains 122 | disallowedDomains := siteCfg.DisallowedImageDomains 123 | 124 | if skipImages { 125 | taskLog.Info("Skipping all image processing based on configuration.") 126 | mainContent.Find("img").SetAttr("data-crawl-status", "skipped-config") 127 | return imageMap, imageErrs // Return empty map and no errors 128 | } 129 | 130 | // --- Setup for Worker Pool --- 131 | var imgWg sync.WaitGroup 132 | var imgErrMu sync.Mutex // Protects imageMap and imageErrs slice 133 | 134 | // Create buffered channel for image download tasks 135 | numImageWorkers := ip.appCfg.NumImageWorkers 136 | if numImageWorkers <= 0 { 137 | numImageWorkers = ip.appCfg.NumWorkers // Fallback if image workers not set 138 | } 139 | imageTaskChan := make(chan ImageDownloadTask, numImageWorkers*2) // Buffer size heuristic 140 | 141 | // Launch the fixed pool of image workers 142 | taskLog.Infof("Launching %d image download workers", numImageWorkers) 143 | for i := 1; i <= numImageWorkers; i++ { 144 | go ip.imageWorker(i, imageTaskChan, siteCfg, siteOutputDir, imageMap, &imageErrs, &imgErrMu, &imgWg) 145 | } 146 | // --- End Worker Pool Setup --- 147 | 148 | // Ensure base image directory exists 149 | localImageDir := filepath.Join(siteOutputDir, ImageDir) 150 | if mkDirErr := os.MkdirAll(localImageDir, 0755); mkDirErr != nil { 151 | wrappedErr := fmt.Errorf("%w: creating base image directory '%s': %w", utils.ErrFilesystem, localImageDir, mkDirErr) 152 | taskLog.Error(wrappedErr) 153 | // Collect error but continue - workers might handle individual file errors 154 | imgErrMu.Lock() 155 | imageErrs = append(imageErrs, wrappedErr) 156 | imgErrMu.Unlock() 157 | } 158 | 159 | // --- Iterate Through Image Tags and Dispatch Tasks --- 160 | 
mainContent.Find("img").Each(func(index int, element *goquery.Selection) { 161 | // --- Synchronous Checks --- 162 | element.SetAttr("data-crawl-status", "pending") // Initial status 163 | imgSrc, exists := element.Attr("src") 164 | if !exists || imgSrc == "" { 165 | element.SetAttr("data-crawl-status", "skipped-empty-src") 166 | return 167 | } 168 | // Skip data URIs early 169 | if strings.HasPrefix(imgSrc, "data:") { 170 | element.SetAttr("data-crawl-status", "skipped-data-uri") 171 | return 172 | } 173 | 174 | // Resolve relative URL to absolute 175 | imgURL, imgParseErr := finalURL.Parse(imgSrc) 176 | if imgParseErr != nil { 177 | taskLog.Warnf("Image src parse error '%s': %v", imgSrc, imgParseErr) 178 | element.SetAttr("data-crawl-status", "error-parse") 179 | return 180 | } 181 | absoluteImgURL := imgURL.String() 182 | imgHost := imgURL.Hostname() 183 | imgLog := taskLog.WithFields(logrus.Fields{"img_url": absoluteImgURL, "img_host": imgHost}) 184 | 185 | // Scheme Check 186 | if imgURL.Scheme != "http" && imgURL.Scheme != "https" { 187 | element.SetAttr("data-crawl-status", "skipped-scheme") 188 | return 189 | } 190 | 191 | // Domain Filtering 192 | isAllowed := true 193 | for _, pattern := range disallowedDomains { 194 | if matchDomain(imgHost, pattern) { 195 | isAllowed = false 196 | break 197 | } 198 | } 199 | if isAllowed && len(allowedDomains) > 0 { 200 | isAllowed = false 201 | for _, pattern := range allowedDomains { 202 | if matchDomain(imgHost, pattern) { 203 | isAllowed = true 204 | break 205 | } 206 | } 207 | } 208 | if !isAllowed { 209 | element.SetAttr("data-crawl-status", "skipped-domain") 210 | return 211 | } 212 | 213 | // Robots Check (uses the robots handler passed to ImageProcessor) 214 | // Determine user agent for robots check 215 | userAgent := siteCfg.UserAgent 216 | if userAgent == "" { 217 | userAgent = ip.appCfg.DefaultUserAgent 218 | } 219 | if !ip.robotsHandler.TestAgent(imgURL, userAgent, ctx) { 220 | element.SetAttr("data-crawl-status", "skipped-robots") 221 | return 222 | } 223 | 224 | // Normalize URL 225 | imgNormURLStr, _, imgNormErr := parse.ParseAndNormalize(absoluteImgURL) 226 | if imgNormErr != nil { 227 | imgLog.Warnf("Cannot normalize image URL: %v", imgNormErr) 228 | element.SetAttr("data-crawl-status", "error-normalize") 229 | return 230 | } 231 | 232 | // DB Check (uses the store passed to ImageProcessor) 233 | dbStatus, dbEntry, dbErr := ip.store.CheckImageStatus(imgNormURLStr) 234 | if dbErr != nil { 235 | // Log and collect DB error, Skip if DB check fails 236 | wrappedErr := fmt.Errorf("image DB check failed for '%s': %w", imgNormURLStr, dbErr) 237 | imgLog.Error(wrappedErr) 238 | imgErrMu.Lock() 239 | imageErrs = append(imageErrs, wrappedErr) 240 | imgErrMu.Unlock() 241 | element.SetAttr("data-crawl-status", "error-db") 242 | return 243 | } 244 | 245 | // Extract Caption (Alt/Figcaption) 246 | caption := "" 247 | figure := element.Closest("figure") // Find closest ancestor figure 248 | if figure.Length() > 0 { 249 | figcaption := figure.Find("figcaption").First() // Find figcaption within that figure 250 | if figcaption.Length() > 0 { 251 | caption = strings.TrimSpace(figcaption.Text()) 252 | } 253 | } 254 | // Fallback to alt attribute if no figcaption found or caption is empty 255 | if caption == "" { 256 | if alt, altExists := element.Attr("alt"); altExists { 257 | caption = strings.TrimSpace(alt) 258 | } 259 | } 260 | 261 | // --- Determine if Download Task Needs Dispatching --- 262 | shouldDispatch := false 263 | if dbStatus == 
"success" { 264 | if dbEntry != nil && dbEntry.LocalPath != "" { 265 | // Successfully downloaded previously, reuse data 266 | element.SetAttr("data-crawl-status", "success") // Mark success (cached) 267 | imgErrMu.Lock() 268 | imageMap[absoluteImgURL] = models.ImageData{ 269 | OriginalURL: absoluteImgURL, 270 | LocalPath: dbEntry.LocalPath, 271 | Caption: caption, // Use newly extracted caption 272 | } 273 | imgErrMu.Unlock() 274 | } else { 275 | // DB state is inconsistent (success but no path) 276 | imgLog.Warnf("Image DB status 'success' but invalid entry (missing path) for '%s'. Re-scheduling download.", imgNormURLStr) 277 | shouldDispatch = true 278 | element.SetAttr("data-crawl-status", "pending-download") // Mark for download 279 | } 280 | } else if dbStatus == "failure" { 281 | // Previously failed, try again 282 | errMsg := "Unknown reason" 283 | if dbEntry != nil { 284 | errMsg = dbEntry.ErrorType 285 | } 286 | imgLog.Warnf("Image previously failed download ('%s'). Re-scheduling.", errMsg) 287 | shouldDispatch = true 288 | element.SetAttr("data-crawl-status", "pending-download") // Mark for download 289 | } else { // "not_found" or "db_error" (though we return early on db_error now) 290 | imgLog.Debugf("Image '%s' new or previously failed check ('%s'). Scheduling download.", imgSrc, dbStatus) 291 | shouldDispatch = true 292 | element.SetAttr("data-crawl-status", "pending-download") // Mark for download 293 | } 294 | 295 | // --- Dispatch Task to Worker Pool (if needed) --- 296 | if shouldDispatch { 297 | imgLog.Debug("Dispatching image download task to worker pool.") 298 | task := ImageDownloadTask{ 299 | AbsImgURL: absoluteImgURL, 300 | NormImgURL: imgNormURLStr, 301 | BaseImgURL: imgURL, // Pass the parsed URL object 302 | ImgHost: imgHost, 303 | ExtractedCaption: caption, 304 | ImgLogEntry: imgLog, // Pass the specific logger 305 | Ctx: ctx, // Pass the parent context 306 | } 307 | imgWg.Add(1) // Increment WG *before* sending 308 | 309 | // Send task to the channel (blocks if buffer is full) 310 | select { 311 | case imageTaskChan <- task: 312 | // Successfully sent 313 | case <-ctx.Done(): 314 | imgLog.Warnf("Context cancelled while trying to dispatch image task for '%s': %v", imgSrc, ctx.Err()) 315 | imgWg.Done() // Decrement WG as task won't be processed 316 | element.SetAttr("data-crawl-status", "error-dispatch-context") // Mark as error 317 | } 318 | } 319 | }) // End mainContent.Find("img").Each 320 | 321 | // --- Signal Workers and Wait --- 322 | taskLog.Debug("Finished dispatching all image tasks for this page. Closing task channel.") 323 | close(imageTaskChan) // Close channel to signal workers no more tasks are coming for *this page* 324 | 325 | taskLog.Debug("Waiting for image download workers to finish...") 326 | imgWg.Wait() // Wait for all tasks dispatched *for this page* to complete 327 | taskLog.Debug("All image download workers finished for this page.") 328 | // --- End Signal and Wait --- 329 | 330 | if len(imageErrs) > 0 { 331 | taskLog.Warnf("Finished image processing for page with %d non-fatal error(s).", len(imageErrs)) 332 | } else { 333 | taskLog.Debug("Image processing complete for page.") 334 | } 335 | return imageMap, imageErrs 336 | } 337 | 338 | // imageWorker processes image download tasks received from a channel 339 | func (ip *ImageProcessor) imageWorker( 340 | id int, // Worker ID for logging 341 | taskChan <-chan ImageDownloadTask, // Channel to receive tasks 342 | siteCfg config.SiteConfig, // Pass siteCfg for UA, delay etc. 
343 | siteOutputDir string, // Pass output dir base 344 | imageMap map[string]models.ImageData, // Shared map for results (needs mutex) 345 | imageErrs *[]error, // Shared slice for errors (needs mutex) 346 | imgErrMu *sync.Mutex, // Mutex for map and slice 347 | imgWg *sync.WaitGroup, // WaitGroup to signal task completion 348 | ) { 349 | workerLog := ip.log.WithField("image_worker_id", id) 350 | workerLog.Debug("Image worker started") 351 | 352 | // Process tasks from the channel until it's closed 353 | for task := range taskChan { 354 | // Call the function to process this single task 355 | // Pass necessary dependencies down 356 | ip.processSingleImageTask(task, siteCfg, siteOutputDir, imageMap, imageErrs, imgErrMu, imgWg) 357 | } 358 | 359 | workerLog.Debug("Image worker finished (task channel closed)") 360 | } 361 | 362 | // processSingleImageTask handles the download, saving, and DB update for one image 363 | func (ip *ImageProcessor) processSingleImageTask( 364 | task ImageDownloadTask, // The specific task data 365 | siteCfg config.SiteConfig, // Need site specific settings 366 | siteOutputDir string, // Need output base 367 | imageMap map[string]models.ImageData, // Shared map 368 | imageErrs *[]error, // Shared slice 369 | imgErrMu *sync.Mutex, // Mutex 370 | imgWg *sync.WaitGroup, // WaitGroup 371 | ) { 372 | // --- Get context from task --- 373 | ctx := task.Ctx 374 | if ctx == nil { // Safety check 375 | ctx = context.Background() 376 | } 377 | 378 | // --- Setup variables specific to this task --- 379 | imgLogEntry := task.ImgLogEntry // Use logger passed in task 380 | imgHost := task.ImgHost 381 | var imgTaskErr error // Primary error for this specific task 382 | imgDownloaded := false 383 | imgLocalPath := "" // Relative path, set on success 384 | var copiedBytes int64 = 0 385 | 386 | // --- Defer DB update, panic recovery, and WaitGroup decrement --- 387 | defer func() { 388 | // --- Panic Recovery (capture recover() once; it only works on the first call, and the result is reused below) --- 389 | r := recover() 390 | if r != nil { 391 | imgTaskErr = fmt.Errorf("panic processing img '%s': %v", task.AbsImgURL, r) 392 | imgLogEntry.WithFields(logrus.Fields{"panic_info": r, "stack_trace": string(debug.Stack())}).Error("PANIC Recovered in processSingleImageTask") 393 | // Collect error (needs mutex) 394 | imgErrMu.Lock() 395 | *imageErrs = append(*imageErrs, imgTaskErr) // Use the error created above 396 | imgErrMu.Unlock() 397 | } 398 | 399 | // --- DB Status Update --- 400 | now := time.Now() 401 | var entryToSave models.ImageDBEntry 402 | if imgTaskErr == nil && imgDownloaded { // Success path 403 | entryToSave = models.ImageDBEntry{ 404 | Status: "success", 405 | LocalPath: imgLocalPath, 406 | Caption: task.ExtractedCaption, 407 | LastAttempt: now, 408 | } 409 | } else { // Failure path 410 | errorType := "UnknownDownloadFailure" 411 | if imgTaskErr != nil { 412 | errorType = utils.CategorizeError(imgTaskErr) // Use utility function 413 | } 414 | entryToSave = models.ImageDBEntry{ 415 | Status: "failure", 416 | ErrorType: errorType, 417 | LastAttempt: now, 418 | } 419 | // Log & Collect the primary error if it wasn't a panic (panic errors were already collected above) 420 | if imgTaskErr != nil && r == nil { 421 | imgLogEntry.Warnf("Image download/save failed: %v", imgTaskErr) 422 | imgErrMu.Lock() 423 | *imageErrs = append(*imageErrs, imgTaskErr) 424 | imgErrMu.Unlock() 425 | } 426 | } 427 | // Update DB using the store interface 428 | if updateErr := ip.store.UpdateImageStatus(task.NormImgURL, &entryToSave); updateErr !=
nil { 429 | // If DB update fails, log it and collect the error too 430 | dbUpdateErr := fmt.Errorf("failed update DB status img '%s' to '%s': %w", task.NormImgURL, entryToSave.Status, updateErr) 431 | imgLogEntry.Error(dbUpdateErr) 432 | imgErrMu.Lock() 433 | *imageErrs = append(*imageErrs, dbUpdateErr) 434 | imgErrMu.Unlock() 435 | } 436 | 437 | // --- Signal WaitGroup Completion --- 438 | imgWg.Done() 439 | }() // --- End Defer --- 440 | 441 | // --- Determine Effective Settings --- 442 | userAgent := siteCfg.UserAgent 443 | if userAgent == "" { 444 | userAgent = ip.appCfg.DefaultUserAgent 445 | } 446 | imgHostDelay := siteCfg.DelayPerHost 447 | if imgHostDelay <= 0 { 448 | imgHostDelay = ip.appCfg.DefaultDelayPerHost 449 | } 450 | semTimeout := ip.appCfg.SemaphoreAcquireTimeout 451 | effectiveMaxBytes := config.GetEffectiveMaxImageSize(siteCfg, ip.appCfg) 452 | localImageDir := filepath.Join(siteOutputDir, ImageDir) // Base image dir for this site 453 | 454 | // --- Acquire Semaphores & Apply Rate Limit --- 455 | // Use a closure to manage semaphore release with scoped defer 456 | semAcquireErr := func() error { 457 | // 1. Acquire Host Semaphore 458 | imgHostSem := ip.getHostSemaphore(imgHost) // Use processor's method 459 | ctxIH, cancelIH := context.WithTimeout(ctx, semTimeout) 460 | defer cancelIH() 461 | semErr := imgHostSem.Acquire(ctxIH, 1) 462 | if semErr != nil { 463 | if errors.Is(semErr, context.DeadlineExceeded) { 464 | return fmt.Errorf("%w: acquiring host semaphore for img '%s': %w", utils.ErrSemaphoreTimeout, task.AbsImgURL, semErr) 465 | } 466 | return fmt.Errorf("failed acquiring host semaphore for img '%s': %w", task.AbsImgURL, semErr) 467 | } 468 | // Release host semaphore when closure returns 469 | defer imgHostSem.Release(1) 470 | 471 | // 2. Acquire Global Semaphore 472 | ctxIG, cancelIG := context.WithTimeout(ctx, semTimeout) 473 | defer cancelIG() 474 | semErr = ip.globalSemaphore.Acquire(ctxIG, 1) // Use processor's global semaphore 475 | if semErr != nil { 476 | // Release host semaphore here as global failed *after* acquiring host 477 | // Note: defer above handles release if *this function* returns error 478 | if errors.Is(semErr, context.DeadlineExceeded) { 479 | return fmt.Errorf("%w: acquiring global semaphore for img '%s': %w", utils.ErrSemaphoreTimeout, task.AbsImgURL, semErr) 480 | } 481 | return fmt.Errorf("failed acquiring global semaphore for img '%s': %w", task.AbsImgURL, semErr) 482 | } 483 | // Release global semaphore when closure returns 484 | defer ip.globalSemaphore.Release(1) 485 | 486 | // 3. 
Apply Rate Limit (After acquiring semaphores) 487 | ip.rateLimiter.ApplyDelay(imgHost, imgHostDelay) // Use processor's rate limiter 488 | 489 | return nil // Success 490 | }() // Execute the closure immediately 491 | 492 | if semAcquireErr != nil { 493 | imgTaskErr = semAcquireErr // Assign the semaphore/rate limit error 494 | return // Return triggers defer cleanup (DB update to failure, WG done) 495 | } 496 | 497 | // --- Fetch Image Request --- 498 | imgLogEntry.Debug("Attempting fetch image request") 499 | imgReq, reqErr := http.NewRequestWithContext(ctx, "GET", task.AbsImgURL, nil) 500 | if reqErr != nil { 501 | imgTaskErr = fmt.Errorf("%w: creating request for img '%s': %w", utils.ErrRequestCreation, task.AbsImgURL, reqErr) 502 | ip.rateLimiter.UpdateLastRequestTime(imgHost) 503 | return 504 | } 505 | imgReq.Header.Set("User-Agent", userAgent) 506 | 507 | // --- Perform Fetch with Retries --- 508 | imgResp, imgFetchErr := ip.fetcher.FetchWithRetry(imgReq, ctx) // Use processor's fetcher 509 | ip.rateLimiter.UpdateLastRequestTime(imgHost) // Update last request time *after* the attempt 510 | 511 | if imgFetchErr != nil { 512 | imgTaskErr = fmt.Errorf("fetch failed for img '%s': %w", task.AbsImgURL, imgFetchErr) 513 | // Ensure body is closed if fetch failed but returned a response 514 | if imgResp != nil { 515 | io.Copy(io.Discard, imgResp.Body) 516 | imgResp.Body.Close() 517 | } 518 | return // Triggers defer 519 | } 520 | // If fetch succeeded, imgResp is non-nil and status is 2xx 521 | defer imgResp.Body.Close() 522 | 523 | // --- Header Size Check --- 524 | headerSizeStr := imgResp.Header.Get("Content-Length") 525 | if headerSizeStr != "" { 526 | if headerSize, parseHdrErr := strconv.ParseInt(headerSizeStr, 10, 64); parseHdrErr == nil { 527 | if effectiveMaxBytes > 0 && headerSize > effectiveMaxBytes { 528 | imgTaskErr = fmt.Errorf("image '%s' exceeds max size based on header (%d > %d bytes)", task.AbsImgURL, headerSize, effectiveMaxBytes) 529 | io.Copy(io.Discard, imgResp.Body) // Drain body before returning 530 | return // Triggers defer 531 | } 532 | } else { 533 | imgLogEntry.Warnf("Could not parse Content-Length header '%s'", headerSizeStr) 534 | } 535 | } 536 | 537 | // --- Generate Local Filename --- 538 | localFilename, fileExtErr := generateLocalFilename(task.BaseImgURL, task.AbsImgURL, imgResp.Header.Get("Content-Type"), imgLogEntry) 539 | if fileExtErr != nil { 540 | imgTaskErr = fileExtErr // Assign error from filename generation 541 | io.Copy(io.Discard, imgResp.Body) 542 | return // Triggers defer 543 | } 544 | localFilePath := filepath.Join(localImageDir, localFilename) 545 | 546 | // Calculate relative path for storing in DB and map 547 | relativeFilePath, relErr := filepath.Rel(siteOutputDir, localFilePath) 548 | if relErr != nil { 549 | // This shouldn't typically fail if siteOutputDir and localFilePath are sane 550 | imgLogEntry.Warnf("Could not calculate relative path from '%s' to '%s': %v. 
Using filename only.", siteOutputDir, localFilePath, relErr) 551 | relativeFilePath = localFilename // Fallback to just the filename 552 | } 553 | // Ensure forward slashes for storage consistency 554 | relativeFilePath = filepath.ToSlash(relativeFilePath) 555 | imgLogEntry.Debugf("Final image save path: %s (Relative: %s)", localFilePath, relativeFilePath) 556 | 557 | // --- Ensure Output Directory Exists --- 558 | // MkdirAll is idempotent, safe to call even if check was done earlier 559 | if mkDirErr := os.MkdirAll(localImageDir, 0755); mkDirErr != nil { 560 | imgTaskErr = fmt.Errorf("%w: ensuring image directory '%s' exists: %w", utils.ErrFilesystem, localImageDir, mkDirErr) 561 | io.Copy(io.Discard, imgResp.Body) 562 | return // Triggers defer 563 | } 564 | 565 | // --- Create Destination File --- 566 | outFile, createErr := os.Create(localFilePath) 567 | if createErr != nil { 568 | imgTaskErr = fmt.Errorf("%w: creating image file '%s': %w", utils.ErrFilesystem, localFilePath, createErr) 569 | io.Copy(io.Discard, imgResp.Body) 570 | return // Triggers defer 571 | } 572 | // Use defer for closing outFile to handle errors during copy 573 | defer func() { 574 | if err := outFile.Close(); err != nil && imgTaskErr == nil { 575 | // Only capture close error if no other error happened before it 576 | imgTaskErr = fmt.Errorf("%w: closing image file '%s' after write: %w", utils.ErrFilesystem, localFilePath, err) 577 | } 578 | }() 579 | 580 | // --- Stream Data using io.Copy with Size Limit --- 581 | var reader io.Reader = imgResp.Body 582 | if effectiveMaxBytes > 0 { 583 | reader = io.LimitReader(imgResp.Body, effectiveMaxBytes) 584 | } 585 | 586 | imgLogEntry.Debugf("Streaming image data to %s", localFilePath) 587 | var copyErr error 588 | copiedBytes, copyErr = io.Copy(outFile, reader) 589 | 590 | // Drain any remaining data from the original response body to allow connection reuse 591 | // This is important especially if LimitReader stopped reading early 592 | _, drainErr := io.Copy(io.Discard, imgResp.Body) 593 | if drainErr != nil { 594 | imgLogEntry.Warnf("Error draining response body after copy: %v", drainErr) 595 | // Don't override primary copy error if one occurred 596 | } 597 | // Response body is closed by the earlier defer imgResp.Body.Close() 598 | 599 | // --- Handle io.Copy Errors --- 600 | if copyErr != nil { 601 | imgTaskErr = fmt.Errorf("%w: copying image data to '%s' (copied %d bytes): %w", utils.ErrFilesystem, localFilePath, copiedBytes, copyErr) 602 | // Need to close before removing on some OS (Windows) 603 | outFile.Close() // Close explicitly before remove 604 | os.Remove(localFilePath) // Attempt cleanup 605 | return // Triggers main defer 606 | } 607 | 608 | // --- Check if Size Limit Was Hit During Copy --- 609 | // We need to check this *after* draining the original body, 610 | // because LimitReader might have stopped reading *exactly* at the limit 611 | // The check needs to see if the limit was *reached*. 
612 | if effectiveMaxBytes > 0 && copiedBytes >= effectiveMaxBytes { 613 | // Distinguish a truly truncated download from a body that fit exactly at the limit 614 | sizeExceeded := true 615 | if headerSizeStr != "" { 616 | if headerSize, hdrErr := strconv.ParseInt(headerSizeStr, 10, 64); hdrErr == nil && headerSize == copiedBytes { 617 | imgLogEntry.Warnf("Copied bytes (%d) reached the limit (%d) but match Content-Length; treating download as complete.", copiedBytes, effectiveMaxBytes) 618 | sizeExceeded = false 619 | } 620 | } 621 | if sizeExceeded { 622 | imgTaskErr = fmt.Errorf("image '%s' exceeds max size (%d >= %d bytes, download truncated)", task.AbsImgURL, copiedBytes, effectiveMaxBytes) 623 | outFile.Close() // Close explicitly before remove 624 | os.Remove(localFilePath) // Attempt cleanup 625 | return // Triggers main defer 626 | } 627 | } 628 | 629 | // --- File Save Success --- 630 | // Note: the deferred outFile.Close() only runs when this function returns, so a close 631 | // error cannot be observed here; if one occurs, that defer sets imgTaskErr and the 632 | // main defer records the failure in the DB. 633 | 634 | // If no error occurred up to this point 635 | if imgTaskErr == nil { 636 | imgDownloaded = true 637 | imgLocalPath = relativeFilePath // Use the calculated relative path 638 | 639 | // Update the shared map (requires mutex) 640 | imgErrMu.Lock() 641 | imageMap[task.AbsImgURL] = models.ImageData{ 642 | OriginalURL: task.AbsImgURL, 643 | LocalPath: imgLocalPath, 644 | Caption: task.ExtractedCaption, 645 | } 646 | imgErrMu.Unlock() 647 | 648 | imgLogEntry.Debugf("Successfully saved image (%d bytes)", copiedBytes) 649 | } 650 | // --- Task processing finished --- 651 | // Return will trigger the main defer block for cleanup/DB update/WG decrement 652 | } 653 | 654 | // generateLocalFilename creates a unique and safe filename for a downloaded image 655 | func generateLocalFilename(baseImgURL *url.URL, absImgURL string, contentType string, imgLogEntry *logrus.Entry) (string, error) { 656 | // 1. Get Base Name and Original Extension from URL Path 657 | originalExt := path.Ext(baseImgURL.Path) 658 | imgBaseName := utils.SanitizeFilename(strings.TrimSuffix(path.Base(baseImgURL.Path), originalExt)) 659 | if imgBaseName == "" || imgBaseName == "_" { 660 | // Handle cases where sanitization results in empty/underscore 661 | // Use a hash of the URL or a default name 662 | urlHashOnly := fmt.Sprintf("%x", md5.Sum([]byte(absImgURL)))[:12] // Longer hash if base name is missing 663 | imgBaseName = "image_" + urlHashOnly 664 | imgLogEntry.Debugf("Sanitized base name was empty, using hash fallback: %s", imgBaseName) 665 | } 666 | 667 | // 2.
Determine Final Extension based on Content-Type and URL 668 | finalExt := originalExt // Start with extension from URL 669 | 670 | if contentType != "" { 671 | mimeType, _, mimeErr := mime.ParseMediaType(contentType) 672 | if mimeErr == nil { 673 | // Prefer common extensions if MIME type matches 674 | switch mimeType { 675 | case "image/jpeg": 676 | finalExt = ".jpg" 677 | case "image/png": 678 | finalExt = ".png" 679 | case "image/gif": 680 | finalExt = ".gif" 681 | case "image/webp": 682 | finalExt = ".webp" 683 | case "image/svg+xml": 684 | finalExt = ".svg" 685 | default: 686 | // Try getting extensions from MIME type if not common 687 | extensions, extErr := mime.ExtensionsByType(mimeType) 688 | if extErr == nil && len(extensions) > 0 { 689 | // Prioritize known good extensions if multiple exist 690 | preferredExt := "" 691 | for _, ext := range extensions { 692 | if ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".gif" || ext == ".webp" || ext == ".svg" { 693 | preferredExt = ext 694 | break 695 | } 696 | } 697 | if preferredExt != "" { 698 | finalExt = preferredExt 699 | } else if finalExt == "" { // Use first extension if URL had none 700 | finalExt = extensions[0] 701 | } 702 | // Keep originalExt if MIME type didn't yield a better one 703 | } else if finalExt == "" { 704 | // Cannot determine extension from MIME, and URL had none 705 | return "", fmt.Errorf("cannot determine file extension (MIME: %s, MIME extensions error: %v, URL Ext: none)", mimeType, extErr) 706 | } 707 | } 708 | } else { 709 | imgLogEntry.Warnf("Could not parse Content-Type header '%s': %v", contentType, mimeErr) 710 | if finalExt == "" { 711 | // Cannot determine extension from Content-Type, and URL had none 712 | return "", fmt.Errorf("cannot determine file extension (unparsable Content-Type, no URL extension)") 713 | } 714 | } 715 | } else if finalExt == "" { 716 | // Cannot determine extension (no Content-Type, no URL extension) 717 | return "", fmt.Errorf("cannot determine file extension (no Content-Type, no URL extension)") 718 | } 719 | 720 | // Ensure extension starts with a dot 721 | if finalExt != "" && !strings.HasPrefix(finalExt, ".") { 722 | finalExt = "." + finalExt 723 | } 724 | 725 | // 3. Add Hash for Uniqueness 726 | // Use a short hash of the *full absolute URL* to disambiguate files with the same base name but different paths/queries 727 | urlHash := fmt.Sprintf("%x", md5.Sum([]byte(absImgURL)))[:8] // 8-char hex hash 728 | 729 | // 4. Construct Final Filename 730 | // Format: sanitizedBaseName_hash.finalExt 731 | localFilename := fmt.Sprintf("%s_%s%s", imgBaseName, urlHash, finalExt) 732 | 733 | return localFilename, nil 734 | } 735 | 736 | // matchDomain checks if a host matches a pattern (exact or simple wildcard *. TLD) 737 | // This is a helper used by ProcessImages. 738 | func matchDomain(host string, pattern string) bool { 739 | host = strings.ToLower(host) 740 | pattern = strings.ToLower(pattern) 741 | 742 | if strings.HasPrefix(pattern, "*.") { 743 | // Wildcard match: *.example.com matches host=sub.example.com or host=example.com 744 | suffix := pattern[1:] // Get ".example.com" 745 | // Check if host ends with the suffix OR if host is exactly the suffix without the leading dot 746 | return strings.HasSuffix(host, suffix) || (len(suffix) > 1 && host == suffix[1:]) 747 | } 748 | // Exact match 749 | return host == pattern 750 | } 751 | --------------------------------------------------------------------------------
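
The wildcard semantics of matchDomain are easy to get wrong, so a small table-driven test makes them concrete: a pattern such as *.example.com matches the bare domain and any subdomain at a label boundary, while a plain pattern requires an exact, case-insensitive match. matchDomain is unexported, so a sketch like the one below would have to live in a (hypothetical) test file inside the process package; the test cases are illustrative, and only the matchDomain signature comes from the code above.

package process

import "testing"

// TestMatchDomain is a hypothetical table-driven sketch of the wildcard
// matching behaviour implemented by matchDomain.
func TestMatchDomain(t *testing.T) {
	cases := []struct {
		host, pattern string
		want          bool
	}{
		{"cdn.example.com", "*.example.com", true},  // subdomain matches wildcard
		{"example.com", "*.example.com", true},      // bare domain matches wildcard too
		{"evilexample.com", "*.example.com", false}, // suffix must start at a label boundary
		{"Example.COM", "example.com", true},        // comparison is case-insensitive
		{"img.other.net", "example.com", false},     // exact patterns never match other hosts
	}
	for _, c := range cases {
		if got := matchDomain(c.host, c.pattern); got != c.want {
			t.Errorf("matchDomain(%q, %q) = %v, want %v", c.host, c.pattern, got, c.want)
		}
	}
}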
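
The storage layer is normally driven by the crawler, but its lifecycle is easiest to see in isolation. Below is a minimal, hypothetical sketch of how a caller might open the per-site visited DB, requeue incomplete work on a resumed run, and keep value-log GC running in the background. Only the NewBadgerStore, RequeueIncomplete, RunGC, and Close signatures and the models.WorkItem fields come from pkg/storage and pkg/models above; the state directory, site domain, channel capacity, and GC interval are illustrative assumptions, not values taken from the crawler.

package main

import (
	"context"
	"time"

	"github.com/sirupsen/logrus"

	"doc-scraper/pkg/models"
	"doc-scraper/pkg/storage"
)

func main() {
	logger := logrus.New()
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Open (or reuse) the per-site visited DB under an assumed state directory.
	store, err := storage.NewBadgerStore(ctx, "crawler_state", "docs.example.com", true /* resume */, logger)
	if err != nil {
		logger.Fatalf("opening visited store: %v", err)
	}
	defer store.Close()

	// Periodically reclaim BadgerDB value-log space in the background.
	go store.RunGC(ctx, 10*time.Minute)

	// Re-enqueue pages whose stored status is still "pending" or "failure".
	workChan := make(chan models.WorkItem, 1024)
	go func() {
		defer close(workChan)
		requeued, scanErrs, reqErr := store.RequeueIncomplete(ctx, workChan)
		logger.Infof("requeued=%d scanErrors=%d err=%v", requeued, scanErrs, reqErr)
	}()

	// A real crawler would hand these items to its worker pool.
	for item := range workChan {
		logger.Infof("would re-crawl %s at depth %d", item.URL, item.Depth)
	}
}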