├── .dockerignore ├── .github ├── FUNDING.yml └── workflows │ ├── codacy.yml │ ├── codeql.yml │ ├── go-ossf-slsa3-publish.yml │ ├── power-ci.yaml │ └── release.yml ├── .gitignore ├── .golangci.yml ├── .goreleaser.yaml ├── Dockerfile ├── Dockerfile.goreleaser ├── LICENSE ├── Makefile ├── README.md ├── all_logs_list.json ├── cmd └── rxtls │ └── main.go ├── go.mod ├── go.sum └── internal ├── certlib ├── api.go ├── domain_normalization_test.go ├── models.go └── models_test.go ├── client └── http.go ├── core ├── common.go ├── constants.go ├── domain_extractor.go ├── download_manager.go ├── error.go ├── list.go ├── ratelimiter.go ├── scheduler.go ├── scheduler_stub.go └── work.go ├── io └── buffer.go ├── metrics └── metrics.go └── util └── filename.go /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git files 2 | .git 3 | .gitignore 4 | 5 | # Documentation 6 | *.md 7 | docs/ 8 | LICENSE 9 | 10 | # Build artifacts 11 | dist/ 12 | *.exe 13 | rxtls 14 | 15 | # Development files 16 | .goreleaser.yaml 17 | .golangci.yml 18 | Makefile 19 | .github/ 20 | 21 | # Test files 22 | *_test.go 23 | testdata/ 24 | 25 | # IDE files 26 | .vscode/ 27 | .idea/ 28 | *.swp 29 | *.swo 30 | 31 | # OS files 32 | .DS_Store 33 | Thumbs.db -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: xstp 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: securetheplanet 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: xstp 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 16 | -------------------------------------------------------------------------------- /.github/workflows/codacy.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # This workflow checks out code, performs a Codacy security scan 7 | # and integrates the results with the 8 | # GitHub Advanced Security code scanning feature. For more information on 9 | # the Codacy security scan action usage and parameters, see 10 | # https://github.com/codacy/codacy-analysis-cli-action. 11 | # For more information on Codacy Analysis CLI in general, see 12 | # https://github.com/codacy/codacy-analysis-cli. 
13 | 14 | name: Codacy Security Scan 15 | 16 | on: 17 | push: 18 | branches: [ "main" ] 19 | pull_request: 20 | # The branches below must be a subset of the branches above 21 | branches: [ "main" ] 22 | schedule: 23 | - cron: '32 7 * * 0' 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | codacy-security-scan: 30 | permissions: 31 | contents: read # for actions/checkout to fetch code 32 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 33 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 34 | name: Codacy Security Scan 35 | runs-on: ubuntu-latest 36 | steps: 37 | # Checkout the repository to the GitHub Actions runner 38 | - name: Checkout code 39 | uses: actions/checkout@v4 40 | 41 | # Execute Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis 42 | - name: Run Codacy Analysis CLI 43 | uses: codacy/codacy-analysis-cli-action@d840f886c4bd4edc059706d09c6a1586111c540b 44 | with: 45 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository 46 | # You can also omit the token and run the tools that support default configurations 47 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 48 | verbose: true 49 | output: results.sarif 50 | format: sarif 51 | # Adjust severity of non-security issues 52 | gh-code-scanning-compat: true 53 | # Force 0 exit code to allow SARIF file generation 54 | # This will handover control about PR rejection to the GitHub side 55 | max-allowed-issues: 2147483647 56 | 57 | # Upload the SARIF file generated in the previous step 58 | - name: Upload SARIF results file 59 | uses: github/codeql-action/upload-sarif@v3 60 | with: 61 | sarif_file: results.sarif 62 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL Advanced" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '18 18 * * 1' 21 | 22 | jobs: 23 | analyze: 24 | name: Analyze (${{ matrix.language }}) 25 | # Runner size impacts CodeQL analysis time. To learn more, please see: 26 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 27 | # - https://gh.io/supported-runners-and-hardware-resources 28 | # - https://gh.io/using-larger-runners (GitHub.com only) 29 | # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
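    # Note: the expression on the next line selects the runner per matrix entry: 'macos-latest'
    # only when the language being analyzed is Swift, and 'ubuntu-latest' otherwise. With the
    # languages configured below (actions, go), every job therefore runs on ubuntu-latest.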
30 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 31 | permissions: 32 | # required for all workflows 33 | security-events: write 34 | 35 | # required to fetch internal or private CodeQL packs 36 | packages: read 37 | 38 | # only required for workflows in private repositories 39 | actions: read 40 | contents: read 41 | 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | include: 46 | - language: actions 47 | build-mode: none 48 | - language: go 49 | build-mode: autobuild 50 | # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 51 | # Use `c-cpp` to analyze code written in C, C++ or both 52 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 53 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 54 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 55 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 56 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 57 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 58 | steps: 59 | - name: Checkout repository 60 | uses: actions/checkout@v4 61 | 62 | # Add any setup steps before running the `github/codeql-action/init` action. 63 | # This includes steps like installing compilers or runtimes (`actions/setup-node` 64 | # or others). This is typically only required for manual builds. 65 | # - name: Setup runtime (example) 66 | # uses: actions/setup-example@v1 67 | 68 | # Initializes the CodeQL tools for scanning. 69 | - name: Initialize CodeQL 70 | uses: github/codeql-action/init@v3 71 | with: 72 | languages: ${{ matrix.language }} 73 | build-mode: ${{ matrix.build-mode }} 74 | # If you wish to specify custom queries, you can do so here or in a config file. 75 | # By default, queries listed here will override any specified in a config file. 76 | # Prefix the list here with "+" to use these queries and those in the config file. 77 | 78 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 79 | # queries: security-extended,security-and-quality 80 | 81 | # If the analyze step fails for one of the languages you are analyzing with 82 | # "We were unable to automatically build your code", modify the matrix above 83 | # to set the build mode to "manual" for that language. Then modify this step 84 | # to build your code. 85 | # ℹ️ Command-line programs to run using the OS shell. 
86 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 87 | - if: matrix.build-mode == 'manual' 88 | shell: bash 89 | run: | 90 | echo 'If you are using a "manual" build mode for one or more of the' \ 91 | 'languages you are analyzing, replace this with the commands to build' \ 92 | 'your code, for example:' 93 | echo ' make bootstrap' 94 | echo ' make release' 95 | exit 1 96 | 97 | - name: Perform CodeQL Analysis 98 | uses: github/codeql-action/analyze@v3 99 | with: 100 | category: "/language:${{matrix.language}}" 101 | -------------------------------------------------------------------------------- /.github/workflows/go-ossf-slsa3-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # This workflow lets you compile your Go project using a SLSA3 compliant builder. 7 | # This workflow will generate a so-called "provenance" file describing the steps 8 | # that were performed to generate the final binary. 9 | # The project is an initiative of the OpenSSF (openssf.org) and is developed at 10 | # https://github.com/slsa-framework/slsa-github-generator. 11 | # The provenance file can be verified using https://github.com/slsa-framework/slsa-verifier. 12 | # For more information about SLSA and how it improves the supply-chain, visit slsa.dev. 13 | 14 | name: SLSA Go releaser 15 | on: 16 | workflow_dispatch: 17 | release: 18 | types: [created] 19 | 20 | permissions: read-all 21 | 22 | jobs: 23 | # ======================================================================================================================================== 24 | # Prerequisite: Create a .slsa-goreleaser.yml in the root directory of your project. 25 | # See format in https://github.com/slsa-framework/slsa-github-generator/blob/main/internal/builders/go/README.md#configuration-file 26 | #========================================================================================================================================= 27 | build: 28 | permissions: 29 | id-token: write # To sign. 30 | contents: write # To upload release assets. 31 | actions: read # To read workflow path.
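    # Illustrative sketch only (not a file in this repository): a minimal .slsa-goreleaser.yml for this
    # project might look roughly like the lines below; verify the field names against the builder README
    # linked in the prerequisite note above before relying on them.
    #   version: 1
    #   goos: linux
    #   goarch: amd64
    #   main: ./cmd/rxtls
    #   binary: rxtls
    #   env:
    #     - CGO_ENABLED=0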
32 | uses: slsa-framework/slsa-github-generator/.github/workflows/builder_go_slsa3.yml@v1.4.0 33 | with: 34 | go-version: 1.24 35 | # ============================================================================================================= 36 | # Optional: For more options, see https://github.com/slsa-framework/slsa-github-generator#golang-projects 37 | # ============================================================================================================= 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/power-ci.yaml: -------------------------------------------------------------------------------- 1 | name: PowerPC CI 2 | 3 | permissions: 4 | contents: read 5 | 6 | on: 7 | pull_request: 8 | branches: [ main ] 9 | push: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build-ppc64le: 14 | name: Build on ppc64le 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Go 21 | uses: actions/setup-go@v4 22 | with: 23 | go-version: 1.22 24 | 25 | - name: Install QEMU 26 | run: sudo apt-get install -y qemu-user-static 27 | 28 | - name: Cross-compile to ppc64le 29 | run: | 30 | GOARCH=ppc64le GOOS=linux go build -v ./... 31 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: write 10 | packages: write 11 | id-token: write 12 | 13 | jobs: 14 | release: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Set up Go 23 | uses: actions/setup-go@v5 24 | with: 25 | go-version: '1.24' 26 | 27 | - name: Set up QEMU 28 | uses: docker/setup-qemu-action@v3 29 | 30 | - name: Set up Docker Buildx 31 | uses: docker/setup-buildx-action@v3 32 | 33 | - name: Log in to GitHub Container Registry 34 | uses: docker/login-action@v3 35 | with: 36 | registry: ghcr.io 37 | username: ${{ github.actor }} 38 | password: ${{ secrets.GITHUB_TOKEN }} 39 | 40 | - name: Run GoReleaser 41 | uses: goreleaser/goreleaser-action@v6 42 | with: 43 | version: latest 44 | args: release --clean 45 | env: 46 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 47 | HOMEBREW_TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | output/certs/* 2 | output/domains/* 3 | 4 | # GoReleaser artifacts 5 | dist/ 6 | *.snap 7 | 8 | # Binary 9 | rxtls 10 | 11 | # IDE 12 | .vscode/ 13 | .idea/ 14 | 15 | # Test coverage 16 | *.out 17 | coverage.html 18 | 19 | # Temporary files 20 | *.tmp 21 | *.log 22 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | # golangci-lint configuration for rxtls 2 | # This config enables comprehensive linting with focus on security and code quality 3 | 4 | run: 5 | # Timeout for analysis 6 | timeout: 5m 7 | 8 | # Include test files 9 | tests: true 10 | 11 | # Skip directories 12 | skip-dirs: 13 | - vendor 14 | - third_party 15 | - testdata 16 | - examples 17 | - dist 18 | 19 | # Skip files 20 | skip-files: 21 | - ".*\\.pb\\.go$" 22 | - ".*\\.gen\\.go$" 23 | 24 | # Output 
configuration 25 | output: 26 | # Format of the output 27 | format: colored-line-number 28 | 29 | # Print lines of code with issue 30 | print-issued-lines: true 31 | 32 | # Print linter name in the end of issue text 33 | print-linter-name: true 34 | 35 | linters: 36 | enable: 37 | # Security 38 | - gosec # Security checker 39 | - exportloopref # Checks for pointers to enclosing loop variables 40 | 41 | # Bug detection 42 | - staticcheck # Advanced static analysis 43 | - bodyclose # Checks HTTP response body is closed 44 | - nilerr # Finds code returning nil even if it checks for error 45 | - errcheck # Checks for unchecked errors 46 | - ineffassign # Detects ineffectual assignments 47 | 48 | # Code quality 49 | - revive # Fast, configurable linter 50 | - govet # Reports suspicious constructs 51 | - gofmt # Checks formatting 52 | - goimports # Checks imports formatting 53 | - misspell # Finds misspelled words 54 | - unconvert # Removes unnecessary type conversions 55 | - prealloc # Finds slice declarations that could be preallocated 56 | - nakedret # Finds naked returns in long functions 57 | 58 | # Style 59 | - gocritic # Highly extensible Go source code linter 60 | - gocyclo # Checks cyclomatic complexity 61 | - gocognit # Checks cognitive complexity 62 | - funlen # Checks function length 63 | - lll # Reports long lines 64 | 65 | # Performance 66 | - goconst # Finds repeated strings that could be constants 67 | - gosimple # Simplifies code 68 | 69 | # Best practices 70 | - unparam # Reports unused function parameters 71 | - dogsled # Checks assignments with too many blank identifiers 72 | - dupl # Checks for duplicated code 73 | - godox # Detects TODO/FIXME/etc comments 74 | 75 | disable: 76 | - gomnd # Magic number detector - too noisy for CT log processing 77 | - wsl # Whitespace linter - too strict 78 | - nlreturn # Too strict about newlines 79 | - gochecknoinits # We might need init functions 80 | - gochecknoglobals # We use some globals for configuration 81 | 82 | linters-settings: 83 | # Security 84 | gosec: 85 | severity: "high" 86 | confidence: "medium" 87 | # Rules to exclude 88 | excludes: 89 | - G104 # Unhandled errors - we use errcheck for this 90 | - G304 # File path provided as taint input - we need this for user-specified paths 91 | config: 92 | global: 93 | audit: true 94 | 95 | # Code quality 96 | revive: 97 | severity: warning 98 | enable-all-rules: false 99 | rules: 100 | - name: blank-imports 101 | - name: context-as-argument 102 | - name: context-keys-type 103 | - name: dot-imports 104 | - name: error-return 105 | - name: error-strings 106 | - name: error-naming 107 | - name: exported 108 | - name: if-return 109 | - name: increment-decrement 110 | - name: var-naming 111 | - name: var-declaration 112 | - name: package-comments 113 | - name: range 114 | - name: receiver-naming 115 | - name: time-naming 116 | - name: unexported-return 117 | - name: indent-error-flow 118 | - name: errorf 119 | - name: empty-block 120 | - name: superfluous-else 121 | - name: unused-parameter 122 | - name: unreachable-code 123 | - name: redefines-builtin-id 124 | 125 | # Bug detection 126 | staticcheck: 127 | checks: ["all", "-ST1000", "-ST1003", "-ST1016"] 128 | 129 | errcheck: 130 | # Report about not checking errors in type assertions 131 | check-type-assertions: true 132 | # Report about assignment of errors to blank identifier 133 | check-blank: true 134 | 135 | # Style 136 | gocritic: 137 | enabled-tags: 138 | - diagnostic 139 | - performance 140 | - style 141 | - opinionated 142 | 
disabled-checks: 143 | - dupImport # Already covered by goimports 144 | - ifElseChain # Sometimes if-else is clearer 145 | - octalLiteral # We don't use octal 146 | - whyNoLint # We might need nolint sometimes 147 | - wrapperFunc # Too restrictive 148 | 149 | gocyclo: 150 | # Minimal cyclomatic complexity to report 151 | min-complexity: 15 152 | 153 | gocognit: 154 | # Minimal cognitive complexity to report 155 | min-complexity: 20 156 | 157 | funlen: 158 | lines: 100 159 | statements: 50 160 | 161 | lll: 162 | line-length: 140 # Slightly more than default 120 163 | 164 | # Performance 165 | goconst: 166 | min-len: 3 167 | min-occurrences: 3 168 | 169 | prealloc: 170 | # Report preallocation suggestions only on simple loops 171 | simple: true 172 | range-loops: true 173 | for-loops: true 174 | 175 | # Best practices 176 | dogsled: 177 | # Maximum number of blank identifiers 178 | max-blank-identifiers: 2 179 | 180 | dupl: 181 | # Minimum lines to consider as duplicate 182 | threshold: 150 183 | 184 | godox: 185 | keywords: 186 | - TODO 187 | - FIXME 188 | - BUG 189 | - HACK 190 | - XXX 191 | 192 | issues: 193 | # Excluding configuration per-path, per-linter, per-text and per-source 194 | exclude-rules: 195 | # Exclude some linters from running on tests files 196 | - path: _test\.go 197 | linters: 198 | - gocyclo 199 | - errcheck 200 | - dupl 201 | - gosec 202 | - funlen 203 | 204 | # Exclude known issues in main.go (legacy code) 205 | - path: cmd/rxtls/main.go 206 | linters: 207 | - funlen 208 | - gocyclo 209 | - gocognit 210 | 211 | # Allow TODO comments in certain files 212 | - path: "(.*)?TODO(.*)?go" 213 | linters: 214 | - godox 215 | 216 | # Exclude vendor, if any 217 | - path: vendor/ 218 | linters: [all] 219 | 220 | # Maximum issues count per one linter 221 | max-issues-per-linter: 50 222 | 223 | # Maximum count of issues with the same text 224 | max-same-issues: 3 225 | 226 | # Show only new issues 227 | new: false 228 | 229 | severity: 230 | # Default value is empty string 231 | default-severity: warning 232 | 233 | # If set to true, the severity-rules regular expressions become case-sensitive 234 | case-sensitive: false 235 | 236 | rules: 237 | - linters: 238 | - gosec 239 | severity: error 240 | - linters: 241 | - staticcheck 242 | - errcheck 243 | - bodyclose 244 | severity: error 245 | - linters: 246 | - revive 247 | - govet 248 | severity: warning -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=https://goreleaser.com/static/schema.json 2 | version: 2 3 | 4 | before: 5 | hooks: 6 | - go mod download 7 | - go mod verify 8 | 9 | builds: 10 | - main: ./cmd/rxtls 11 | binary: rxtls 12 | id: rxtls 13 | env: 14 | - CGO_ENABLED=0 15 | goos: [windows, linux, darwin, freebsd] 16 | goarch: [amd64, '386', arm, arm64] 17 | goarm: ['6', '7'] 18 | ignore: 19 | - goos: windows 20 | goarch: arm 21 | - goos: windows 22 | goarch: arm64 23 | - goos: darwin 24 | goarch: '386' 25 | - goos: darwin 26 | goarch: arm 27 | - goos: freebsd 28 | goarch: arm 29 | flags: 30 | - -trimpath 31 | ldflags: 32 | - -s -w 33 | - -X main.version={{.Version}} 34 | - -X main.commit={{.ShortCommit}} 35 | - -X main.date={{.Date}} 36 | - -X main.builtBy=goreleaser 37 | 38 | archives: 39 | - id: rxtls 40 | ids: [rxtls] 41 | name_template: '{{ .ProjectName }}_{{ .Version }}_{{ if eq .Os "darwin" }}macOS{{ else }}{{ .Os }}{{ end }}_{{ .Arch }}{{ if .Arm 
}}v{{ .Arm }}{{ end }}' 42 | format_overrides: 43 | - goos: windows 44 | formats: [zip] 45 | files: 46 | - LICENSE 47 | - README.md 48 | 49 | checksum: 50 | name_template: '{{ .ProjectName }}_{{ .Version }}_checksums.txt' 51 | algorithm: sha256 52 | 53 | snapshot: 54 | version_template: '{{ incpatch .Version }}-dev' 55 | 56 | changelog: 57 | sort: asc 58 | use: github 59 | filters: 60 | exclude: 61 | - '^docs:' 62 | - '^test:' 63 | - '^chore:' 64 | - typo 65 | - Merge pull request 66 | - Merge branch 67 | groups: 68 | - title: 'New Features' 69 | regexp: '^.*?feat(\([[:word:]]+\))??!?:.+$' 70 | order: 0 71 | - title: 'Bug Fixes' 72 | regexp: '^.*?fix(\([[:word:]]+\))??!?:.+$' 73 | order: 1 74 | - title: 'Performance Improvements' 75 | regexp: '^.*?perf(\([[:word:]]+\))??!?:.+$' 76 | order: 2 77 | - title: 'Code Refactoring' 78 | regexp: '^.*?refactor(\([[:word:]]+\))??!?:.+$' 79 | order: 3 80 | - title: Other 81 | order: 999 82 | 83 | release: 84 | github: 85 | owner: x-stp 86 | name: rxtls 87 | prerelease: auto 88 | draft: false 89 | name_template: '{{ .Tag }}' 90 | header: | 91 | ## rxtls {{ .Tag }} 92 | 93 | High-Performance Certificate Transparency Processor 94 | footer: | 95 | ## Installation 96 | 97 | ### Binary 98 | Download the appropriate binary for your platform from the assets below. 99 | 100 | ### Homebrew (macOS/Linux) 101 | ```bash 102 | brew tap x-stp/rxtls 103 | brew install rxtls 104 | ``` 105 | 106 | ### Docker 107 | ```bash 108 | docker pull ghcr.io/x-stp/rxtls:{{ .Tag }} 109 | ``` 110 | 111 | **Full documentation**: https://github.com/x-stp/rxtls#readme 112 | 113 | dockers: 114 | - image_templates: 115 | - 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-amd64' 116 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-amd64' 117 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-amd64' 118 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-amd64' 119 | dockerfile: Dockerfile.goreleaser 120 | use: buildx 121 | build_flag_templates: 122 | - '--pull' 123 | - '--platform=linux/amd64' 124 | - '--label=org.opencontainers.image.created={{ .Date }}' 125 | - '--label=org.opencontainers.image.title={{ .ProjectName }}' 126 | - '--label=org.opencontainers.image.revision={{ .FullCommit }}' 127 | - '--label=org.opencontainers.image.version={{ .Version }}' 128 | - '--label=org.opencontainers.image.source=https://github.com/x-stp/rxtls' 129 | - '--label=org.opencontainers.image.licenses=AGPL-3.0' 130 | goarch: amd64 131 | goos: linux 132 | 133 | - image_templates: 134 | - 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-arm64' 135 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-arm64' 136 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-arm64' 137 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-arm64' 138 | dockerfile: Dockerfile.goreleaser 139 | use: buildx 140 | build_flag_templates: 141 | - '--pull' 142 | - '--platform=linux/arm64' 143 | - '--label=org.opencontainers.image.created={{ .Date }}' 144 | - '--label=org.opencontainers.image.title={{ .ProjectName }}' 145 | - '--label=org.opencontainers.image.revision={{ .FullCommit }}' 146 | - '--label=org.opencontainers.image.version={{ .Version }}' 147 | - '--label=org.opencontainers.image.source=https://github.com/x-stp/rxtls' 148 | - '--label=org.opencontainers.image.licenses=AGPL-3.0' 149 | goarch: arm64 150 | goos: linux 151 | 152 | docker_manifests: 153 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}' 154 | image_templates: 155 | - 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-amd64' 156 | - 
'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-arm64' 157 | 158 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}' 159 | image_templates: 160 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-amd64' 161 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-arm64' 162 | 163 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}' 164 | image_templates: 165 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-amd64' 166 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-arm64' 167 | 168 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:latest' 169 | image_templates: 170 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-amd64' 171 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-arm64' 172 | 173 | homebrew_casks: 174 | - repository: 175 | owner: x-stp 176 | name: homebrew-rxtls 177 | token: '{{ .Env.HOMEBREW_TAP_TOKEN }}' 178 | name: rxtls 179 | directory: Casks 180 | homepage: 'https://github.com/x-stp/rxtls' 181 | description: 'High-Performance Certificate Transparency Processor' 182 | license: 'AGPL-3.0' 183 | conflicts: 184 | - formula: rxtls 185 | 186 | nfpms: 187 | - id: rxtls 188 | package_name: rxtls 189 | formats: 190 | - deb 191 | - rpm 192 | - apk 193 | vendor: 'x-stp' 194 | homepage: 'https://github.com/x-stp/rxtls' 195 | maintainer: 'Pepijn van der Stap ' 196 | description: 'High-Performance Certificate Transparency Processor' 197 | license: 'AGPL-3.0' 198 | dependencies: 199 | - ca-certificates 200 | section: net 201 | priority: optional 202 | contents: 203 | - src: ./LICENSE 204 | dst: /usr/share/doc/rxtls/LICENSE 205 | - src: ./README.md 206 | dst: /usr/share/doc/rxtls/README.md -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage with multi-arch support 2 | FROM --platform=$BUILDPLATFORM golang:1.24-alpine AS builder 3 | 4 | # Build arguments for cross-compilation 5 | ARG TARGETOS 6 | ARG TARGETARCH 7 | ARG TARGETVARIANT 8 | 9 | # Install git and ca-certificates 10 | RUN apk add --no-cache git ca-certificates 11 | 12 | # Set working directory 13 | WORKDIR /build 14 | 15 | # Copy go mod files 16 | COPY go.mod go.sum ./ 17 | 18 | # Download dependencies 19 | RUN go mod download 20 | 21 | # Copy source code 22 | COPY . . 
23 | 24 | # Build the binary for target architecture 25 | RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \ 26 | -ldflags="-w -s" \ 27 | -o rxtls \ 28 | ./cmd/rxtls 29 | 30 | # Final stage - use scratch for minimal image 31 | FROM scratch 32 | 33 | # Copy ca-certificates from builder 34 | COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ 35 | 36 | # Copy the binary 37 | COPY --from=builder /build/rxtls /usr/local/bin/rxtls 38 | 39 | # Set entrypoint 40 | 41 | ENTRYPOINT ["/usr/local/bin/rxtls"] 42 | -------------------------------------------------------------------------------- /Dockerfile.goreleaser: -------------------------------------------------------------------------------- 1 | # Dockerfile for GoReleaser 2 | # This is a minimal Dockerfile that only copies the pre-built binary 3 | FROM scratch 4 | 5 | # Copy ca-certificates for HTTPS connections 6 | COPY --from=alpine:latest /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ 7 | 8 | # Copy the pre-built binary from GoReleaser 9 | COPY rxtls /usr/local/bin/rxtls 10 | 11 | # Set entrypoint 12 | ENTRYPOINT ["/usr/local/bin/rxtls"] -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for rxtls 2 | 3 | # Variables 4 | BINARY_NAME := rxtls 5 | MAIN_PATH := ./cmd/rxtls 6 | BUILD_DIR := ./dist 7 | COVERAGE_FILE := coverage.out 8 | 9 | # Go parameters 10 | GOCMD := go 11 | GOBUILD := $(GOCMD) build 12 | GOCLEAN := $(GOCMD) clean 13 | GOTEST := $(GOCMD) test 14 | GOGET := $(GOCMD) get 15 | GOMOD := $(GOCMD) mod 16 | GOFMT := gofmt 17 | GOVET := $(GOCMD) vet 18 | 19 | # Build flags 20 | LDFLAGS := -s -w 21 | BUILD_FLAGS := -trimpath -ldflags "$(LDFLAGS)" 22 | 23 | # Linting 24 | GOLANGCI_LINT_VERSION := v1.54.2 25 | GOLANGCI_LINT := $(shell which golangci-lint 2> /dev/null) 26 | 27 | .PHONY: all build clean test lint lint-install lint-fix security fmt vet tidy help 28 | 29 | # Default target 30 | all: lint test build 31 | 32 | # Build the binary 33 | build: 34 | @echo "Building $(BINARY_NAME)..." 35 | @$(GOBUILD) $(BUILD_FLAGS) -o $(BINARY_NAME) $(MAIN_PATH) 36 | @echo "Build complete: ./$(BINARY_NAME)" 37 | 38 | # Clean build artifacts 39 | clean: 40 | @echo "Cleaning..." 41 | @$(GOCLEAN) 42 | @rm -f $(BINARY_NAME) 43 | @rm -rf $(BUILD_DIR) 44 | @rm -f $(COVERAGE_FILE) 45 | @echo "Clean complete" 46 | 47 | # Run tests 48 | test: 49 | @echo "Running tests..." 50 | @$(GOTEST) -v -race -cover ./... 51 | 52 | # Run tests with coverage 53 | test-coverage: 54 | @echo "Running tests with coverage..." 55 | @$(GOTEST) -v -race -coverprofile=$(COVERAGE_FILE) -covermode=atomic ./... 56 | @$(GOCMD) tool cover -html=$(COVERAGE_FILE) -o coverage.html 57 | @echo "Coverage report generated: coverage.html" 58 | 59 | # Install golangci-lint if not present 60 | lint-install: 61 | ifndef GOLANGCI_LINT 62 | @echo "Installing golangci-lint $(GOLANGCI_LINT_VERSION)..." 63 | @curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin $(GOLANGCI_LINT_VERSION) 64 | else 65 | @echo "golangci-lint is already installed at $(GOLANGCI_LINT)" 66 | endif 67 | 68 | # Run linters 69 | lint: lint-install 70 | @echo "Running linters..." 71 | @golangci-lint run --timeout=5m ./... 72 | 73 | # Run linters and fix issues where possible 74 | lint-fix: lint-install 75 | @echo "Running linters with auto-fix..." 
76 | @golangci-lint run --fix --timeout=5m ./... 77 | 78 | # Run security-focused linters only 79 | security: lint-install 80 | @echo "Running security checks..." 81 | @golangci-lint run --disable-all --enable=gosec,exportloopref,bodyclose --timeout=5m ./... 82 | 83 | # Run gosec directly with more detailed output 84 | gosec: 85 | @echo "Running gosec security scanner..." 86 | @gosec -fmt=json -out=gosec-report.json -stdout -verbose=text -severity=medium ./... || true 87 | @echo "Security report saved to gosec-report.json" 88 | 89 | # Format code 90 | fmt: 91 | @echo "Formatting code..." 92 | @$(GOFMT) -s -w . 93 | @$(GOCMD) fmt ./... 94 | 95 | # Run go vet 96 | vet: 97 | @echo "Running go vet..." 98 | @$(GOVET) ./... 99 | 100 | # Tidy dependencies 101 | tidy: 102 | @echo "Tidying dependencies..." 103 | @$(GOMOD) tidy 104 | @$(GOMOD) verify 105 | 106 | # Quick check - format, vet, and lint 107 | check: fmt vet lint 108 | 109 | # CI/CD oriented target - strict checking 110 | ci: tidy fmt vet lint test 111 | 112 | # Install all development dependencies 113 | dev-deps: lint-install 114 | @echo "Installing development dependencies..." 115 | @$(GOGET) github.com/securego/gosec/v2/cmd/gosec@latest 116 | @echo "Development dependencies installed" 117 | 118 | # Show help 119 | help: 120 | @echo "Available targets:" 121 | @echo " make build - Build the binary" 122 | @echo " make test - Run tests" 123 | @echo " make test-coverage - Run tests with coverage report" 124 | @echo " make lint - Run all linters" 125 | @echo " make lint-fix - Run linters with auto-fix" 126 | @echo " make security - Run security-focused linters" 127 | @echo " make gosec - Run gosec security scanner" 128 | @echo " make fmt - Format code" 129 | @echo " make vet - Run go vet" 130 | @echo " make tidy - Tidy go modules" 131 | @echo " make check - Quick check (fmt, vet, lint)" 132 | @echo " make ci - Full CI check" 133 | @echo " make clean - Clean build artifacts" 134 | @echo " make dev-deps - Install development dependencies" 135 | @echo " make help - Show this help message" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rxtls - High-Performance Certificate Transparency Processor 2 | 3 | rxtls is a high-throughput, fault-tolerant Certificate Transparency log processor designed for hyperscale environments. It provides efficient processing of CT logs with dynamic backpressure handling, adaptive rate limiting, and comprehensive observability. 
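
For a first run, the flow typically looks like this (an illustrative session; the exact commands and flags are documented under Usage below):

```bash
# See which CT logs are available
rxtls list

# Extract domains from interactively selected logs into CSV files
rxtls domains --output output/domains --concurrency 4
```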
4 | 5 | ## Features 6 | 7 | - **High Throughput**: Process CT logs efficiently with a worker pool architecture 8 | - **Fault Tolerance**: Automatic retries, backpressure handling, and graceful shutdown 9 | - **Dynamic Rate Limiting**: Adaptive rate control based on success/failure patterns 10 | - **Observability**: Prometheus metrics for monitoring and alerting 11 | - **Configurable**: CLI flags for customizing behavior and CT log sources 12 | - **Versatile**: Download raw certificates or extract domains from CT logs 13 | 14 | ## Architecture 15 | 16 | The system consists of several key components: 17 | 18 | ### Scheduler 19 | - Manages a pool of workers 20 | - Distributes work using least-loaded worker selection 21 | - Implements graceful shutdown 22 | - Provides statistics and metrics 23 | 24 | ### Workers 25 | - Process work items from their queues 26 | - Implement backpressure handling 27 | - Track success/failure metrics 28 | - Support CPU affinity for optimal performance 29 | 30 | ### Rate Limiter 31 | - Dynamic rate adjustment based on success/failure 32 | - Token bucket implementation for smooth rate limiting 33 | - Backpressure integration 34 | - Atomic operations for thread safety 35 | 36 | ### Metrics 37 | - Prometheus integration for monitoring 38 | - Queue pressure tracking 39 | - Success/failure rate monitoring 40 | - Resource utilization metrics 41 | 42 | ## Usage 43 | 44 | The tool provides several subcommands: 45 | 46 | ```bash 47 | # List available CT logs 48 | rxtls list 49 | 50 | # Download certificates from CT logs 51 | rxtls download 52 | 53 | # Extract domains from certificates in CT logs 54 | rxtls domains 55 | 56 | # Fetch and save the CT logs list to a local file 57 | rxtls fetch-logs 58 | 59 | # Direct processing with URI (legacy mode) 60 | rxtls --ct-uri https://ct.example.com/log 61 | ``` 62 | 63 | ### Global Flags 64 | 65 | ```bash 66 | # Use local logs list instead of fetching from internet 67 | rxtls --local-logs [command] 68 | 69 | # Customize worker pool size 70 | rxtls --workers 8 71 | 72 | # Set initial rate limit 73 | rxtls --rate-limit 1000 74 | 75 | # Enable debug logging 76 | rxtls --debug 77 | 78 | # Configure Prometheus metrics port 79 | rxtls --metrics-port 9090 80 | ``` 81 | 82 | ### Download Command 83 | 84 | ```bash 85 | # Basic download with interactive log selection 86 | rxtls download 87 | 88 | # Specify output directory 89 | rxtls download --output /path/to/output 90 | 91 | # Configure concurrency 92 | rxtls download --concurrency 10 93 | 94 | # Adjust buffer size 95 | rxtls download --buffer 262144 96 | 97 | # Enable compression 98 | rxtls download --compress 99 | 100 | # Enable high-speed mode 101 | rxtls download --turbo 102 | ``` 103 | 104 | ### Domains Command 105 | 106 | ```bash 107 | # Basic domain extraction with interactive log selection 108 | rxtls domains 109 | 110 | # Specify output directory 111 | rxtls domains --output /path/to/domains 112 | 113 | # Configure concurrency 114 | rxtls domains --concurrency 10 115 | 116 | # Adjust buffer size 117 | rxtls domains --buffer 32768 118 | 119 | # Enable compression 120 | rxtls domains --compress 121 | 122 | # Enable high-speed mode 123 | rxtls domains --turbo 124 | ``` 125 | 126 | ## Configuration 127 | 128 | ### CLI Flags 129 | 130 | - `--ct-uri`: CT log URI to process (default: from config) 131 | - `--workers`: Number of worker goroutines (default: runtime.NumCPU()) 132 | - `--rate-limit`: Initial rate limit in requests/second (default: 100) 133 | - `--debug`: Enable debug logging 
134 | - `--metrics-port`: Prometheus metrics port (default: 9090) 135 | - `--local-logs`: Use local logs list instead of fetching from internet 136 | 137 | ### Environment Variables 138 | 139 | - `RXTLS_CONFIG`: Path to config file 140 | - `RXTLS_LOG_LEVEL`: Log level (debug, info, warn, error) 141 | - `RXTLS_METRICS_PORT`: Prometheus metrics port 142 | 143 | ## Metrics 144 | 145 | The following Prometheus metrics are exposed: 146 | 147 | - `rxtls_worker_queue_size`: Current size of worker queues 148 | - `rxtls_worker_queue_pressure`: Queue pressure (0-1) 149 | - `rxtls_worker_processed_total`: Total processed items 150 | - `rxtls_worker_errors_total`: Total errors 151 | - `rxtls_rate_limit_current`: Current rate limit 152 | - `rxtls_rate_limit_success_total`: Total successful requests 153 | - `rxtls_rate_limit_failure_total`: Total failed requests 154 | 155 | ## Development 156 | 157 | ### Prerequisites 158 | 159 | - Go 1.24 or later 160 | 161 | ### Building 162 | 163 | ```bash 164 | # Build binary 165 | go build 166 | 167 | # Run tests 168 | go test ./... 169 | 170 | # Run benchmarks 171 | go test -bench=. ./... 172 | ``` 173 | 174 | ### Testing 175 | 176 | The codebase includes comprehensive tests: 177 | 178 | - Unit tests for all components 179 | - Integration tests for the full pipeline 180 | - Benchmarks for performance testing 181 | - Race condition detection enabled 182 | 183 | ## License 184 | 185 | GNU Affero General Public License v3 - see LICENSE file for details 186 | 187 | -------------------------------------------------------------------------------- /cmd/rxtls/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package main is the entry point for the rxtls command-line application. 3 | 4 | rxtls is a tool designed for interacting with Certificate Transparency (CT) logs. 5 | Its primary functionalities include: 6 | - Listing available CT logs. 7 | - Downloading raw certificate entries (as base64 blobs) from specified CT logs. 8 | - Extracting domain names (Common Name and Subject Alternative Names) from certificate entries 9 | and saving them to CSV files. 10 | - Fetching and caching the official list of CT logs. 11 | 12 | The application uses the Cobra library for command-line interface structure and flag parsing. 13 | It leverages several internal packages: 14 | - `internal/certlib`: For CT log interaction logic, data models, and parsing certificate entries. 15 | - `internal/client`: For a configurable HTTP client used for network requests. 16 | - `internal/core`: For the core processing engine, including a concurrent scheduler, download manager, 17 | and domain extractor. 18 | - `internal/metrics`: For exposing Prometheus metrics for monitoring application performance. 19 | 20 | Global flags allow users to specify options like using a local log list cache. 21 | Subcommands (`list`, `download`, `domains`, `fetch-logs`) provide access to different functionalities, 22 | each with its own set of specific flags for configuration (e.g., output directory, concurrency). 23 | 24 | The main function initializes a Prometheus metrics server and then either processes a single CT log URI 25 | (if provided directly as a flag without a subcommand) or executes the appropriate Cobra subcommand. 26 | Graceful shutdown is handled via context cancellation triggered by OS signals (SIGINT, SIGTERM). 
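
An illustrative invocation (the flags shown are defined on the subcommands, with defaults documented there):

	rxtls domains --output output/domains --concurrency 4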
27 | */ 28 | package main 29 | 30 | /* 31 | rxtls — fast tool in Go for working with Certificate Transparency logs 32 | Copyright (C) 2025 Pepijn van der Stap 33 | 34 | This program is free software: you can redistribute it and/or modify 35 | it under the terms of the GNU Affero General Public License as published by 36 | the Free Software Foundation, either version 3 of the License, or 37 | (at your option) any later version. 38 | 39 | This program is distributed in the hope that it will be useful, 40 | but WITHOUT ANY WARRANTY; without even the implied warranty of 41 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 42 | GNU Affero General Public License for more details. 43 | 44 | You should have received a copy of the GNU Affero General Public License 45 | along with this program. If not, see . 46 | */ 47 | 48 | import ( 49 | "bufio" 50 | "context" 51 | "errors" 52 | "flag" 53 | "fmt" 54 | "io" 55 | "log" 56 | "net/http" 57 | "os" 58 | "os/signal" 59 | "runtime" 60 | "strconv" 61 | "strings" 62 | "sync" 63 | "syscall" 64 | "time" 65 | 66 | "github.com/spf13/cobra" 67 | "github.com/x-stp/rxtls/internal/certlib" 68 | "github.com/x-stp/rxtls/internal/client" 69 | "github.com/x-stp/rxtls/internal/core" 70 | "github.com/x-stp/rxtls/internal/metrics" 71 | ) 72 | 73 | // Global flags (persistent across commands) 74 | var useLocalLogs bool 75 | 76 | // Flags specific to the download command 77 | var ( 78 | outputDir string 79 | maxConcurrentLogs int 80 | bufferSize int 81 | showStats bool 82 | turbo bool 83 | compress bool 84 | logsFile string // Added for fetch-logs command 85 | ctURI = flag.String("ct-uri", "", "CT log URI to process (overrides config)") 86 | workers = flag.Int("workers", runtime.NumCPU(), "Number of worker goroutines") 87 | rateLimit = flag.Float64("rate-limit", 100, "Initial rate limit in requests/second") 88 | debug = flag.Bool("debug", false, "Enable debug logging") 89 | metricsPort = flag.Int("metrics-port", 9090, "Prometheus metrics port") 90 | ) 91 | 92 | var rootCmd = &cobra.Command{ 93 | Use: "rxtls", 94 | Short: "rxtls - A Certificate Transparency Log (domain/b64 blob) downloader and processor", 95 | PersistentPreRun: func(cmd *cobra.Command, args []string) { 96 | // Enable local logs if requested (applies to all commands) 97 | if useLocalLogs { 98 | certlib.UseLocalLogs = true 99 | log.Println("Using local logs list enabled.") 100 | } 101 | }, 102 | } 103 | 104 | var listCmd = &cobra.Command{ 105 | Use: "list", 106 | Short: "List all available Certificate Transparency logs", 107 | Run: func(cmd *cobra.Command, args []string) { 108 | listLogs() 109 | }, 110 | } 111 | 112 | var downloadCmd = &cobra.Command{ 113 | Use: "download", 114 | Short: "Download certificates (full B64 blob) from selected CT logs", 115 | Run: func(cmd *cobra.Command, args []string) { 116 | // Flags are parsed by Cobra and available via the variables 117 | downloadLogs(outputDir, maxConcurrentLogs, bufferSize, showStats, compress, turbo) 118 | }, 119 | } 120 | 121 | var domainsCmd = &cobra.Command{ 122 | Use: "domains", 123 | Short: "Extract domains from selected CT logs and save to CSV", 124 | Long: `Extracts domains (CN and SANs) from certificates found in selected CT logs. 
Output is a CSV file per log with format: offset,cn,primary_domain,all_domains_json,country,org,issuer_cn,domain_org_hash`, 125 | Run: func(cmd *cobra.Command, args []string) { 126 | // Call the new core function for domain extraction 127 | extractDomains(outputDir, maxConcurrentLogs, bufferSize, showStats, turbo, compress) 128 | }, 129 | } 130 | 131 | var fetchLogsCmd = &cobra.Command{ 132 | Use: "fetch-logs", 133 | Short: "Fetch and save the CT logs list to a local file", 134 | Run: func(cmd *cobra.Command, args []string) { 135 | fetchAndSaveLogs() 136 | }, 137 | } 138 | 139 | func init() { 140 | // Persistent flags (available for all commands) 141 | rootCmd.PersistentFlags().BoolVar(&useLocalLogs, "local-logs", false, "Use local all_logs_list.json instead of fetching from internet") 142 | 143 | // Flags for the download command 144 | downloadCmd.Flags().StringVarP(&outputDir, "output", "o", "output/certs", "Output directory for certificate blobs") 145 | downloadCmd.Flags().IntVarP(&maxConcurrentLogs, "concurrency", "c", 0, "Maximum number of concurrent logs to process (0 for auto based on CPU)") 146 | downloadCmd.Flags().IntVarP(&bufferSize, "buffer", "b", core.DefaultDiskBufferSize, "Internal buffer size in bytes for disk I/O") 147 | downloadCmd.Flags().BoolVarP(&showStats, "stats", "s", true, "Show statistics during processing") 148 | downloadCmd.Flags().BoolVar(&compress, "compress", false, "Compress output CSV files") 149 | downloadCmd.Flags().BoolVar(&turbo, "turbo", false, "Enable high-speed mode (DNS prewarm, persistent connections)") 150 | 151 | // Flags for the domains command (sharing some with download) 152 | domainsCmd.Flags().StringVarP(&outputDir, "output", "o", "output/domains", "Output directory for domain CSV files") // Default to subfolder 153 | domainsCmd.Flags().IntVarP(&maxConcurrentLogs, "concurrency", "c", 0, "Maximum number of concurrent logs to process (0 for auto based on CPU)") 154 | domainsCmd.Flags().IntVarP(&bufferSize, "buffer", "b", 32768, "Internal buffer size in bytes") 155 | domainsCmd.Flags().BoolVarP(&showStats, "stats", "s", true, "Show statistics during processing") 156 | domainsCmd.Flags().BoolVar(&turbo, "turbo", false, "Enable high-speed mode (DNS prewarm, persistent connections)") // Added turbo flag 157 | domainsCmd.Flags().BoolVar(&compress, "compress", false, "Compress output CSV files") 158 | 159 | // Flags for the fetch-logs command 160 | fetchLogsCmd.Flags().StringVarP(&logsFile, "output", "o", certlib.LocalLogsFile, "Output file for CT logs list") 161 | 162 | // Add subcommands to the root command 163 | rootCmd.AddCommand(listCmd) 164 | rootCmd.AddCommand(downloadCmd) 165 | rootCmd.AddCommand(domainsCmd) 166 | rootCmd.AddCommand(fetchLogsCmd) 167 | } 168 | 169 | func main() { 170 | flag.Parse() 171 | 172 | // Initialize metrics 173 | metrics.EnableMetrics() 174 | if err := metrics.StartMetricsServer(fmt.Sprintf(":%d", *metricsPort)); err != nil { 175 | log.Fatalf("Failed to start metrics server: %v", err) 176 | } 177 | 178 | // Only process -ct-uri directly if specified and no cobra command is used 179 | if *ctURI != "" && len(os.Args) == 1 { 180 | // Create output directory 181 | if err := os.MkdirAll(outputDir, 0755); err != nil { 182 | log.Fatalf("Failed to create output directory: %v", err) 183 | } 184 | 185 | // Create scheduler 186 | ctx := context.Background() 187 | scheduler, err := core.NewScheduler(ctx) 188 | if err != nil { 189 | log.Fatalf("Failed to create scheduler: %v", err) 190 | } 191 | defer scheduler.Shutdown() 192 | 
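		// processCTLog (below) walks the log in fixed-size batches of 1,000 entries, submitting one
		// work item per batch to the scheduler; scheduler.Wait() afterwards blocks until all batches drain.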
193 | // Process CT log 194 | if err := processCTLog(ctx, *ctURI, scheduler); err != nil { 195 | log.Fatalf("Error processing CT log: %v", err) 196 | } 197 | 198 | // Wait for all work to complete 199 | scheduler.Wait() 200 | } else { 201 | // Execute cobra command 202 | if err := rootCmd.Execute(); err != nil { 203 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) 204 | os.Exit(1) 205 | } 206 | } 207 | } 208 | 209 | func processCTLog(ctx context.Context, uri string, scheduler *core.Scheduler) error { 210 | // Create log info 211 | logInfo := &certlib.CTLogInfo{ 212 | URL: uri, 213 | } 214 | 215 | // Get log info 216 | if err := certlib.GetLogInfo(logInfo); err != nil { 217 | return err 218 | } 219 | 220 | // Process entries in batches 221 | batchSize := 1000 222 | for start := 0; start < int(logInfo.TreeSize); start += batchSize { 223 | end := min(start+batchSize, int(logInfo.TreeSize)) 224 | 225 | // Submit work for this batch 226 | err := scheduler.SubmitWork(ctx, logInfo, int64(start), int64(end), func(item *core.WorkItem) error { 227 | // Process entries in this batch 228 | entries, err := certlib.DownloadEntries(ctx, logInfo, int(item.Start), int(item.End)) 229 | if err != nil { 230 | return err 231 | } 232 | 233 | // Process each entry 234 | for _, entry := range entries.Entries { 235 | // Parse certificate data 236 | certData, err := certlib.ParseCertificateEntry(entry.LeafInput, entry.ExtraData, logInfo.URL) 237 | if err != nil { 238 | log.Printf("Error parsing certificate entry: %v", err) 239 | continue 240 | } 241 | 242 | // Write domains to file 243 | if len(certData.AllDomains) > 0 { 244 | // Create domains file for this batch 245 | domainsFile := outputDir + "/domains_" + logInfo.URL + "_" + strconv.FormatInt(item.Start, 10) + "_" + strconv.FormatInt(item.End, 10) + ".txt" 246 | f, err := os.OpenFile(domainsFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 247 | if err != nil { 248 | log.Printf("Error opening domains file: %v", err) 249 | continue 250 | } 251 | 252 | // Write domains 253 | for _, domain := range certData.AllDomains { 254 | if _, err := f.WriteString(domain + "\n"); err != nil { 255 | log.Printf("Error writing domain: %v", err) 256 | } 257 | } 258 | 259 | f.Close() 260 | } 261 | 262 | // Write certificate data 263 | if certData.AsDER != "" { 264 | // Create certificates file for this batch 265 | certsFile := outputDir + "/certs_" + logInfo.URL + "_" + strconv.FormatInt(item.Start, 10) + "_" + strconv.FormatInt(item.End, 10) + ".pem" 266 | f, err := os.OpenFile(certsFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 267 | if err != nil { 268 | log.Printf("Error opening certificates file: %v", err) 269 | continue 270 | } 271 | 272 | // Write certificate 273 | if _, err := f.WriteString("-----BEGIN CERTIFICATE-----\n"); err != nil { 274 | log.Printf("Error writing certificate header: %v", err) 275 | } 276 | if _, err := f.WriteString(certData.AsDER); err != nil { 277 | log.Printf("Error writing certificate data: %v", err) 278 | } 279 | if _, err := f.WriteString("\n-----END CERTIFICATE-----\n"); err != nil { 280 | log.Printf("Error writing certificate footer: %v", err) 281 | } 282 | 283 | f.Close() 284 | } 285 | } 286 | 287 | return nil 288 | }) 289 | 290 | if err != nil { 291 | log.Printf("Error submitting work for batch %d-%d: %v", start, end, err) 292 | continue 293 | } 294 | } 295 | 296 | return nil 297 | } 298 | 299 | func listLogs() { 300 | logs, err := core.ListCTLogs() 301 | if err != nil { 302 | log.Fatalf("Error listing CT logs: %v", err) 303 | } 304 | 305 | // 
Display each log 306 | for _, logEntry := range logs { // Renamed log to logEntry to avoid conflict with log package 307 | fmt.Printf("%s\n", logEntry.Description) 308 | fmt.Printf(" \\- URL: %s\n", logEntry.URL) 309 | fmt.Printf(" \\- Owner: %s\n", logEntry.OperatedBy) 310 | fmt.Printf(" \\- State: %s\n", getLogState(logEntry)) 311 | fmt.Println() 312 | } 313 | 314 | // Print final count 315 | fmt.Printf("Found %d Certificate Transparency Logs\n", len(logs)) 316 | } 317 | 318 | func getLogState(logInfo certlib.CTLogInfo) string { // Renamed log to logInfo 319 | // Get log info to determine state 320 | if err := certlib.GetLogInfo(&logInfo); err != nil { 321 | return "Unknown (error getting info)" 322 | } 323 | 324 | if logInfo.TreeSize == 0 { 325 | return "Empty" 326 | } 327 | 328 | return fmt.Sprintf("Active (%d certificates)", logInfo.TreeSize) 329 | } 330 | 331 | // downloadLogs is the handler for the 'download' command. 332 | func downloadLogs(outputDir string, maxConcurrentLogs int, bufferSize int, showStats bool, compress bool, turbo bool) { 333 | log.Printf("Starting certificate download: output='%s', concurrency=%d, buffer=%d, stats=%t, compress=%t, turbo=%t", 334 | outputDir, maxConcurrentLogs, bufferSize, showStats, compress, turbo) 335 | 336 | // Initialize HTTP client with turbo mode if requested 337 | if turbo { 338 | log.Println("Enabling turbo mode for HTTP client") 339 | client.ConfigureTurboMode() 340 | } 341 | 342 | // 1. List logs for selection 343 | allLogs, err := core.ListCTLogs() 344 | if err != nil { 345 | log.Fatalf("Error listing CT logs for selection: %v", err) 346 | } 347 | if len(allLogs) == 0 { 348 | log.Fatalf("No CT logs found to select from.") 349 | } 350 | 351 | // 2. Display and prompt for selection 352 | fmt.Println("Available Certificate Transparency Logs:") 353 | for i, lg := range allLogs { 354 | fmt.Printf(" [%d] %s (%s)\n", i+1, lg.Description, lg.URL) 355 | } 356 | fmt.Println(" [all] Download from all logs") 357 | fmt.Print("Enter log number(s) to download from (e.g., 1,3,5 or all): ") 358 | reader := bufio.NewReader(os.Stdin) 359 | input, _ := reader.ReadString('\n') 360 | input = strings.TrimSpace(input) 361 | var selectedLogs []certlib.CTLogInfo 362 | if strings.ToLower(input) == "all" { 363 | selectedLogs = allLogs 364 | fmt.Println("Selected all logs for download.") 365 | } else { 366 | parts := strings.Split(input, ",") 367 | selectedIndices := make(map[int]bool) 368 | for _, part := range parts { 369 | indexStr := strings.TrimSpace(part) 370 | if indexStr == "" { 371 | continue 372 | } 373 | index, err := strconv.Atoi(indexStr) 374 | if err != nil || index < 1 || index > len(allLogs) { 375 | log.Fatalf("Invalid input: %q is not a valid number in the range 1-%d", indexStr, len(allLogs)) 376 | } 377 | if !selectedIndices[index-1] { 378 | selectedLogs = append(selectedLogs, allLogs[index-1]) 379 | selectedIndices[index-1] = true 380 | } 381 | } 382 | if len(selectedLogs) == 0 { 383 | log.Fatalf("No valid logs selected.") 384 | } 385 | fmt.Printf("Selected %d log(s) for download.\n", len(selectedLogs)) 386 | } 387 | // ---------------------------------------------- 388 | 389 | // 3. 
Create and run the download manager 390 | log.Printf("Starting download for %d selected logs...", len(selectedLogs)) 391 | ctx, cancel := context.WithCancel(context.Background()) 392 | defer cancel() 393 | 394 | // Setup signal handling for graceful shutdown 395 | signalChan := make(chan os.Signal, 1) 396 | signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) 397 | go func() { 398 | <-signalChan 399 | log.Println("Interrupt received, initiating graceful shutdown...") 400 | cancel() 401 | }() 402 | 403 | // Create the download manager 404 | config := &core.DownloadConfig{ 405 | OutputDir: outputDir, 406 | BufferSize: bufferSize, 407 | MaxConcurrentLogs: maxConcurrentLogs, 408 | CompressOutput: compress, 409 | } 410 | 411 | // 4. Create and Run the Download Manager 412 | downloader, errManager := core.NewDownloadManager(ctx, config) // Renamed err to errManager 413 | if errManager != nil { 414 | log.Fatalf("Failed to create download manager: %v", errManager) 415 | } 416 | 417 | // 5. Launch Stats Display Goroutine (if enabled) 418 | var statsWg sync.WaitGroup 419 | if showStats { 420 | statsWg.Add(1) 421 | go func() { 422 | defer statsWg.Done() 423 | displayDownloadStats(ctx, downloader) // Swapped order 424 | }() 425 | } 426 | 427 | // 6. Start Download Process (BLOCKING) 428 | if err := downloader.DownloadCertificates(selectedLogs); err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, core.ErrDownloadCancelled) { 429 | log.Printf("Error during certificate download: %v", err) 430 | } 431 | log.Println("Main download process finished or cancelled.") 432 | 433 | // 7. Ensure stats goroutine finishes 434 | if showStats { 435 | log.Println("Waiting for statistics display to finish...") 436 | cancel() // Ensure context is cancelled 437 | statsWg.Wait() 438 | } 439 | 440 | // 8. Display Final Stats 441 | displayFinalDownloadStats(downloader) 442 | log.Println("Certificate download command complete.") 443 | } 444 | 445 | // displayDownloadStats periodically shows download progress. 446 | // ctx should be the first parameter for consistency with Go conventions. 447 | func displayDownloadStats(ctx context.Context, downloader *core.DownloadManager) { 448 | ticker := time.NewTicker(time.Second * 2) 449 | defer ticker.Stop() 450 | startTime := downloader.GetStats().StartTime 451 | log.Println("Starting download statistics display...") 452 | for { 453 | select { 454 | case <-ticker.C: 455 | stats := downloader.GetStats() 456 | elapsed := time.Since(startTime).Seconds() 457 | if elapsed < 0.1 { 458 | elapsed = 0.1 459 | } 460 | processedEntries := stats.ProcessedEntries.Load() 461 | totalEntries := stats.TotalEntries.Load() 462 | failedEntries := stats.FailedEntries.Load() 463 | entriesPerSec := float64(processedEntries) / elapsed 464 | percentDone := 0.0 465 | if totalEntries > 0 { 466 | percentDone = float64(processedEntries+failedEntries) / float64(totalEntries) * 100 467 | } 468 | fmt.Printf("\rProcessed: %d/%d logs | Entries: %d / ~%d (%.1f%%) | Failed: %d | Rate: %.0f ent/s | Written: %.2fMB | Retries: %.2f%%", 469 | stats.ProcessedLogs.Load(), 470 | stats.TotalLogs.Load(), 471 | processedEntries, 472 | totalEntries, 473 | percentDone, 474 | failedEntries, 475 | entriesPerSec, 476 | float64(stats.OutputBytesWritten.Load())/(1024*1024), 477 | stats.GetRetryRate()*100, 478 | ) 479 | case <-ctx.Done(): 480 | fmt.Println("\nDownload stats display stopping.") 481 | return 482 | } 483 | } 484 | } 485 | 486 | // displayFinalDownloadStats shows the summary download statistics. 
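// The overall rate is processed entries divided by elapsed wall-clock time; the first-try percentage
// divides SuccessFirstTry by processedEntries+1 so that a run with zero processed entries does not divide by zero.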
487 | func displayFinalDownloadStats(downloader *core.DownloadManager) { 488 | stats := downloader.GetStats() 489 | elapsed := time.Since(stats.StartTime) 490 | processedEntries := stats.ProcessedEntries.Load() 491 | rate := 0.0 492 | if elapsed.Seconds() > 0 { 493 | rate = float64(processedEntries) / elapsed.Seconds() 494 | } 495 | fmt.Println() // Ensure stats start on a new line 496 | fmt.Printf("\n--- Final Download Statistics ---\n") 497 | fmt.Printf(" Processing Time: %v\n", elapsed.Round(time.Millisecond)) 498 | fmt.Printf(" Total Logs: %d\n", stats.TotalLogs.Load()) 499 | fmt.Printf(" Processed Logs: %d\n", stats.ProcessedLogs.Load()) 500 | fmt.Printf(" Failed Logs: %d\n", stats.FailedLogs.Load()) 501 | fmt.Printf(" Total Entries: ~%d\n", stats.TotalEntries.Load()) 502 | fmt.Printf("Processed Entries: %d (%.2f%% first try)\n", 503 | processedEntries, 504 | float64(stats.SuccessFirstTry.Load())/float64(processedEntries+1)*100) // +1 to avoid div by zero if no entries 505 | fmt.Printf(" Failed Entries: %d\n", stats.FailedEntries.Load()) 506 | fmt.Printf(" Overall Rate: %.0f entries/sec\n", rate) 507 | fmt.Printf(" Retry Rate: %.2f%% (Total Retries: %d)\n", 508 | stats.GetRetryRate()*100, stats.RetryCount.Load()) 509 | fmt.Printf(" Output Written: %.2f MB\n", float64(stats.OutputBytesWritten.Load())/(1024*1024)) 510 | fmt.Printf("-------------------------------\n") 511 | } 512 | 513 | // extractDomains is the handler for the 'domains' command. 514 | func extractDomains(outputDir string, maxConcurrentLogs int, bufferSize int, showStats bool, turbo bool, compress bool) { 515 | log.Printf("Starting domain extraction: output='%s', concurrency=%d, buffer=%d, stats=%t, turbo=%t, compress=%t", 516 | outputDir, maxConcurrentLogs, bufferSize, showStats, turbo, compress) 517 | 518 | // Initialize HTTP client with turbo mode if requested 519 | if turbo { 520 | log.Println("Enabling turbo mode for HTTP client") 521 | client.ConfigureTurboMode() 522 | } 523 | 524 | // 1. List logs for selection (Could be made non-interactive with flags/args later) 525 | allLogs, err := core.ListCTLogs() 526 | if err != nil { 527 | log.Fatalf("Error listing CT logs for selection: %v", err) 528 | } 529 | if len(allLogs) == 0 { 530 | log.Fatalf("No CT logs found to select from.") 531 | } 532 | 533 | // 2. 
Display and prompt for selection 534 | fmt.Println("Available Certificate Transparency Logs:") 535 | for i, lg := range allLogs { 536 | fmt.Printf(" [%d] %s (%s)\n", i+1, lg.Description, lg.URL) 537 | } 538 | fmt.Println(" [all] Extract from all logs") 539 | fmt.Print("Enter log number(s) to extract domains from (e.g., 1,3,5 or all): ") 540 | reader := bufio.NewReader(os.Stdin) 541 | input, _ := reader.ReadString('\n') 542 | input = strings.TrimSpace(input) 543 | var selectedLogs []certlib.CTLogInfo 544 | if strings.ToLower(input) == "all" { 545 | selectedLogs = allLogs 546 | fmt.Println("Selected all logs for domain extraction.") 547 | } else { 548 | parts := strings.Split(input, ",") 549 | selectedIndices := make(map[int]bool) 550 | for _, part := range parts { 551 | indexStr := strings.TrimSpace(part) 552 | if indexStr == "" { 553 | continue 554 | } 555 | index, err := strconv.Atoi(indexStr) 556 | if err != nil || index < 1 || index > len(allLogs) { 557 | log.Fatalf("Invalid input: %q is not a valid number in the range 1-%d", indexStr, len(allLogs)) 558 | } 559 | if !selectedIndices[index-1] { 560 | selectedLogs = append(selectedLogs, allLogs[index-1]) 561 | selectedIndices[index-1] = true 562 | } 563 | } 564 | if len(selectedLogs) == 0 { 565 | log.Fatalf("No valid logs selected.") 566 | } 567 | fmt.Printf("Selected %d log(s) for domain extraction.\n", len(selectedLogs)) 568 | } 569 | // ----------------------------------------------------- 570 | 571 | // 3. Create DomainExtractor Configuration 572 | config := &core.DomainExtractorConfig{ 573 | OutputDir: outputDir, 574 | BufferSize: bufferSize, 575 | MaxConcurrentLogs: maxConcurrentLogs, 576 | Turbo: turbo, 577 | CompressOutput: compress, 578 | } 579 | 580 | // 4. Setup Context and Signal Handling for graceful shutdown 581 | ctx, cancel := context.WithCancel(context.Background()) 582 | defer cancel() // Ensure context is cancelled on exit 583 | sigChan := make(chan os.Signal, 1) 584 | signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) 585 | 586 | // Goroutine to listen for signals and trigger shutdown 587 | go func() { 588 | sig := <-sigChan 589 | log.Printf("Received signal %v, initiating shutdown...", sig) 590 | cancel() // Cancel context first 591 | }() 592 | 593 | // 5. Create the Domain Extractor 594 | extractor, errManager := core.NewDomainExtractor(ctx, config) // Renamed err 595 | if errManager != nil { 596 | log.Fatalf("Failed to create domain extractor: %v", errManager) 597 | } 598 | 599 | // 6. Launch Stats Display Goroutine (if enabled) 600 | var statsWg sync.WaitGroup 601 | if showStats { 602 | statsWg.Add(1) 603 | go func() { 604 | defer statsWg.Done() 605 | displayDomainStats(ctx, extractor) // Swapped order 606 | }() 607 | } 608 | 609 | // 7. Start Domain Extraction Process (BLOCKING CALL) 610 | log.Printf("Starting extraction for %d selected logs...", len(selectedLogs)) 611 | if err := extractor.ExtractDomainsToCSV(selectedLogs); err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, core.ErrDownloadCancelled) { 612 | // Log error unless it was just context cancellation 613 | log.Printf("Error during domain extraction: %v", err) 614 | } 615 | 616 | // Extraction finished or was cancelled 617 | log.Println("Main extraction process finished or cancelled.") 618 | 619 | // 8. 
Ensure stats goroutine finishes (if started) 620 | if showStats { 621 | log.Println("Waiting for statistics display to finish...") 622 | // If context wasn't cancelled by signal, cancel it now to stop stats 623 | cancel() // Ensure context is cancelled 624 | statsWg.Wait() 625 | } 626 | 627 | // 9. Display Final Stats 628 | displayFinalDomainStats(extractor) 629 | log.Println("Domain extraction command complete.") 630 | } 631 | 632 | // displayDomainStats periodically shows domain extraction progress. 633 | // ctx should be the first parameter for consistency with Go conventions. 634 | func displayDomainStats(ctx context.Context, extractor *core.DomainExtractor) { 635 | ticker := time.NewTicker(time.Second * 2) // Update every 2 seconds 636 | defer ticker.Stop() 637 | startTime := extractor.GetStats().StartTime 638 | 639 | log.Println("Starting statistics display...") 640 | 641 | for { 642 | select { 643 | case <-ticker.C: 644 | stats := extractor.GetStats() 645 | elapsed := time.Since(startTime).Seconds() 646 | if elapsed < 0.1 { 647 | elapsed = 0.1 648 | } // Avoid division by zero initially 649 | 650 | processedEntries := stats.ProcessedEntries.Load() 651 | totalEntries := stats.TotalEntries.Load() 652 | failedEntries := stats.FailedEntries.Load() 653 | entriesPerSec := float64(processedEntries) / elapsed 654 | percentDone := 0.0 655 | if totalEntries > 0 { 656 | // Calculate percentage based on processed + failed vs. total 657 | percentDone = float64(processedEntries+failedEntries) / float64(totalEntries) * 100 658 | } 659 | 660 | // Use carriage return to update the line in place 661 | fmt.Printf("\rProcessed: %d/%d logs | Entries: %d / ~%d (%.1f%%) | Failed: %d | Rate: %.0f ent/s | Domains: %d | Retries: %.2f%%", 662 | stats.ProcessedLogs.Load(), 663 | stats.TotalLogs.Load(), 664 | processedEntries, 665 | totalEntries, 666 | percentDone, 667 | failedEntries, 668 | entriesPerSec, 669 | stats.TotalDomainsFound.Load(), 670 | stats.GetRetryRate()*100, // Assuming DomainExtractorStats also gets GetRetryRate 671 | ) 672 | case <-ctx.Done(): // Use the passed context 673 | fmt.Println("\nStats display stopping due to context cancellation.") 674 | return 675 | } 676 | } 677 | } 678 | 679 | // displayFinalDomainStats shows the summary statistics at the end. 
680 | func displayFinalDomainStats(extractor *core.DomainExtractor) { 681 | stats := extractor.GetStats() 682 | elapsed := time.Since(stats.StartTime) 683 | processedEntries := stats.ProcessedEntries.Load() 684 | rate := 0.0 685 | if elapsed.Seconds() > 0 { 686 | rate = float64(processedEntries) / elapsed.Seconds() 687 | } 688 | 689 | // Ensure the final stats appear on a new line after the progress indicator 690 | fmt.Println() 691 | fmt.Printf("\n--- Final Domain Extraction Statistics ---\n") 692 | fmt.Printf(" Processing Time: %v\n", elapsed.Round(time.Millisecond)) 693 | fmt.Printf(" Total Logs: %d\n", stats.TotalLogs.Load()) 694 | fmt.Printf(" Processed Logs: %d\n", stats.ProcessedLogs.Load()) 695 | fmt.Printf(" Failed Logs: %d\n", stats.FailedLogs.Load()) 696 | fmt.Printf(" Total Entries: ~%d\n", stats.TotalEntries.Load()) 697 | fmt.Printf("Processed Entries: %d (%.2f%% first try)\n", 698 | processedEntries, 699 | float64(stats.SuccessFirstTry.Load())/float64(processedEntries+1)*100) // Assuming DomainExtractorStats has SuccessFirstTry 700 | fmt.Printf(" Failed Entries: %d\n", stats.FailedEntries.Load()) 701 | fmt.Printf(" Total Domains: %d\n", stats.TotalDomainsFound.Load()) 702 | fmt.Printf(" Overall Rate: %.0f entries/sec\n", rate) 703 | fmt.Printf(" Retry Rate: %.2f%% (Total Retries: %d)\n", 704 | stats.GetRetryRate()*100, stats.RetryCount.Load()) // Assuming DomainExtractorStats has GetRetryRate and RetryCount 705 | fmt.Printf(" Output Written: %.2f MB\n", float64(stats.OutputBytesWritten.Load())/(1024*1024)) 706 | fmt.Printf("----------------------------------------\n") 707 | } 708 | 709 | // fetchAndSaveLogs fetches the CT logs list and saves it to a local file. 710 | func fetchAndSaveLogs() { 711 | log.Printf("Fetching CT logs list to %s...", logsFile) 712 | 713 | // Temporarily disable UseLocalLogs to force fetching from remote 714 | oldUseLocalLogs := certlib.UseLocalLogs 715 | certlib.UseLocalLogs = false 716 | defer func() { certlib.UseLocalLogs = oldUseLocalLogs }() // Ensure it's restored 717 | 718 | // Use the client package to fetch the logs list directly 719 | httpClient := client.GetHTTPClient() 720 | resp, err := httpClient.Get(certlib.CTLListsURL) 721 | if err != nil { 722 | log.Fatalf("Error fetching CT logs list: %v", err) 723 | } 724 | defer resp.Body.Close() 725 | 726 | if resp.StatusCode != http.StatusOK { 727 | log.Fatalf("HTTP error %d fetching log list (%s)", resp.StatusCode, certlib.CTLListsURL) 728 | } 729 | 730 | body, err := io.ReadAll(resp.Body) 731 | if err != nil { 732 | log.Fatalf("Error reading CT logs list body: %v", err) 733 | } 734 | 735 | // Save the response to the specified file 736 | if err := os.WriteFile(logsFile, body, 0644); err != nil { 737 | log.Fatalf("Error saving logs to file '%s': %v", logsFile, err) 738 | } 739 | 740 | log.Printf("Successfully saved CT logs list to %s", logsFile) 741 | 742 | // Now try to parse and count the logs from the newly saved file. 743 | // This also serves as a basic validation of the saved file content. 744 | tempOriginalLocalLogsFile := certlib.LocalLogsFile // Save original for restoration 745 | certlib.LocalLogsFile = logsFile // Temporarily point certlib to the new file 746 | certlib.UseLocalLogs = true // Force use of this local file 747 | 748 | logs, err := core.ListCTLogs() // This will now use the new file. 
749 | if err != nil { 750 | log.Printf("Warning: Saved logs file to '%s' but encountered an error parsing it: %v", logsFile, err) 751 | } else { 752 | log.Printf("Successfully parsed %d CT logs from the saved file '%s'.", len(logs), logsFile) 753 | } 754 | 755 | // Restore the original certlib settings. 756 | certlib.LocalLogsFile = tempOriginalLocalLogsFile 757 | certlib.UseLocalLogs = oldUseLocalLogs // Restore original UseLocalLogs setting 758 | } 759 | 760 | // Helper to find min of two integers (for batching end index calculation) 761 | func min(a, b int) int { 762 | if a < b { 763 | return a 764 | } 765 | return b 766 | } 767 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/x-stp/rxtls 2 | 3 | go 1.24.2 4 | 5 | require ( 6 | github.com/prometheus/client_golang v1.22.0 7 | github.com/spf13/cobra v1.9.1 8 | github.com/zeebo/xxh3 v1.0.2 9 | golang.org/x/sys v0.33.0 10 | golang.org/x/time v0.11.0 11 | ) 12 | 13 | require ( 14 | github.com/beorn7/perks v1.0.1 // indirect 15 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 16 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 17 | github.com/klauspost/cpuid/v2 v2.0.9 // indirect 18 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 19 | github.com/prometheus/client_model v0.6.1 // indirect 20 | github.com/prometheus/common v0.62.0 // indirect 21 | github.com/prometheus/procfs v0.15.1 // indirect 22 | github.com/spf13/pflag v1.0.6 // indirect 23 | google.golang.org/protobuf v1.36.5 // indirect 24 | ) 25 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 2 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 3 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 4 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 5 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= 6 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 7 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 9 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 10 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 11 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 12 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 13 | github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 14 | github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= 15 | github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= 16 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 17 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 18 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 19 | github.com/munnerz/goautoneg 
v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 20 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 21 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 22 | github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= 23 | github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= 24 | github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= 25 | github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= 26 | github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= 27 | github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= 28 | github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= 29 | github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= 30 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 31 | github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= 32 | github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= 33 | github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= 34 | github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 35 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 36 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 37 | github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= 38 | github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= 39 | github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= 40 | github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= 41 | golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= 42 | golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 43 | golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= 44 | golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= 45 | google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= 46 | google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= 47 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 48 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 49 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 50 | -------------------------------------------------------------------------------- /internal/certlib/api.go: -------------------------------------------------------------------------------- 1 | package certlib 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "bytes" 23 | "context" 24 | "crypto/x509" 25 | "encoding/base64" 26 | "encoding/binary" 27 | "encoding/json" 28 | "errors" 29 | "fmt" 30 | "io" 31 | "log" 32 | "net/http" 33 | "os" 34 | "strings" 35 | "time" 36 | 37 | "github.com/x-stp/rxtls/internal/client" // Import shared client package 38 | ) 39 | 40 | // CTLResponse represents the structure of the JSON log list. 41 | // Used for unmarshalling JSON (allocates). 42 | type CTLResponse struct { 43 | Operators []struct { 44 | ID int `json:"id"` 45 | Name string `json:"name"` 46 | } `json:"operators"` 47 | Logs []struct { 48 | Description string `json:"description"` 49 | Key string `json:"key"` 50 | URL string `json:"url"` 51 | MMD int `json:"mmd"` 52 | State struct { 53 | Timestamp string `json:"timestamp"` 54 | } `json:"state"` 55 | OperatedBy []int `json:"operated_by"` 56 | DNSAPIEndpoint string `json:"dns_api_endpoint,omitempty"` 57 | } `json:"logs"` 58 | } 59 | 60 | // TreeSizeResponse represents the JSON structure from the get-sth endpoint. 61 | // Used for unmarshalling JSON (allocates). 62 | type TreeSizeResponse struct { 63 | TreeSize int `json:"tree_size"` 64 | Timestamp int64 `json:"timestamp"` 65 | SHA256RootHash string `json:"sha256_root_hash"` 66 | TreeHeadSignature string `json:"tree_head_signature"` 67 | } 68 | 69 | // EntriesResponse represents the JSON structure from the get-entries endpoint. 70 | // Used for unmarshalling JSON (allocates). 71 | type EntriesResponse struct { 72 | Entries []struct { 73 | LeafInput string `json:"leaf_input"` // Base64 encoded MerkleTreeLeaf 74 | ExtraData string `json:"extra_data"` // Base64 encoded cert chain 75 | } `json:"entries"` 76 | } 77 | 78 | // GetCTLogs retrieves the list of known CT logs, either from a remote URL or a local file. 79 | // Operation: Network or Disk I/O bound. Allocates during HTTP fetch and JSON parsing. 80 | func GetCTLogs() ([]CTLogInfo, error) { 81 | if UseLocalLogs { 82 | log.Printf("Using local logs list from %s\n", LocalLogsFile) 83 | ctlogs, err := loadLocalCTLogs(LocalLogsFile) 84 | // If local file load fails, DO NOT fall back to network. 
85 | if err != nil { 86 | return nil, fmt.Errorf("failed to load local logs file '%s': %w", LocalLogsFile, err) 87 | } 88 | return ctlogs, nil 89 | } 90 | 91 | // Network fetch using shared client 92 | log.Println("Fetching CT log list from", CTLListsURL) 93 | httpClient := client.GetHTTPClient() 94 | 95 | resp, err := httpClient.Get(CTLListsURL) 96 | if err != nil { 97 | return nil, fmt.Errorf("error retrieving CT logs list: %w", err) 98 | } 99 | defer resp.Body.Close() 100 | if resp.StatusCode != http.StatusOK { 101 | return nil, fmt.Errorf("HTTP error %d fetching log list", resp.StatusCode) 102 | } 103 | body, err := io.ReadAll(resp.Body) 104 | if err != nil { 105 | return nil, fmt.Errorf("error reading CT log list body: %w", err) 106 | } 107 | 108 | // Try to parse as V3 format first (same as in loadLocalCTLogs) 109 | var v3Response struct { 110 | Operators []struct { 111 | Name string `json:"name"` 112 | Logs []struct { 113 | Description string `json:"description"` 114 | URL string `json:"url"` 115 | State map[string]interface{} `json:"state"` 116 | } `json:"logs"` 117 | } `json:"operators"` 118 | } 119 | 120 | if err := json.Unmarshal(body, &v3Response); err == nil { 121 | // Process V3 format 122 | var ctlogs []CTLogInfo 123 | for _, operator := range v3Response.Operators { 124 | for _, logEntry := range operator.Logs { 125 | if logEntry.URL == "" { 126 | continue 127 | } 128 | url := cleanLogURL(logEntry.URL) 129 | if isLogUsable(logEntry.State) { 130 | ctlogs = append(ctlogs, CTLogInfo{ 131 | URL: url, 132 | Description: logEntry.Description, 133 | OperatedBy: operator.Name, 134 | BlockSize: 64, // Default 135 | }) 136 | } 137 | } 138 | } 139 | log.Printf("Found %d usable CT logs from remote (V3 format)", len(ctlogs)) 140 | return ctlogs, nil 141 | } 142 | 143 | // Fallback to V2/older format 144 | log.Printf("Failed to parse remote logs as V3, trying older format") 145 | var ctlResponse CTLResponse 146 | if errFallback := json.Unmarshal(body, &ctlResponse); errFallback != nil { 147 | // Save the response to a file for debugging 148 | debugFile := "debug_ct_logs_response.json" 149 | if err := os.WriteFile(debugFile, body, 0644); err == nil { 150 | log.Printf("Saved problematic response to %s for debugging", debugFile) 151 | } 152 | return nil, fmt.Errorf("error parsing CT logs list JSON with known formats: %w", errFallback) 153 | } 154 | 155 | // Process response using old format 156 | logs, err := processOldFormat(&ctlResponse) 157 | if err != nil { 158 | return nil, fmt.Errorf("error processing old format logs: %w", err) 159 | } 160 | 161 | // If we got logs successfully, save them to the local file for future use 162 | if len(logs) > 0 { 163 | if err := os.WriteFile(LocalLogsFile, body, 0644); err != nil { 164 | log.Printf("Warning: Failed to save logs to local file: %v", err) 165 | } else { 166 | log.Printf("Saved logs to %s for future use", LocalLogsFile) 167 | } 168 | } 169 | 170 | return logs, nil 171 | } 172 | 173 | // loadLocalCTLogs reads and parses the log list from a local JSON file. 174 | // Operation: Disk I/O bound, allocates for file read and JSON parsing. 
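// The file is expected to follow the same v3 log_list.json shape that GetCTLogs
// parses above. A minimal sketch of that shape (only the fields read here; real
// lists carry many more fields, and the state key shown is illustrative):
//
//	{
//	  "operators": [
//	    {
//	      "name": "Example Operator",
//	      "logs": [
//	        {"description": "Example Log 2025", "url": "https://ct.example.invalid/2025/", "state": {"usable": {}}}
//	      ]
//	    }
//	  ]
//	}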
175 | func loadLocalCTLogs(filename string) ([]CTLogInfo, error) { 176 | data, err := os.ReadFile(filename) 177 | if err != nil { 178 | return nil, fmt.Errorf("error reading local logs file: %w", err) 179 | } 180 | // Attempt V3 format parse first 181 | var v3Response struct { 182 | Operators []struct { 183 | Name string `json:"name"` 184 | Logs []struct { 185 | Description string `json:"description"` 186 | URL string `json:"url"` 187 | State map[string]interface{} `json:"state"` 188 | } `json:"logs"` 189 | } `json:"operators"` 190 | } 191 | if err := json.Unmarshal(data, &v3Response); err == nil { 192 | // Process V3 format 193 | var ctlogs []CTLogInfo 194 | for _, operator := range v3Response.Operators { 195 | for _, logEntry := range operator.Logs { 196 | if logEntry.URL == "" { 197 | continue 198 | } 199 | url := cleanLogURL(logEntry.URL) 200 | if isLogUsable(logEntry.State) { 201 | ctlogs = append(ctlogs, CTLogInfo{ 202 | URL: url, 203 | Description: logEntry.Description, 204 | OperatedBy: operator.Name, 205 | BlockSize: 64, // Default 206 | }) 207 | } 208 | } 209 | } 210 | log.Printf("Found %d usable CT logs in local file (V3 format)", len(ctlogs)) 211 | return ctlogs, nil 212 | } 213 | // Fallback to V2/older format 214 | log.Printf("Failed to parse local logs as V3, trying older format: %v", err) 215 | var ctlResponse CTLResponse 216 | if errFallback := json.Unmarshal(data, &ctlResponse); errFallback != nil { 217 | return nil, fmt.Errorf("error parsing local logs file with known formats: %w (primary V3 err) / %w (fallback V2 err)", err, errFallback) 218 | } 219 | return processOldFormat(&ctlResponse) 220 | } 221 | 222 | // cleanLogURL helper 223 | func cleanLogURL(rawURL string) string { 224 | url := rawURL 225 | if strings.HasPrefix(url, "https://") { 226 | url = url[8:] 227 | } else if strings.HasPrefix(url, "http://") { 228 | url = url[7:] 229 | } 230 | 231 | return strings.TrimSuffix(url, "/") 232 | } 233 | 234 | // isLogUsable helper 235 | func isLogUsable(state map[string]interface{}) bool { 236 | if _, ok := state["rejected"]; ok { 237 | return false 238 | } 239 | if _, ok := state["retired"]; ok { 240 | return false 241 | } 242 | logType, _ := state["log_type"].(string) 243 | return logType != "test" 244 | } 245 | 246 | // processOldFormat handles the fallback parsing scenario. 247 | // Operation: Similar allocation patterns to the main processing loop (slice append, string ops). 248 | func processOldFormat(ctlResponse *CTLResponse) ([]CTLogInfo, error) { 249 | operatorNames := make(map[int]string) 250 | for _, operator := range ctlResponse.Operators { 251 | operatorNames[operator.ID] = operator.Name 252 | } 253 | var ctlogs []CTLogInfo 254 | for _, logEntry := range ctlResponse.Logs { 255 | if logEntry.URL == "" { 256 | continue 257 | } 258 | url := cleanLogURL(logEntry.URL) 259 | operatedBy := "" 260 | if len(logEntry.OperatedBy) > 0 { 261 | operatedBy = operatorNames[logEntry.OperatedBy[0]] 262 | } 263 | ctlog := CTLogInfo{ 264 | URL: url, 265 | Description: logEntry.Description, 266 | OperatedBy: operatedBy, 267 | BlockSize: 64, 268 | } 269 | if ctlog.IsResolvable() { // Simple parse check 270 | ctlogs = append(ctlogs, ctlog) 271 | } 272 | } 273 | log.Printf("Found %d usable CT logs in local file (Fallback format)", len(ctlogs)) 274 | 275 | if len(ctlogs) == 0 { 276 | return nil, fmt.Errorf("no usable CT logs found in fallback format") 277 | } 278 | 279 | return ctlogs, nil 280 | } 281 | 282 | // GetLogInfo retrieves the tree size from a CT log. 
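// A minimal usage sketch (the host below is a placeholder, not a real log):
//
//	ctlog := &CTLogInfo{URL: "ct.example.invalid/2025"}
//	if err := GetLogInfo(ctlog); err == nil {
//		fmt.Printf("tree size: %d\n", ctlog.TreeSize)
//	}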
283 | // Operation: Network bound. Allocates during HTTP fetch and JSON parsing. 284 | func GetLogInfo(ctlog *CTLogInfo) error { 285 | // Use shared HTTP client 286 | httpClient := client.GetHTTPClient() 287 | 288 | // Construct URL 289 | url := fmt.Sprintf("https://%s/ct/v1/get-sth", ctlog.URL) 290 | 291 | // Make the request with retry logic 292 | var resp *http.Response 293 | var err error 294 | maxRetries := 3 295 | retryDelay := 100 * time.Millisecond 296 | 297 | for attempt := range maxRetries { 298 | resp, err = httpClient.Get(url) 299 | if err == nil && resp.StatusCode == http.StatusOK { 300 | break 301 | } 302 | 303 | if resp != nil { 304 | resp.Body.Close() 305 | } 306 | 307 | if attempt < maxRetries-1 { 308 | log.Printf("Retrying GetLogInfo for %s after error: %v (attempt %d/%d)", 309 | ctlog.URL, err, attempt+1, maxRetries) 310 | time.Sleep(retryDelay) 311 | retryDelay *= 2 // Exponential backoff 312 | } 313 | } 314 | 315 | if err != nil { 316 | return fmt.Errorf("error retrieving log info after %d attempts: %w", maxRetries, err) 317 | } 318 | if resp.StatusCode != http.StatusOK { 319 | resp.Body.Close() 320 | return fmt.Errorf("HTTP error %d fetching log info for %s", resp.StatusCode, ctlog.URL) 321 | } 322 | defer resp.Body.Close() 323 | 324 | body, err := io.ReadAll(resp.Body) 325 | if err != nil { 326 | return fmt.Errorf("error reading log info body: %w", err) 327 | } 328 | 329 | var treeSize TreeSizeResponse 330 | if err := json.Unmarshal(body, &treeSize); err != nil { 331 | return fmt.Errorf("error parsing log info JSON: %w", err) 332 | } 333 | 334 | ctlog.TreeSize = treeSize.TreeSize 335 | return nil 336 | } 337 | 338 | // DownloadEntries retrieves a range of entries from a CT log. 339 | // Operation: Network bound. Allocates during HTTP fetch and JSON parsing. 
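// A minimal usage sketch (placeholder host; start and end are inclusive,
// zero-based indices, as in the RFC 6962 get-entries endpoint):
//
//	ctx := context.Background()
//	ctlog := &CTLogInfo{URL: "ct.example.invalid/2025"}
//	entries, err := DownloadEntries(ctx, ctlog, 0, 63)
//	if err == nil {
//		fmt.Printf("fetched %d entries\n", len(entries.Entries))
//	}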
340 | func DownloadEntries(ctx context.Context, ctlog *CTLogInfo, start, end int) (*EntriesResponse, error) { 341 | // Use shared HTTP client 342 | httpClient := client.GetHTTPClient() 343 | 344 | // Construct URL 345 | url := fmt.Sprintf("https://%s/ct/v1/get-entries?start=%d&end=%d", ctlog.URL, start, end) 346 | 347 | // Create request with context 348 | req, err := http.NewRequestWithContext(ctx, "GET", url, nil) 349 | if err != nil { 350 | return nil, fmt.Errorf("error creating request: %w", err) 351 | } 352 | req.Header.Set("User-Agent", "rxtls (+https://github.com/x-stp/rxtls)") 353 | 354 | // Make the request with retry logic 355 | var resp *http.Response 356 | maxRetries := 3 357 | retryDelay := 500 * time.Millisecond 358 | 359 | for attempt := range maxRetries { 360 | resp, err = httpClient.Do(req) 361 | if err == nil && resp.StatusCode == http.StatusOK { 362 | break 363 | } 364 | 365 | if resp != nil { 366 | resp.Body.Close() 367 | } 368 | 369 | // Check if context is cancelled before retrying 370 | if ctx.Err() != nil { 371 | return nil, ctx.Err() 372 | } 373 | 374 | if attempt < maxRetries-1 { 375 | log.Printf("Retrying DownloadEntries for %s (%d-%d) after error: %v (attempt %d/%d)", 376 | ctlog.URL, start, end, err, attempt+1, maxRetries) 377 | 378 | // Use context-aware sleep 379 | select { 380 | case <-time.After(retryDelay): 381 | retryDelay *= 2 // Exponential backoff 382 | case <-ctx.Done(): 383 | return nil, ctx.Err() 384 | } 385 | } 386 | } 387 | 388 | if err != nil { 389 | return nil, fmt.Errorf("error downloading entries after %d attempts: %w", maxRetries, err) 390 | } 391 | if resp.StatusCode != http.StatusOK { 392 | resp.Body.Close() 393 | return nil, fmt.Errorf("HTTP error %d fetching entries for %s (%d-%d)", resp.StatusCode, ctlog.URL, start, end) 394 | } 395 | defer resp.Body.Close() 396 | 397 | body, err := io.ReadAll(resp.Body) 398 | if err != nil { 399 | return nil, fmt.Errorf("error reading entries body: %w", err) 400 | } 401 | 402 | var entries EntriesResponse 403 | if err := json.Unmarshal(body, &entries); err != nil { 404 | return nil, fmt.Errorf("error parsing entries JSON: %w", err) 405 | } 406 | 407 | return &entries, nil 408 | } 409 | 410 | // ParseCertificateEntry decodes the MerkleTreeLeaf framing and parses the inner certificate data. 411 | // Handles Version 0, LeafType 0 (TimestampedEntry) containing X.509 or Precert. 
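// Layout consumed below, after base64-decoding leaf_input:
//
//	[0]     version     (must be 0)
//	[1]     leaf_type   (must be 0, timestamped_entry)
//	[2:10]  timestamp   (uint64, big-endian)
//	[10:12] entry_type  (0 = x509_entry, 1 = precert_entry)
//	        x509_entry:    3-byte length + DER certificate
//	        precert_entry: 32-byte issuer key hash + 3-byte length + TBS certificate
//	        then a 2-byte extensions length + extensions (consumed, not parsed)
//
// A sketch of combining it with DownloadEntries (error handling elided):
//
//	for i, e := range entries.Entries {
//		if cd, err := ParseCertificateEntry(e.LeafInput, e.ExtraData, ctlog.URL); err == nil {
//			fmt.Print(cd.ToCSVLine(i))
//		}
//	}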
412 | func ParseCertificateEntry(leafInput, extraData, logURL string) (*CertificateData, error) { 413 | leafBytes, err := base64.StdEncoding.DecodeString(leafInput) 414 | if err != nil { 415 | return nil, fmt.Errorf("failed to decode leaf input base64: %w", err) 416 | } 417 | 418 | // --- Check CT Framing Prefix --- 419 | if len(leafBytes) < 2 { 420 | return nil, fmt.Errorf("leaf input too short for CT framing (len %d)", len(leafBytes)) 421 | } 422 | version := uint8(leafBytes[0]) 423 | leafType := uint8(leafBytes[1]) 424 | if version != 0 { 425 | return nil, fmt.Errorf("unsupported MerkleTreeLeaf version: %d", version) 426 | } 427 | if leafType != 0 { 428 | return nil, fmt.Errorf("unsupported MerkleLeafType: %d", leafType) 429 | } 430 | // -------------------------------- 431 | 432 | // --- Manually Parse TimestampedEntry --- 433 | r := bytes.NewReader(leafBytes[2:]) // Reader for the payload after framing 434 | 435 | var timestamp uint64 436 | if err := binary.Read(r, binary.BigEndian, ×tamp); err != nil { 437 | return nil, fmt.Errorf("failed to read timestamp: %w", err) 438 | } 439 | 440 | var entryTypeUint16 uint16 441 | if err := binary.Read(r, binary.BigEndian, &entryTypeUint16); err != nil { 442 | return nil, fmt.Errorf("failed to read entry type: %w", err) 443 | } 444 | entryTypeString := "Unknown" 445 | 446 | var certDER []byte 447 | 448 | switch entryTypeUint16 { 449 | case 0: // x509_entry 450 | entryTypeString = "X509LogEntry" 451 | // Read the 3-byte length field for the certificate 452 | var certLenBytes [3]byte 453 | if _, err := io.ReadFull(r, certLenBytes[:]); err != nil { 454 | return nil, fmt.Errorf("failed to read x509 entry length: %w", err) 455 | } 456 | certLen := uint32(certLenBytes[0])<<16 | uint32(certLenBytes[1])<<8 | uint32(certLenBytes[2]) 457 | 458 | // Check for unreasonable length 459 | if certLen > uint32(r.Len()) { 460 | return nil, fmt.Errorf("x509 entry length (%d) exceeds remaining data (%d)", certLen, r.Len()) 461 | } 462 | 463 | // Read the certificate bytes 464 | certDER = make([]byte, certLen) 465 | if _, err := io.ReadFull(r, certDER); err != nil { 466 | return nil, fmt.Errorf("failed to read x509 entry data: %w", err) 467 | } 468 | 469 | case 1: // precert_entry 470 | entryTypeString = "PrecertLogEntry" 471 | // Read Issuer Key Hash (32 bytes) - we don't use it currently, but need to consume it. 472 | var issuerKeyHash [32]byte 473 | if _, err := io.ReadFull(r, issuerKeyHash[:]); err != nil { 474 | return nil, fmt.Errorf("failed to read precert issuer key hash: %w", err) 475 | } 476 | 477 | // Read the 3-byte length field for the TBS certificate 478 | var tbsCertLenBytes [3]byte 479 | if _, err := io.ReadFull(r, tbsCertLenBytes[:]); err != nil { 480 | return nil, fmt.Errorf("failed to read precert TBS length: %w", err) 481 | } 482 | tbsCertLen := uint32(tbsCertLenBytes[0])<<16 | uint32(tbsCertLenBytes[1])<<8 | uint32(tbsCertLenBytes[2]) 483 | 484 | // Check length 485 | if tbsCertLen > uint32(r.Len()) { 486 | return nil, fmt.Errorf("precert TBS length (%d) exceeds remaining data (%d)", tbsCertLen, r.Len()) 487 | } 488 | 489 | // Read the TBS certificate bytes 490 | certDER = make([]byte, tbsCertLen) 491 | if _, err := io.ReadFull(r, certDER); err != nil { 492 | return nil, fmt.Errorf("failed to read precert TBS data: %w", err) 493 | } 494 | 495 | default: 496 | return nil, fmt.Errorf("unknown TimestampedEntry.EntryType: %d", entryTypeUint16) 497 | } 498 | 499 | // Extensions follow the signed_entry; read their length (2 bytes) and consume them. 
500 | // We don't parse extensions in this version, but must read past them. 501 | var extensionsLen uint16 502 | if err := binary.Read(r, binary.BigEndian, &extensionsLen); err != nil { 503 | // Allow EOF here if extensions are truly absent, although spec implies length should be present. 504 | if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) { 505 | log.Printf("Warning: Failed to read extensions length for %s (%d-%d): %v. Remaining bytes: %d", logURL, 0, 0, err, r.Len()) // Need index context here if possible 506 | } 507 | } else if extensionsLen > 0 { 508 | if extensionsLen > uint16(r.Len()) { 509 | return nil, fmt.Errorf("extensions length (%d) exceeds remaining data (%d)", extensionsLen, r.Len()) 510 | } 511 | // Consume extension bytes 512 | extensionBytes := make([]byte, extensionsLen) 513 | if _, err := io.ReadFull(r, extensionBytes); err != nil { 514 | return nil, fmt.Errorf("failed to read extensions data: %w", err) 515 | } 516 | } 517 | // -------------------------------------- 518 | 519 | if len(certDER) == 0 { 520 | return nil, fmt.Errorf("no certificate DER data extracted for entry type %d", entryTypeUint16) 521 | } 522 | 523 | // --- Parse the final DER bytes --- 524 | cert, err := x509.ParseCertificate(certDER) 525 | if err != nil { 526 | if entryTypeString == "PrecertLogEntry" { 527 | // Known failure mode for TBS certs 528 | return nil, fmt.Errorf("skipped parsing Precert TBS: %w", err) 529 | } 530 | return nil, fmt.Errorf("failed to parse certificate DER (type %s): %w", entryTypeString, err) 531 | } 532 | 533 | // Convert to our internal struct 534 | cd := CertificateFromX509(cert, logURL) 535 | cd.Type = entryTypeString // Set the correct type 536 | return cd, nil 537 | } -------------------------------------------------------------------------------- /internal/certlib/domain_normalization_test.go: -------------------------------------------------------------------------------- 1 | package certlib 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "strings" 23 | "testing" 24 | ) 25 | 26 | // TestNormalizeDomain provides table-driven tests for various domain formats and edge cases. 27 | // Goal: Ensure NormalizeDomain behaves correctly for diverse inputs. 28 | // Uses t.Parallel() to allow tests within this function to run concurrently. 29 | func TestNormalizeDomain(t *testing.T) { 30 | t.Parallel() // Mark this test function as safe to run in parallel with others. 
31 | testCases := []struct { 32 | name string 33 | input string 34 | expected string 35 | }{ 36 | {"Simple domain", "example.com", "example.com"}, 37 | {"Subdomain", "www.example.com", "www.example.com"}, 38 | {"Uppercase", "EXAMPLE.COM", "example.com"}, 39 | {"Mixed case", "Www.Example.Com", "www.example.com"}, 40 | {"Trailing dot", "example.com.", "example.com"}, 41 | {"Multiple trailing dots", "example.com...", "example.com"}, 42 | {"Leading dot", ".example.com", "example.com"}, 43 | {"Leading/Trailing dots", ".example.com.", "example.com"}, 44 | {"Leading/Trailing spaces", " example.com ", "example.com"}, 45 | {"Wildcard", "*.example.com", "*.example.com"}, 46 | {"Wildcard uppercase", "*.EXAMPLE.COM", "*.example.com"}, 47 | {"Wildcard trailing dot", "*.example.com.", "*.example.com"}, 48 | {"Multiple wildcards", "*.*.example.com", "*.*.example.com"}, // Assuming this is valid/desired 49 | {"Punycode", "xn--bcher-kva.example.com", "xn--bcher-kva.example.com"}, // bücher.example.com 50 | {"Punycode uppercase", "XN--BCHER-KVA.EXAMPLE.COM", "xn--bcher-kva.example.com"}, 51 | {"Empty string", "", ""}, 52 | {"Just spaces", " ", ""}, 53 | {"Just dots", "...", ""}, 54 | {"IP Address v4", "192.168.1.1", "192.168.1.1"}, // Should probably remain unchanged or be identified 55 | {"IP Address v6", "::1", "::1"}, // Should probably remain unchanged or be identified 56 | {"Domain with port", "example.com:443", "example.com:443"}, // Should likely remain unchanged 57 | {"Internal spaces", "example test.com", "example test.com"}, // Junk, expect no change or specific handling 58 | {"Leading dash", "-example.com", "-example.com"}, // Technically invalid label, expect no change 59 | {"Trailing dash", "example-.com", "example-.com"}, // Technically invalid label, expect no change 60 | {"Very long domain", strings.Repeat("a.", 100) + "com", strings.Repeat("a.", 100) + "com"}, // Keep as is 61 | } 62 | 63 | for _, tc := range testCases { 64 | // Capture range variable for parallel execution. 65 | tc := tc 66 | // Run each test case as a parallel subtest. 67 | t.Run(tc.name, func(t *testing.T) { 68 | t.Parallel() 69 | actual := NormalizeDomain(tc.input) 70 | if actual != tc.expected { 71 | t.Errorf("NormalizeDomain(%q) = %q; want %q", tc.input, actual, tc.expected) 72 | } 73 | }) 74 | } 75 | } 76 | 77 | // BenchmarkNormalizeDomainSimple measures performance for a common, simple domain. 78 | // Goal: Establish baseline performance. 79 | // Operation: Runs NormalizeDomain repeatedly in a loop. 80 | func BenchmarkNormalizeDomainSimple(b *testing.B) { 81 | domain := "www.example.com" 82 | // b.N is adjusted by the testing framework to achieve stable measurements. 83 | for i := 0; i < b.N; i++ { 84 | _ = NormalizeDomain(domain) // Assign to blank identifier to prevent optimization removal. 85 | } 86 | } 87 | 88 | // BenchmarkNormalizeDomainMixedCaseTrailingDot measures performance for domains needing case and dot normalization. 89 | func BenchmarkNormalizeDomainMixedCaseTrailingDot(b *testing.B) { 90 | domain := "Www.Example.COM." 91 | for i := 0; i < b.N; i++ { 92 | _ = NormalizeDomain(domain) 93 | } 94 | } 95 | 96 | // BenchmarkNormalizeDomainWildcard measures performance for wildcard domains needing normalization. 97 | func BenchmarkNormalizeDomainWildcard(b *testing.B) { 98 | domain := "*.SubDomain.Example.COM." 
99 | for i := 0; i < b.N; i++ { 100 | _ = NormalizeDomain(domain) 101 | } 102 | } 103 | 104 | // BenchmarkSortedNormalizedDomains (Placeholder) 105 | // Goal: Measure performance of getting unique, sorted, normalized domains from a CertificateData struct. 106 | // Constraints: Would depend heavily on the number of domains in AllDomains and the sorting algorithm. 107 | // TODO: Implement this benchmark once the corresponding function (e.g., CertificateData.SortedNormalizedDomains) is optimized (uses sort.Strings). 108 | /* 109 | func BenchmarkSortedNormalizedDomains(b *testing.B) { 110 | // Setup: Create a CertificateData with a large, diverse list of domains. 111 | size := 1000 // Example size 112 | allDomains := make([]string, size) 113 | for i := 0; i < size; i++ { 114 | // Generate realistic domain variations (mixed case, dots, wildcards, duplicates) 115 | allDomains[i] = fmt.Sprintf("sub%d.EXAMPLE%d.com.", i%10, i%50) 116 | } 117 | certData := &certlib.CertificateData{ 118 | AllDomains: allDomains, 119 | Subject: certlib.SubjectData{O: "Test Org"}, // Needed for DomainOrgHash if testing that 120 | } 121 | 122 | b.ResetTimer() // Start timing after setup 123 | for i := 0; i < b.N; i++ { 124 | _ = certData.SortedNormalizedDomains() // Call the function under test 125 | } 126 | } 127 | */ 128 | -------------------------------------------------------------------------------- /internal/certlib/models.go: -------------------------------------------------------------------------------- 1 | package certlib 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "crypto/tls" 23 | "crypto/x509" 24 | "encoding/base64" 25 | "fmt" 26 | "net/url" 27 | "sort" 28 | "strings" 29 | 30 | "github.com/zeebo/xxh3" 31 | ) 32 | 33 | // Constants related to CT log interaction. 34 | const ( 35 | CTLListsURL = "https://www.gstatic.com/ct/log_list/v3/log_list.json" 36 | CTLInfoURLTemplate = "https://%s/ct/v1/get-sth" 37 | DownloadURLTemplate = "https://%s/ct/v1/get-entries?start=%d&end=%d" 38 | HTTPTimeout = 30 // seconds 39 | ) 40 | 41 | // Global settings influencing certlib behavior. 42 | var ( 43 | UseLocalLogs = false 44 | LocalLogsFile = "./all_logs_list.json" 45 | ) 46 | 47 | // CTLogInfo holds metadata about a single Certificate Transparency log. 48 | type CTLogInfo struct { 49 | URL string `json:"url"` 50 | Description string `json:"description"` 51 | OperatedBy string `json:"operated_by"` 52 | TreeSize int `json:"tree_size"` 53 | BlockSize int `json:"block_size"` 54 | } 55 | 56 | // IsCloudflare checks if the log URL suggests it's operated by Cloudflare. 57 | func (c *CTLogInfo) IsCloudflare() bool { 58 | return strings.Contains(c.URL, "cloudflare.com") 59 | } 60 | 61 | // IsDigiCert checks if the log URL suggests it's operated by DigiCert. 
62 | func (c *CTLogInfo) IsDigiCert() bool { 63 | return strings.Contains(c.URL, "digicert.com") || 64 | strings.Contains(c.URL, "wyvern") || 65 | strings.Contains(c.URL, "nessie") 66 | } 67 | 68 | // Host extracts the hostname part from the log URL. 69 | func (c *CTLogInfo) Host() string { 70 | parts := strings.Split(c.URL, "/") 71 | return parts[0] 72 | } 73 | 74 | // IsResolvable checks if the log's hostname can be parsed. 75 | func (c *CTLogInfo) IsResolvable() bool { 76 | _, err := url.Parse("https://" + c.URL) 77 | return err == nil 78 | } 79 | 80 | // GetTLSConfig provides a TLS configuration optimized for performance. 81 | func (c *CTLogInfo) GetTLSConfig() *tls.Config { 82 | return &tls.Config{ 83 | MinVersion: tls.VersionTLS12, 84 | MaxVersion: tls.VersionTLS13, 85 | CipherSuites: []uint16{ 86 | tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, 87 | tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, 88 | tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, 89 | tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384, 90 | tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, 91 | tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305, 92 | }, 93 | NextProtos: []string{"http/1.1"}, 94 | } 95 | } 96 | 97 | // SubjectData holds components of an X.509 Subject or Issuer Name. 98 | type SubjectData struct { 99 | Aggregated string `json:"aggregated"` 100 | C string `json:"C,omitempty"` 101 | ST string `json:"ST,omitempty"` 102 | L string `json:"L,omitempty"` 103 | O string `json:"O,omitempty"` 104 | OU string `json:"OU,omitempty"` 105 | CN string `json:"CN,omitempty"` 106 | } 107 | 108 | // Extensions simplified storage. 109 | type Extensions struct { 110 | SubjectAltName string `json:"subjectAltName,omitempty"` 111 | } 112 | 113 | // CertificateData represents the parsed data from a single certificate entry. 114 | type CertificateData struct { 115 | Subject SubjectData 116 | Issuer SubjectData 117 | Extensions map[string]string // Simplified 118 | NotBefore int64 119 | NotAfter int64 120 | AsDER string // Base64 DER 121 | AllDomains []string 122 | Type string 123 | Source map[string]string 124 | } 125 | 126 | // Chain calculates a NON-CRYPTOGRAPHIC hash (xxh3) of the base64 DER string. 127 | func (c *CertificateData) Chain() string { 128 | h := xxh3.HashString(c.AsDER) 129 | return fmt.Sprintf("%x", h) 130 | } 131 | 132 | // NormalizedDomainsSet returns a set (map[string]struct{}) of normalized domains. 133 | func (c *CertificateData) NormalizedDomainsSet() map[string]struct{} { 134 | result := make(map[string]struct{}, len(c.AllDomains)) 135 | for _, domain := range c.AllDomains { 136 | normalized := NormalizeDomain(domain) 137 | if normalized != "" { 138 | result[normalized] = struct{}{} 139 | } 140 | } 141 | return result 142 | } 143 | 144 | // SortedNormalizedDomains returns sorted, unique, normalized domains. 145 | func (c *CertificateData) SortedNormalizedDomains() []string { 146 | domainSet := c.NormalizedDomainsSet() 147 | domains := make([]string, 0, len(domainSet)) 148 | for domain := range domainSet { 149 | domains = append(domains, domain) 150 | } 151 | sort.Strings(domains) 152 | return domains 153 | } 154 | 155 | // calculateDomainOrgHash uses xxh3 hash. 
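// The hash preimage is "<domain1>,<domain2>,...|<org>", built from the already
// sorted, unique, normalized domains. For example:
//
//	calculateDomainOrgHash([]string{"example.com", "www.example.com"}, "Example Org")
//	// hashes the string "example.com,www.example.com|Example Org" with xxh3 and hex-encodes it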
156 | func calculateDomainOrgHash(sortedUniqueNormalizedDomains []string, org string) string { 157 | estimatedLen := len(org) + 1 158 | for _, d := range sortedUniqueNormalizedDomains { 159 | estimatedLen += len(d) + 1 160 | } 161 | var sb strings.Builder 162 | sb.Grow(estimatedLen) 163 | for i, domain := range sortedUniqueNormalizedDomains { 164 | if i > 0 { 165 | sb.WriteByte(',') 166 | } 167 | sb.WriteString(domain) 168 | } 169 | sb.WriteByte('|') 170 | sb.WriteString(org) 171 | h := xxh3.HashString(sb.String()) 172 | return fmt.Sprintf("%x", h) 173 | } 174 | 175 | // DomainOrgHash calculates the xxh3 hash based on sorted, unique, normalized domains and Org. 176 | func (c *CertificateData) DomainOrgHash() string { 177 | return calculateDomainOrgHash(c.SortedNormalizedDomains(), c.Subject.O) 178 | } 179 | 180 | // ToCSVLine creates a simple CSV for raw certificate download output. 181 | func (c *CertificateData) ToCSVLine(certIndex int) string { 182 | return fmt.Sprintf("%s,%d,%s,%s,%s,%d,%d\n", 183 | c.Source["url"], 184 | certIndex, 185 | c.Chain(), 186 | c.AsDER, 187 | strings.Join(c.AllDomains, " "), 188 | c.NotBefore, 189 | c.NotAfter, 190 | ) 191 | } 192 | 193 | // ToDomainsCSVLine creates the specific CSV format for the 'domains' command. 194 | func (c *CertificateData) ToDomainsCSVLine(certIndex int) string { 195 | normalizedCN := NormalizeDomain(c.Subject.CN) 196 | normalizedDomains := c.SortedNormalizedDomains() 197 | outputDomains := make([]string, len(normalizedDomains)) 198 | for i, d := range normalizedDomains { 199 | if strings.HasPrefix(d, "*.") { 200 | outputDomains[i] = d[2:] 201 | } else { 202 | outputDomains[i] = d 203 | } 204 | } 205 | outputDomainsStr := strings.Join(outputDomains, ",") 206 | primaryDomain := "" 207 | if len(normalizedDomains) > 0 { 208 | primaryDomain = normalizedDomains[0] 209 | } 210 | hash := calculateDomainOrgHash(normalizedDomains, c.Subject.O) 211 | return fmt.Sprintf("%d,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",%s\n", 212 | certIndex, 213 | normalizedCN, 214 | primaryDomain, 215 | outputDomainsStr, 216 | c.Subject.C, 217 | c.Subject.ST, 218 | c.Subject.L, 219 | c.Subject.O, 220 | c.Issuer.CN, 221 | hash, 222 | ) 223 | } 224 | 225 | // CertificateFromX509 creates a CertificateData from an x509 Certificate. 
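// A minimal usage sketch (mirrors what ParseCertificateEntry does with the DER
// bytes it extracts; the log URL is a placeholder):
//
//	cert, err := x509.ParseCertificate(derBytes)
//	if err == nil {
//		cd := CertificateFromX509(cert, "ct.example.invalid/2025")
//		fmt.Println(cd.SortedNormalizedDomains())
//	}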
226 | func CertificateFromX509(cert *x509.Certificate, source string) *CertificateData {
227 | cd := &CertificateData{
228 | Type: "X509LogEntry",
229 | Subject: SubjectData{
230 | Aggregated: cert.Subject.String(),
231 | CN: cert.Subject.CommonName,
232 | },
233 | Issuer: SubjectData{
234 | Aggregated: cert.Issuer.String(),
235 | CN: cert.Issuer.CommonName,
236 | },
237 | NotBefore: cert.NotBefore.Unix(),
238 | NotAfter: cert.NotAfter.Unix(),
239 | Source: map[string]string{"url": source},
240 | Extensions: make(map[string]string),
241 | }
242 | if len(cert.Subject.Country) > 0 {
243 | cd.Subject.C = cert.Subject.Country[0]
244 | }
245 | if len(cert.Subject.Organization) > 0 {
246 | cd.Subject.O = cert.Subject.Organization[0]
247 | }
248 | if len(cert.Subject.OrganizationalUnit) > 0 {
249 | cd.Subject.OU = cert.Subject.OrganizationalUnit[0]
250 | }
251 | if len(cert.Subject.Locality) > 0 {
252 | cd.Subject.L = cert.Subject.Locality[0]
253 | }
254 | if len(cert.Subject.Province) > 0 {
255 | cd.Subject.ST = cert.Subject.Province[0]
256 | }
257 | if len(cert.Issuer.Country) > 0 {
258 | cd.Issuer.C = cert.Issuer.Country[0]
259 | }
260 | if len(cert.Issuer.Organization) > 0 {
261 | cd.Issuer.O = cert.Issuer.Organization[0]
262 | }
263 | derBytes := cert.Raw
264 | cd.AsDER = base64.StdEncoding.EncodeToString(derBytes)
265 | domains := make([]string, 0, len(cert.DNSNames)+1)
266 | if cert.Subject.CommonName != "" {
267 | domains = append(domains, cert.Subject.CommonName)
268 | }
269 | domains = append(domains, cert.DNSNames...)
270 | seenDomains := make(map[string]bool, len(domains))
271 | cd.AllDomains = make([]string, 0, len(domains))
272 | for _, domain := range domains {
273 | if !seenDomains[domain] {
274 | seenDomains[domain] = true
275 | cd.AllDomains = append(cd.AllDomains, domain)
276 | }
277 | }
278 | return cd
279 | }
280 | 
281 | // NormalizeDomain standardizes domain names.
282 | func NormalizeDomain(domain string) string {
283 | domain = strings.TrimSpace(domain)
284 | if domain == "" || strings.ContainsAny(domain, " \t\n") {
285 | if strings.ContainsAny(domain, " :/") || domain == "::1" || strings.HasPrefix(domain, "-") {
286 | return domain
287 | }
288 | return ""
289 | }
290 | domain = strings.ToLower(domain)
291 | for strings.HasPrefix(domain, ".") {
292 | domain = domain[1:]
293 | }
294 | for strings.HasSuffix(domain, ".") {
295 | domain = domain[:len(domain)-1]
296 | }
297 | if domain == "" {
298 | return ""
299 | }
300 | // Keep a leading wildcard label ("*.") intact: ToDomainsCSVLine strips it for its output column,
301 | // and the normalization tests expect wildcards to survive normalization unchanged
302 | // (see the "Wildcard" cases in TestNormalizeDomain).
303 | parts := strings.SplitSeq(domain, ".")
304 | for part := range parts {
305 | if strings.HasPrefix(part, "-") || strings.HasSuffix(part, "-") || strings.HasPrefix(part, "*") {
306 | return domain // Structurally odd labels are returned unchanged
307 | }
308 | }
309 | return domain
310 | }
311 | -------------------------------------------------------------------------------- /internal/certlib/models_test.go: --------------------------------------------------------------------------------
1 | package certlib
2 | 
3 | /*
4 | rxtls — fast tool in Go for working with Certificate Transparency logs
5 | Copyright (C) 2025 Pepijn van der Stap
6 | 
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU Affero General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "fmt" 23 | "sort" 24 | "strings" 25 | "testing" 26 | 27 | "github.com/zeebo/xxh3" 28 | ) 29 | 30 | // calculateExpectedDomainOrgHash is a test helper using xxh3. 31 | func calculateExpectedDomainOrgHash(domains []string, org string) string { 32 | uniqueMap := make(map[string]bool) 33 | var normalizedDomains []string 34 | for _, d := range domains { 35 | n := NormalizeDomain(d) 36 | if n != "" && !uniqueMap[n] { 37 | uniqueMap[n] = true 38 | normalizedDomains = append(normalizedDomains, n) 39 | } 40 | } 41 | sort.Strings(normalizedDomains) 42 | domainsStr := strings.Join(normalizedDomains, ",") 43 | h := xxh3.HashString(fmt.Sprintf("%s|%s", domainsStr, org)) 44 | return fmt.Sprintf("%x", h) 45 | } 46 | 47 | // TestToDomainsCSVLine validates the domain-focused CSV output. 48 | func TestToDomainsCSVLine(t *testing.T) { 49 | t.Parallel() 50 | certIndex := 12345 51 | testCases := []struct { 52 | name string 53 | certData CertificateData 54 | }{ 55 | { 56 | name: "Simple CN, single SAN", 57 | certData: CertificateData{ 58 | Subject: SubjectData{CN: "example.com", O: "Test Org Inc.", C: "US", ST: "California", L: "Mountain View"}, 59 | Issuer: SubjectData{CN: "Test CA"}, 60 | AllDomains: []string{"example.com", "www.example.com"}, 61 | }, 62 | }, 63 | { 64 | name: "Mixed case, trailing dots, duplicate SAN", 65 | certData: CertificateData{ 66 | Subject: SubjectData{CN: "EXAMPLE.net.", O: "Another, Org", C: "GB", ST: "", L: "London"}, 67 | Issuer: SubjectData{CN: "Issuing CA Ltd.", O: "Issuer Org"}, 68 | AllDomains: []string{"EXAMPLE.net.", "WWW.example.net", "www.example.net"}, 69 | }, 70 | }, 71 | { 72 | name: "Wildcard domain (gets stripped in output list)", 73 | certData: CertificateData{ 74 | Subject: SubjectData{CN: "*.example.org", O: "Wild Org", C: "", ST: "", L: ""}, 75 | Issuer: SubjectData{CN: "Wild CA"}, 76 | AllDomains: []string{"*.example.org", "example.org"}, 77 | }, 78 | }, 79 | { 80 | name: "No CN, only SANs", 81 | certData: CertificateData{ 82 | Subject: SubjectData{CN: "", O: "SAN Org", C: "DE", ST: "Berlin", L: "Berlin"}, 83 | Issuer: SubjectData{CN: "SAN Issuer"}, 84 | AllDomains: []string{"san1.com", "san2.com"}, 85 | }, 86 | }, 87 | { 88 | name: "No domains at all", 89 | certData: CertificateData{ 90 | Subject: SubjectData{CN: "", O: "Empty Org", C: "JP", ST: "Tokyo", L: "Tokyo"}, 91 | Issuer: SubjectData{CN: "Empty Issuer"}, 92 | AllDomains: []string{}, 93 | }, 94 | }, 95 | { 96 | name: "CN needs normalization, SAN is primary", 97 | certData: CertificateData{ 98 | Subject: SubjectData{CN: " INVALID CN ", O: "Norm Org", C: "CA", ST: "Ontario", L: "Toronto"}, 99 | Issuer: SubjectData{CN: "Norm CA"}, 100 | AllDomains: []string{" a.valid.domain ", " INVALID CN "}, 101 | }, 102 | }, 103 | } 104 | 105 | for _, tc := range testCases { 106 | tc := tc 107 | t.Run(tc.name, func(t *testing.T) { 108 | t.Parallel() 109 | expectedNormalizedCN := NormalizeDomain(tc.certData.Subject.CN) 110 | normalizedSortedDomains := tc.certData.SortedNormalizedDomains() 111 | expectedOutputDomains := make([]string, len(normalizedSortedDomains)) 112 | for i, d := range 
normalizedSortedDomains { 113 | if strings.HasPrefix(d, "*.") { 114 | expectedOutputDomains[i] = d[2:] 115 | } else { 116 | expectedOutputDomains[i] = d 117 | } 118 | } 119 | expectedOutputDomainsStr := strings.Join(expectedOutputDomains, ",") 120 | expectedPrimaryDomain := "" 121 | if len(normalizedSortedDomains) > 0 { 122 | expectedPrimaryDomain = normalizedSortedDomains[0] 123 | } 124 | hashExpected := calculateExpectedDomainOrgHash(tc.certData.AllDomains, tc.certData.Subject.O) 125 | expectedOutput := fmt.Sprintf("%d,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",%s\n", 126 | certIndex, expectedNormalizedCN, expectedPrimaryDomain, expectedOutputDomainsStr, 127 | tc.certData.Subject.C, tc.certData.Subject.ST, tc.certData.Subject.L, tc.certData.Subject.O, 128 | tc.certData.Issuer.CN, hashExpected) 129 | actualOutput := tc.certData.ToDomainsCSVLine(certIndex) 130 | if actualOutput != expectedOutput { 131 | t.Errorf("ToDomainsCSVLine() mismatch:\n Input: %+v\n Want: %q\n Got: %q", tc.certData, expectedOutput, actualOutput) 132 | } 133 | }) 134 | } 135 | } 136 | 137 | // BenchmarkSortedNormalizedDomains measures performance of getting unique, sorted, 138 | // normalized domains from a CertificateData struct with a large SAN list. 139 | func BenchmarkSortedNormalizedDomains(b *testing.B) { 140 | size := 100000 141 | allDomains := make([]string, size) 142 | for i := range size { 143 | prefix := "" 144 | suffix := ".com" 145 | if i%10 == 0 { 146 | prefix = "*.Sub." 147 | suffix = ".NET." 148 | } 149 | if i%3 == 0 { 150 | prefix += " " 151 | } 152 | baseDomain := fmt.Sprintf("%sexample-%d-%d%s", prefix, i%1000, i%50, suffix) 153 | if i > 0 && i%7 == 0 { 154 | allDomains[i] = allDomains[i-1] 155 | } else { 156 | allDomains[i] = baseDomain 157 | } 158 | } 159 | certData := &CertificateData{ 160 | AllDomains: allDomains, 161 | Subject: SubjectData{O: "Benchmark Org"}, 162 | } 163 | b.ReportAllocs() 164 | b.ResetTimer() 165 | for i := 0; i < b.N; i++ { 166 | _ = certData.SortedNormalizedDomains() 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /internal/client/http.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | /* 22 | Package client provides a configurable HTTP client for making requests to Certificate Transparency logs and other services. 23 | It includes support for connection pooling, timeouts, and a "turbo" mode for aggressive, high-throughput scenarios. 24 | 25 | The package manages a shared global HTTP client instance that can be configured once and then retrieved by multiple 26 | parts of the application. This promotes reuse of TCP connections and consistent client behavior. 
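A minimal usage sketch; the endpoint URL below is purely illustrative and error handling is abbreviated:

	client.InitHTTPClient(client.DefaultConfig())
	httpClient := client.GetHTTPClient()
	resp, err := httpClient.Get("https://ct.example.org/ct/v1/get-sth")
	if err != nil {
		return err
	}
	defer resp.Body.Close()

For aggressive, massively parallel fetching, ConfigureTurboMode swaps in larger connection pools and longer keep-alive/idle timeouts in a single call.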
27 | */ 28 | 29 | import ( 30 | "net" 31 | "net/http" 32 | "sync" 33 | "time" 34 | ) 35 | 36 | // HTTP client-specific constants. 37 | const ( 38 | // DialTimeout is the maximum amount of time a dial will wait for a connect to complete. 39 | DialTimeout = 5 * time.Second 40 | // KeepAliveTimeout is the interval between keep-alive probes for active network connections. 41 | // If zero, keep-alive probes are sent with a default OS-dependent interval. 42 | KeepAliveTimeout = 60 * time.Second 43 | // RequestTimeout is the timeout for the entire HTTP request, including connection time, all redirects, and reading the response body. 44 | RequestTimeout = 15 * time.Second 45 | // MaxIdleConnsPerHost is the maximum number of idle (keep-alive) connections to keep per-host. 46 | MaxIdleConnsPerHost = 150 // Default value, can be overridden by Config. 47 | ) 48 | 49 | var ( 50 | // defaultDialTimeout specifies the default timeout for establishing a new connection. 51 | defaultDialTimeout = 5 * time.Second 52 | // defaultKeepAliveTimeout specifies the default keep-alive period for an active network connection. 53 | defaultKeepAliveTimeout = 60 * time.Second 54 | // defaultIdleConnTimeout is the maximum amount of time an idle (keep-alive) connection will remain 55 | // idle before closing itself. 56 | defaultIdleConnTimeout = 90 * time.Second 57 | // defaultMaxIdleConns controls the maximum number of idle (keep-alive) connections across all hosts. 58 | defaultMaxIdleConns = 100 59 | // defaultMaxConnsPerHost controls the maximum number of connections per host (includes dial, active, and idle). 60 | defaultMaxConnsPerHost = 100 61 | // defaultRequestTimeout specifies the default timeout for a complete HTTP request. 62 | defaultRequestTimeout = 15 * time.Second 63 | 64 | // sharedClient is the global HTTP client instance used by the application. 65 | // It is lazily initialized on first use or when explicitly configured. 66 | sharedClient *http.Client 67 | // sharedClientLock protects access to sharedClient and clientInitialized. 68 | sharedClientLock sync.RWMutex 69 | // clientInitialized indicates whether the sharedClient has been initialized. 70 | clientInitialized bool 71 | ) 72 | 73 | // Config holds configuration parameters for the HTTP client. 74 | // These settings allow tuning of connection pooling, timeouts, and other transport-level behaviors. 75 | // A zero-value Config will result in default settings being used. 76 | type Config struct { 77 | // DialTimeout is the maximum duration for establishing a new connection. 78 | DialTimeout time.Duration 79 | // KeepAliveTimeout specifies the keep-alive period for an active network connection. 80 | KeepAliveTimeout time.Duration 81 | // IdleConnTimeout is the maximum amount of time an idle (keep-alive) connection 82 | // will remain idle before closing itself. 83 | IdleConnTimeout time.Duration 84 | // MaxIdleConns controls the maximum number of idle (keep-alive) connections across all hosts. 85 | MaxIdleConns int 86 | // MaxConnsPerHost controls the maximum number of connections per host, including connections in the dialing, 87 | // active, and idle states. On limit violation, dials will block. 88 | MaxConnsPerHost int 89 | // RequestTimeout is the timeout for the entire HTTP request, including connection time, 90 | // all redirects, and reading the response body. 91 | RequestTimeout time.Duration 92 | } 93 | 94 | // DefaultConfig returns a new Config struct populated with default HTTP client settings. 
95 | // These defaults are sensible for general-purpose HTTP interactions but may need tuning 96 | // for specific high-performance or constrained environments. 97 | func DefaultConfig() *Config { 98 | return &Config{ 99 | DialTimeout: defaultDialTimeout, 100 | KeepAliveTimeout: defaultKeepAliveTimeout, 101 | IdleConnTimeout: defaultIdleConnTimeout, 102 | MaxIdleConns: defaultMaxIdleConns, 103 | MaxConnsPerHost: defaultMaxConnsPerHost, 104 | RequestTimeout: defaultRequestTimeout, 105 | } 106 | } 107 | 108 | // InitHTTPClient initializes or reconfigures the shared global HTTP client with the provided configuration. 109 | // If a nil config is provided, it uses the default configuration obtained from DefaultConfig(). 110 | // This function is thread-safe. 111 | // 112 | // Note: Calling this function will replace the existing shared client, potentially affecting 113 | // in-flight requests made with the old client if its transport was not reusable or if connections 114 | // were specific to the old transport's settings. 115 | func InitHTTPClient(config *Config) { 116 | sharedClientLock.Lock() 117 | defer sharedClientLock.Unlock() 118 | 119 | if config == nil { 120 | config = DefaultConfig() 121 | } 122 | 123 | // Configure the transport with timeouts and connection pooling options. 124 | // ForceAttemptHTTP2 is enabled to prefer HTTP/2 if available. 125 | transport := &http.Transport{ 126 | Proxy: http.ProxyFromEnvironment, // Respect standard proxy environment variables. 127 | DialContext: (&net.Dialer{ 128 | Timeout: config.DialTimeout, 129 | KeepAlive: config.KeepAliveTimeout, // Enables TCP keep-alives. 130 | }).DialContext, 131 | MaxIdleConns: config.MaxIdleConns, 132 | MaxIdleConnsPerHost: config.MaxConnsPerHost, 133 | IdleConnTimeout: config.IdleConnTimeout, 134 | DisableCompression: false, // Enable compression (e.g., gzip) by default. 135 | ForceAttemptHTTP2: true, // Try to use HTTP/2. 136 | } 137 | 138 | sharedClient = &http.Client{ 139 | Transport: transport, 140 | Timeout: config.RequestTimeout, // Overall request timeout. 141 | } 142 | 143 | clientInitialized = true 144 | } 145 | 146 | // GetHTTPClient returns the shared global HTTP client instance. 147 | // If the client has not been initialized, it will be initialized with default settings. 148 | // This function is thread-safe. 149 | func GetHTTPClient() *http.Client { 150 | sharedClientLock.RLock() // Use RLock for initial check to allow concurrent reads. 151 | if !clientInitialized { 152 | sharedClientLock.RUnlock() 153 | // Client not initialized, need to acquire a write lock. 154 | // This double-check locking pattern minimizes write lock contention. 155 | InitHTTPClient(nil) // Initialize with defaults under a write lock. 156 | sharedClientLock.RLock() // Re-acquire read lock to safely access sharedClient. 157 | } 158 | client := sharedClient 159 | sharedClientLock.RUnlock() 160 | return client 161 | } 162 | 163 | // ConfigureHTTPClient provides a convenience function to update the shared HTTP client's configuration. 164 | // It's equivalent to calling InitHTTPClient. 165 | // This function is thread-safe. 166 | func ConfigureHTTPClient(config *Config) { 167 | InitHTTPClient(config) // InitHTTPClient handles locking. 168 | } 169 | 170 | // ConfigureTurboMode applies a set of aggressive HTTP client settings optimized for 171 | // high-throughput scenarios, such as massively parallel log fetching. 
172 | // This typically involves shorter dial timeouts, longer keep-alive and idle timeouts, 173 | // and higher connection pool limits. 174 | // This function is thread-safe. 175 | func ConfigureTurboMode() { 176 | turboConfig := &Config{ 177 | DialTimeout: 2 * time.Second, // Faster dial attempts. 178 | KeepAliveTimeout: 120 * time.Second, // Keep connections alive longer. 179 | IdleConnTimeout: 120 * time.Second, // Allow idle connections to persist longer. 180 | MaxIdleConns: 500, // Larger overall idle connection pool. 181 | MaxConnsPerHost: 200, // More connections allowed per host. 182 | RequestTimeout: 30 * time.Second, // Slightly longer request timeout for potentially slower turbo operations. 183 | } 184 | ConfigureHTTPClient(turboConfig) 185 | } 186 | -------------------------------------------------------------------------------- /internal/core/common.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | 5 | Key responsibilities of the core package include: 6 | - Managing concurrent operations through a worker pool (Scheduler). 7 | - Orchestrating the download of certificate entries from Certificate Transparency logs (DownloadManager). 8 | - Processing downloaded entries to extract domain names and other relevant metadata (DomainExtractor - if used). 9 | - Defining shared data types like WorkItem and CTLogInfo (though CTLogInfo is primarily from certlib). 10 | - Establishing common constants for retry logic, queue sizes, and default behaviors. 11 | */ 12 | package core 13 | 14 | import ( 15 | "bufio" 16 | "context" 17 | "os" 18 | "sync" 19 | "time" 20 | 21 | "github.com/x-stp/rxtls/internal/certlib" 22 | ) 23 | 24 | // Common constants used across the core package. 25 | // These values configure aspects like worker queue capacities, scheduler behavior, and retry policies. 26 | const ( 27 | // WorkerQueueCapacity defines the maximum number of work items that can be buffered in a single worker's queue. 28 | // A larger capacity can absorb more bursty workloads but consumes more memory. 29 | WorkerQueueCapacity = 500000 30 | 31 | // MaxShardQueueSize is the maximum size of a shard's queue in the scheduler. 32 | // This is used when initializing workers and their individual limiter burst sizes. 33 | // It defines how many items can be enqueued for a specific shard (log URL hash) before backpressure occurs. 34 | MaxShardQueueSize = 1000 35 | 36 | // WorkerMultiplier determines the number of worker goroutines relative to the number of CPU cores. 37 | // For example, a multiplier of 2 on an 8-core machine would create 16 workers. 38 | WorkerMultiplier = 2 39 | 40 | // RetryBaseDelay is the initial delay before the first retry attempt for a failed operation. 41 | // Subsequent retries use exponential backoff based on this delay. 42 | RetryBaseDelay = 125 * time.Millisecond 43 | // RetryMaxDelay is the maximum delay between retry attempts, capping the exponential backoff. 44 | RetryMaxDelay = 30 * time.Second 45 | // RetryBackoffMultiplier is the factor by which the retry delay increases after each failed attempt. 46 | RetryBackoffMultiplier = 1.5 47 | // RetryJitterFactor introduces randomness to retry delays to prevent thundering herd problems. 
48 | // The actual jitter is calculated as a percentage of the current delay (e.g., 0.2 means +/- 20% jitter). 49 | RetryJitterFactor = 0.2 50 | ) 51 | 52 | // WorkItem represents a discrete unit of work to be processed by a worker in the scheduler. 53 | // It encapsulates all necessary information for a task, including the target log, entry range, 54 | // callback function, and retry state. 55 | // WorkItems are typically pooled and reused to reduce allocations. 56 | type WorkItem struct { 57 | // Immutable fields, set at creation and not changed during the WorkItem's lifecycle. 58 | 59 | // LogURL is the URL of the Certificate Transparency log server for this work item. 60 | LogURL string 61 | // LogInfo provides detailed metadata about the CT log, such as its tree size and block size. 62 | // This is a pointer to a shared certlib.CTLogInfo struct. 63 | LogInfo *certlib.CTLogInfo 64 | // Start is the starting index of the certificate entry range for this work item. 65 | Start int64 66 | // End is the ending index (inclusive) of the certificate entry range. 67 | End int64 68 | // Callback is the function that will be executed by a worker to process this WorkItem. 69 | // It takes the WorkItem itself as an argument and returns an error if processing fails. 70 | Callback WorkCallback 71 | // Ctx is the context associated with this specific work item. It can be used for cancellation 72 | // that is specific to this item, separate from the broader scheduler or worker context. 73 | Ctx context.Context 74 | // CreatedAt records the time when the WorkItem was initially created or retrieved from a pool. 75 | // Useful for tracking queue latency or item age. 76 | CreatedAt time.Time 77 | 78 | // Mutable fields, potentially modified during processing or retry attempts. 79 | 80 | // Attempt is the number of times this WorkItem has been attempted. 81 | // Starts at 0 for the first attempt. 82 | Attempt int 83 | // Error stores any error encountered during the execution of the Callback function. 84 | // It is nil if the callback was successful. 85 | Error error 86 | } 87 | 88 | // WorkCallback defines the signature for functions that can process a WorkItem. 89 | // These functions are executed by the scheduler's worker goroutines. 90 | // The WorkItem itself is passed as an argument, allowing the callback to access 91 | // log information, entry ranges, and its own context. 92 | // An error should be returned if the processing fails, which may trigger retry logic. 93 | type WorkCallback func(item *WorkItem) error 94 | 95 | // lockedWriter provides a thread-safe wrapper around a bufio.Writer, typically used for 96 | // writing output to files concurrently from multiple goroutines. 97 | // It embeds a sync.Mutex to protect access to the underlying writer and associated file resources. 98 | // 99 | // Fields for filePath and finalPath are included to support atomic-like file operations 100 | // where data is written to a temporary file and then renamed to its final destination upon 101 | // successful completion, preventing partially written or corrupt files from being visible. 102 | type lockedWriter struct { 103 | // writer is the buffered writer used for efficient I/O. 104 | writer *bufio.Writer 105 | // gzWriter is an optional gzip.Writer, used if output compression is enabled. 106 | // It implements the io.Closer interface for proper resource release. 107 | gzWriter interface{ Close() error } 108 | // file is the underlying os.File being written to.
109 | file *os.File 110 | // mu is the mutex protecting concurrent access to the writer, gzWriter, and file. 111 | mu sync.Mutex 112 | // filePath is the path to the temporary file being written. 113 | filePath string 114 | // finalPath is the intended final path for the file after all writes are complete and successful. 115 | finalPath string 116 | } 117 | -------------------------------------------------------------------------------- /internal/core/constants.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core constants that are not specific to a single manager/component but are shared across the core logic. 3 | This file centralizes various configurable parameters related to memory management, networking behavior, 4 | CT log interaction defaults, disk I/O, and observability. 5 | 6 | These constants are intended to provide sensible defaults and can be tuned for different performance profiles 7 | or operational environments. They are distinct from the very fundamental constants defined in common.go 8 | (like worker multipliers or base retry delays) and focus more on higher-level application behavior settings. 9 | */ 10 | package core 11 | 12 | /* 13 | rxtls — fast tool in Go for working with Certificate Transparency logs 14 | Copyright (C) 2025 Pepijn van der Stap 15 | 16 | This program is free software: you can redistribute it and/or modify 17 | it under the terms of the GNU Affero General Public License as published by 18 | the Free Software Foundation, either version 3 of the License, or 19 | (at your option) any later version. 20 | 21 | This program is distributed in the hope that it will be useful, 22 | but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | GNU Affero General Public License for more details. 25 | 26 | You should have received a copy of the GNU Affero General Public License 27 | along with this program. If not, see . 28 | */ 29 | 30 | import ( 31 | "time" 32 | ) 33 | 34 | // Application-wide constants for tuning performance and behavior. 35 | const ( 36 | // --- Memory --- 37 | 38 | // MaxWorkers defines the absolute upper limit on the number of concurrent worker goroutines 39 | // that the scheduler will create. This acts as a safeguard regardless of CPU core count or multipliers. 40 | MaxWorkers = 2048 41 | 42 | // DefaultShards specifies the default number of shards used by the scheduler for distributing 43 | // work based on log URL hashing. This helps in balancing load across workers. 44 | // This value is not directly used by the current scheduler implementation, which shards by numWorkers. 45 | DefaultShards = 32 // TODO: Re-evaluate or remove if scheduler sharding remains worker-based. 46 | 47 | // CacheLineSize is a common CPU cache line size in bytes. It's used as a guideline for padding 48 | // in data structures to help prevent false sharing when multiple CPU cores access adjacent memory locations. 49 | CacheLineSize = 64 50 | 51 | // DefaultNetworkBufferSize is the default size for buffers used in network read operations. 52 | // Larger buffers can reduce the number of read syscalls but increase memory footprint. 53 | DefaultNetworkBufferSize = 256 * 1024 // 256KB 54 | 55 | // DefaultDiskBufferSize is the default size for `bufio.Writer` instances used for disk I/O. 56 | // Similar to network buffers, this trades memory for potentially fewer write syscalls. 
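// As a rough illustration: sixteen output files being written concurrently at
// this buffer size hold about 16 × 256KB ≈ 4MB of data in memory before a flush.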
57 | DefaultDiskBufferSize = 256 * 1024 // 256KB 58 | 59 | // CertProcessingBatchSize dictates how many certificates are grouped together for logical processing steps, 60 | // such as batching writes to disk or updating progress metrics. 61 | CertProcessingBatchSize = 1024 * 10 62 | 63 | // --- Networking --- 64 | 65 | // MaxNetworkRetries specifies the maximum number of times a failed network operation 66 | // (like fetching STH or log entries) will be retried by components in `certlib`. 67 | MaxNetworkRetries = 6 68 | 69 | // MaxSubmitRetries is the maximum number of times a component (like DownloadManager or DomainExtractor) 70 | // will attempt to submit a work item to a worker's queue if it's initially full (ErrQueueFull). 71 | // This is for retrying the *submission* to the queue, not the work item execution itself. 72 | MaxSubmitRetries = 2 // Reduced from 5 as queue full should be handled by rate limiting ideally. 73 | 74 | // DialTimeout limits the time spent establishing a new TCP connection to a remote server. 75 | DialTimeout = 10 * time.Second 76 | 77 | // RequestTimeout sets the maximum duration for an entire HTTP request, encompassing 78 | // connection establishment, sending the request, and receiving the full response body. 79 | // This is typically applied at the http.Client level. 80 | RequestTimeout = 15 * time.Second 81 | 82 | // KeepAliveTimeout defines the keep-alive period for an active network connection. 83 | // This is used by the net.Dialer to configure TCP keep-alives. 84 | KeepAliveTimeout = 60 * time.Second 85 | 86 | // ReadTimeout is the maximum duration for reading the next chunk of data from a connection 87 | // after a successful connection and request send. Not directly used by client, but a common HTTP server setting. 88 | ReadTimeout = 15 * time.Second // Typically a server-side setting or per-request on client. 89 | 90 | // IdleConnTimeout is the maximum amount of time an idle (keep-alive) connection will remain 91 | // in the HTTP client's connection pool before being closed. 92 | IdleConnTimeout = 120 * time.Second 93 | 94 | // ResponseHeaderTimeout limits the time spent waiting to receive the complete response headers 95 | // from the server after the request has been sent. 96 | ResponseHeaderTimeout = 15 * time.Second 97 | 98 | // MaxIdleConnsPerHost controls the maximum number of idle connections that will be maintained 99 | // in the pool for any single host. This helps prevent resource exhaustion when interacting 100 | // with many different hosts. 101 | MaxIdleConnsPerHost = 55 102 | 103 | // DefaultRequestTimeout is a general default timeout for HTTP requests, potentially used 104 | // by components that don't have a more specific timeout configured. 105 | // It's similar to RequestTimeout but might be used as a fallback. 106 | DefaultRequestTimeout = 30 * time.Second 107 | 108 | // --- CT Log Specific --- 109 | 110 | // DefaultLogEntryBlockSize is the number of entries to request in a single `get-entries` 111 | // call if the CT log does not specify its own preferred block size (max_entries_per_get). 112 | DefaultLogEntryBlockSize = 64 113 | 114 | // DefaultBatchSize defines a common batch size for fetching entries from CT logs. 115 | // This is often a multiple of the log's block size. 116 | DefaultBatchSize = 1024 * 4 117 | 118 | // DefaultMaxParallelBatches sets a soft limit on how many batches of log entries 119 | // might be processed in parallel by the application. This can help manage memory and CPU load. 
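// As a point of reference, at DefaultBatchSize (4,096 entries) per batch, 50
// parallel batches correspond to roughly 200,000 entries in flight at once.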
120 | DefaultMaxParallelBatches = 50 // This constant appears to be for higher-level batching strategy. 121 | 122 | // MaxConcurrentDownloadsPerHost limits how many concurrent `get-entries` requests rxtls 123 | // will make to a single CT log server host. This is crucial for being a good network citizen. 124 | // This would typically be enforced by the HTTP client's MaxConnsPerHost or similar, or custom logic. 125 | MaxConcurrentDownloadsPerHost = 50 126 | 127 | // MaxRetries defines the maximum number of retries for failed network operations. 128 | // This is similar to MaxNetworkRetries but might be used by different components with different retry policies. 129 | MaxRetries = 5 130 | 131 | // --- Disk I/O --- 132 | 133 | // DiskFlushBatchSize indicates how many *processed* certificate entries should trigger 134 | // a flush of the output file buffer to disk. This helps ensure data is persisted regularly. 135 | DiskFlushBatchSize = CertProcessingBatchSize 136 | 137 | // --- Observability --- 138 | 139 | // RequestHistorySize is the number of recent network request details to retain in memory 140 | // for observability or debugging purposes (e.g., for a live dashboard or error analysis). 141 | RequestHistorySize = 1000 // Currently not implemented, but a common pattern. 142 | 143 | // LogHistorySize determines the number of recent log messages to keep in an in-memory buffer 144 | // for potential display or inspection, especially in UIs or diagnostic tools. 145 | LogHistorySize = 5000 // Currently not implemented. 146 | 147 | // StatsReportInterval specifies how frequently summary statistics (e.g., download progress, 148 | // processing rates) should be reported, typically to standard output or a log file. 149 | StatsReportInterval = 10 * time.Second 150 | 151 | // MinimumProgressLoggingInterval defines the minimum time that must elapse between 152 | // progress log updates to avoid flooding logs with too frequent updates. 153 | MinimumProgressLoggingInterval = 5 * time.Second 154 | ) 155 | -------------------------------------------------------------------------------- /internal/core/download_manager.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 
19 | */ 20 | 21 | import ( 22 | "bufio" 23 | "compress/gzip" 24 | "context" 25 | "errors" 26 | "fmt" 27 | "log" 28 | "math/rand" 29 | "os" 30 | "path/filepath" 31 | "runtime" 32 | "strings" 33 | "sync" 34 | "sync/atomic" 35 | "time" 36 | 37 | "github.com/x-stp/rxtls/internal/certlib" 38 | "github.com/x-stp/rxtls/internal/util" 39 | 40 | "github.com/zeebo/xxh3" 41 | ) 42 | 43 | // Constants for download performance 44 | const ( 45 | // OutputFlushInterval is how often to flush buffers to disk 46 | OutputFlushInterval = 5 * time.Second 47 | 48 | // Setup concurrency maximums 49 | MaxSetupConcurrency = 16 50 | 51 | // Writer buffer sizes 52 | DefaultBufferSize = 8 * 1024 * 1024 // 8MB 53 | 54 | // Memory pool size for string building 55 | StringPoolSize = 1024 * 1024 // 1MB 56 | 57 | // Distribution strategy - submit in batches to allow better parallelism 58 | batchSize int64 = 100 // Submit blocks in batches 59 | ) 60 | 61 | // Error types specific to download operations 62 | var ( 63 | ErrDownloadCancelled = errors.New("download operation cancelled") 64 | ErrLogSetupFailed = errors.New("log setup failed") 65 | ErrDownloadFailed = errors.New("download failed") 66 | ) 67 | 68 | // DownloadManager manages the process of downloading raw cert entries from CT logs. 69 | type DownloadManager struct { 70 | scheduler *Scheduler 71 | config *DownloadConfig 72 | stats *DownloadStats 73 | ctx context.Context 74 | cancel context.CancelFunc 75 | outputMap sync.Map // Maps log URL -> *lockedWriter 76 | stringPool sync.Pool // Reusable string builders 77 | setupComplete atomic.Bool 78 | } 79 | 80 | // DownloadConfig holds configuration for downloading. 81 | type DownloadConfig struct { 82 | OutputDir string 83 | BufferSize int 84 | MaxConcurrentLogs int 85 | CompressOutput bool // If true, output files will be .gz 86 | } 87 | 88 | // DownloadStats holds runtime statistics for downloads. 
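// The counter fields are atomic, so a *DownloadStats can be read while downloads
// are still in progress; the Get* accessors below are the convenient way to do so.
// For example, given a manager dm, a rough throughput figure (entries per second)
// can be computed as (illustrative):
//
//	stats := dm.GetStats()
//	rate := float64(stats.GetProcessedEntries()) / time.Since(stats.GetStartTime()).Seconds()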
89 | type DownloadStats struct { 90 | TotalLogs atomic.Int64 91 | ProcessedLogs atomic.Int64 92 | FailedLogs atomic.Int64 93 | TotalEntries atomic.Int64 94 | ProcessedEntries atomic.Int64 // Entries successfully fetched and written 95 | FailedEntries atomic.Int64 // Entries failed (download, parse leaf, write) 96 | OutputBytesWritten atomic.Int64 97 | StartTime time.Time 98 | RetryCount atomic.Int64 // Count of retried blocks 99 | SuccessFirstTry atomic.Int64 // Count of blocks successful on first try 100 | } 101 | 102 | // GetStartTime returns the start time of the download process 103 | func (s *DownloadStats) GetStartTime() time.Time { return s.StartTime } 104 | 105 | // GetTotalLogs returns the total number of logs being processed 106 | func (s *DownloadStats) GetTotalLogs() int64 { return s.TotalLogs.Load() } 107 | 108 | // GetProcessedLogs returns the number of logs successfully processed 109 | func (s *DownloadStats) GetProcessedLogs() int64 { return s.ProcessedLogs.Load() } 110 | 111 | // GetFailedLogs returns the number of logs that failed processing 112 | func (s *DownloadStats) GetFailedLogs() int64 { return s.FailedLogs.Load() } 113 | 114 | // GetTotalEntries returns the total number of entries to be processed 115 | func (s *DownloadStats) GetTotalEntries() int64 { return s.TotalEntries.Load() } 116 | 117 | // GetProcessedEntries returns the number of entries successfully processed 118 | func (s *DownloadStats) GetProcessedEntries() int64 { return s.ProcessedEntries.Load() } 119 | 120 | // GetFailedEntries returns the number of entries that failed processing 121 | func (s *DownloadStats) GetFailedEntries() int64 { return s.FailedEntries.Load() } 122 | 123 | // GetOutputBytesWritten returns the total bytes written to output files 124 | func (s *DownloadStats) GetOutputBytesWritten() int64 { return s.OutputBytesWritten.Load() } 125 | 126 | // GetTotalDomainsFound returns the total domains found (not applicable for download stats) 127 | func (s *DownloadStats) GetTotalDomainsFound() int64 { return 0 } 128 | 129 | // GetRetryRate returns the retry rate as a fraction of processed entries 130 | func (s *DownloadStats) GetRetryRate() float64 { 131 | if s.ProcessedEntries.Load() == 0 { 132 | return 0 133 | } 134 | return float64(s.RetryCount.Load()) / float64(s.ProcessedEntries.Load()) 135 | } 136 | 137 | // NewDownloadManager creates a new download manager instance. 
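// A minimal construction sketch; ctx is assumed to be a cancellable parent
// context and the output directory is only an example:
//
//	dm, err := NewDownloadManager(ctx, &DownloadConfig{
//		OutputDir:      "certs-out",
//		CompressOutput: true,
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer dm.Shutdown()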
138 | func NewDownloadManager(ctx context.Context, config *DownloadConfig) (*DownloadManager, error) { 139 | scheduler, err := NewScheduler(ctx) 140 | if err != nil { 141 | return nil, fmt.Errorf("failed to initialize scheduler: %w", err) 142 | } 143 | 144 | // Set a sensible default buffer size if not specified 145 | if config.BufferSize <= 0 { 146 | config.BufferSize = DefaultBufferSize 147 | } 148 | 149 | dmCtx, cancel := context.WithCancel(ctx) 150 | dm := &DownloadManager{ 151 | scheduler: scheduler, 152 | config: config, 153 | stats: &DownloadStats{StartTime: time.Now()}, 154 | ctx: dmCtx, 155 | cancel: cancel, 156 | stringPool: sync.Pool{ 157 | New: func() interface{} { 158 | return &strings.Builder{} 159 | }, 160 | }, 161 | } 162 | 163 | // Start background flush worker 164 | go dm.periodicFlush() 165 | 166 | return dm, nil 167 | } 168 | 169 | // periodicFlush runs in background to periodically flush output files 170 | func (dm *DownloadManager) periodicFlush() { 171 | ticker := time.NewTicker(OutputFlushInterval) 172 | defer ticker.Stop() 173 | 174 | for { 175 | select { 176 | case <-dm.ctx.Done(): 177 | // Flush one last time before exiting 178 | dm.flushAllWriters() 179 | return 180 | case <-ticker.C: 181 | dm.flushAllWriters() 182 | } 183 | } 184 | } 185 | 186 | // flushAllWriters flushes all writers but doesn't close them 187 | func (dm *DownloadManager) flushAllWriters() { 188 | var flushCount int 189 | dm.outputMap.Range(func(key, value interface{}) bool { 190 | if value == nil { 191 | return true 192 | } 193 | 194 | lw, ok := value.(*lockedWriter) 195 | if !ok || lw == nil { 196 | log.Printf("Warning: Invalid writer type in map during flush for key %v", key) 197 | return true 198 | } 199 | 200 | // Use a short-term lock just for flushing 201 | func() { 202 | lw.mu.Lock() 203 | defer lw.mu.Unlock() 204 | if lw.writer != nil { 205 | if err := lw.writer.Flush(); err != nil { 206 | log.Printf("Warning: Error flushing writer for %s: %v", key.(string), err) 207 | } else { 208 | flushCount++ 209 | } 210 | } 211 | }() 212 | return true 213 | }) 214 | 215 | if flushCount > 0 { 216 | log.Printf("Flushed %d output files to disk", flushCount) 217 | } 218 | } 219 | 220 | // DownloadCertificates orchestrates the download process for the given logs. 
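// A typical call sequence is sketched below; the log selection (all known logs
// via ListCTLogs here) is up to the caller:
//
//	logs, err := ListCTLogs()
//	if err != nil {
//		return err
//	}
//	if err := dm.DownloadCertificates(logs); err != nil {
//		return err
//	}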
221 | func (dm *DownloadManager) DownloadCertificates(logsToProcess interface{}) error { 222 | // Convert the interface to the expected type 223 | logs, ok := logsToProcess.([]certlib.CTLogInfo) 224 | if !ok { 225 | return fmt.Errorf("invalid logs type: expected []certlib.CTLogInfo") 226 | } 227 | 228 | dm.stats.TotalLogs.Store(int64(len(logs))) 229 | log.Printf("Starting certificate download for %d logs...", len(logs)) 230 | 231 | // Create base output directory 232 | if err := os.MkdirAll(dm.config.OutputDir, 0755); err != nil { 233 | return fmt.Errorf("failed to create output directory '%s': %w", dm.config.OutputDir, err) 234 | } 235 | 236 | // Limit concurrent setup 237 | concurrencyLimit := runtime.NumCPU() 238 | if concurrencyLimit > MaxSetupConcurrency { 239 | concurrencyLimit = MaxSetupConcurrency 240 | } 241 | 242 | // Setup logs concurrently with limited concurrency 243 | var wg sync.WaitGroup 244 | setupSem := make(chan struct{}, concurrencyLimit) 245 | setupErrors := make(chan error, len(logs)) // Collect errors 246 | 247 | for i := range logs { 248 | select { 249 | case <-dm.ctx.Done(): 250 | log.Println("Download cancelled during log setup.") 251 | return ErrDownloadCancelled 252 | case setupSem <- struct{}{}: 253 | wg.Add(1) 254 | go func(logInfo certlib.CTLogInfo) { 255 | defer wg.Done() 256 | defer func() { <-setupSem }() 257 | 258 | if err := dm.processSingleLogForDownload(&logInfo); err != nil { 259 | if !errors.Is(err, ErrDownloadCancelled) { // Don't log cancellations 260 | log.Printf("Error processing log %s for download: %v", logInfo.URL, err) 261 | } 262 | dm.stats.FailedLogs.Add(1) 263 | setupErrors <- fmt.Errorf("log %s: %w", logInfo.URL, err) 264 | } else { 265 | dm.stats.ProcessedLogs.Add(1) 266 | } 267 | }(logs[i]) 268 | } 269 | } 270 | 271 | wg.Wait() // Wait for setup goroutines 272 | close(setupErrors) 273 | 274 | // Mark setup complete 275 | dm.setupComplete.Store(true) 276 | 277 | // Check for setup errors 278 | var setupErrorsList []error 279 | for err := range setupErrors { 280 | setupErrorsList = append(setupErrorsList, err) 281 | } 282 | 283 | // If all logs failed, return a combined error 284 | if len(setupErrorsList) == len(logs) { 285 | return fmt.Errorf("%w: all logs failed setup: %v", ErrLogSetupFailed, errors.Join(setupErrorsList...)) 286 | } 287 | 288 | if dm.ctx.Err() != nil { 289 | log.Println("Download cancelled after log setup phase.") 290 | dm.Shutdown() 291 | return ErrDownloadCancelled 292 | } 293 | 294 | totalLogSize := dm.stats.TotalEntries.Load() 295 | log.Printf("All download work submitted (%d entries). Waiting for scheduler...", totalLogSize) 296 | 297 | // Wait for all submitted download tasks 298 | dm.scheduler.Wait() 299 | 300 | // Check for cancellation during processing 301 | if dm.ctx.Err() != nil { 302 | log.Println("Download cancelled during processing phase.") 303 | dm.Shutdown() 304 | return ErrDownloadCancelled 305 | } 306 | 307 | // Check if we had complete success or partial success 308 | processedEntries := dm.stats.ProcessedEntries.Load() 309 | failedEntries := dm.stats.FailedEntries.Load() 310 | 311 | log.Printf("Download processing complete. Finalizing... 
(Success: %d, Failed: %d entries)", 312 | processedEntries, failedEntries) 313 | 314 | // Shutdown (this will flush and close all writers) 315 | dm.Shutdown() 316 | 317 | // Return error if there were significant failures 318 | if failedEntries > 0 && failedEntries >= processedEntries/10 { // More than 10% failure rate 319 | return fmt.Errorf("%w: %d of %d entries failed to download", 320 | ErrDownloadFailed, failedEntries, processedEntries+failedEntries) 321 | } 322 | 323 | retryRate := dm.stats.GetRetryRate() 324 | log.Printf("Certificate download finished successfully. Retry rate: %.2f%%", retryRate*100) 325 | return nil 326 | } 327 | 328 | // processSingleLogForDownload handles STH fetch, output setup, and work submission for one log. 329 | func (dm *DownloadManager) processSingleLogForDownload(ctlog *certlib.CTLogInfo) error { 330 | log.Printf("Setting up download for log: %s", ctlog.URL) 331 | 332 | // Fetch log info with a short timeout 333 | ctxWithTimeout, cancel := context.WithTimeout(dm.ctx, 30*time.Second) 334 | defer cancel() 335 | 336 | // Create a derived context for this specific log 337 | logCtx, logCancel := context.WithCancel(dm.ctx) 338 | defer func() { 339 | // If we exit with error, cancel any pending work for this log 340 | if logCtx.Err() == nil { 341 | logCancel() 342 | } 343 | }() 344 | 345 | // Get log info with timeout 346 | if err := certlib.GetLogInfo(ctlog); err != nil { 347 | return fmt.Errorf("failed to get log info for %s: %w", ctlog.URL, err) 348 | } 349 | 350 | // Check context before proceeding 351 | if ctxWithTimeout.Err() != nil { 352 | return ErrDownloadCancelled 353 | } 354 | 355 | treeSize := int64(ctlog.TreeSize) 356 | if treeSize == 0 { 357 | log.Printf("Skipping log %s: tree size is 0", ctlog.URL) 358 | return nil 359 | } 360 | 361 | blockSize := int64(ctlog.BlockSize) 362 | if blockSize <= 0 { 363 | blockSize = DefaultLogEntryBlockSize 364 | } 365 | 366 | // Setup Output Writer 367 | filename := fmt.Sprintf("%s_certs.csv", util.SanitizeFilename(ctlog.URL)) 368 | if dm.config.CompressOutput { 369 | filename += ".gz" 370 | } 371 | filePath := filepath.Join(dm.config.OutputDir, filename) 372 | 373 | // Create output file with temp name, then rename when complete to avoid partial files 374 | tempFilePath := filePath + ".tmp" 375 | file, err := os.Create(tempFilePath) 376 | if err != nil { 377 | return fmt.Errorf("failed to create output file %s: %w", tempFilePath, err) 378 | } 379 | 380 | var writer *bufio.Writer 381 | var gzWriter *gzip.Writer 382 | 383 | if dm.config.CompressOutput { 384 | gzWriter, _ = gzip.NewWriterLevel(file, gzip.BestSpeed) 385 | writer = bufio.NewWriterSize(gzWriter, dm.config.BufferSize) 386 | } else { 387 | writer = bufio.NewWriterSize(file, dm.config.BufferSize) 388 | } 389 | 390 | // Write header: offset,leaf_input_b64,extra_data_b64 391 | headerLine := "offset,leaf_input_b64,extra_data_b64\n" 392 | _, err = writer.WriteString(headerLine) 393 | if err != nil { 394 | file.Close() 395 | return fmt.Errorf("failed to write header to %s: %w", tempFilePath, err) 396 | } 397 | 398 | // Store the locked writer instance 399 | lw := &lockedWriter{ 400 | writer: writer, 401 | file: file, 402 | filePath: tempFilePath, 403 | finalPath: filePath, 404 | } 405 | // Only set gzWriter if compression is enabled to avoid nil interface issues 406 | if dm.config.CompressOutput && gzWriter != nil { 407 | lw.gzWriter = gzWriter 408 | } 409 | dm.outputMap.Store(ctlog.URL, lw) 410 | 411 | // Submit Work Blocks in chunks for more even distribution 
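// numBlocks is a ceiling division: e.g. treeSize=1000 with blockSize=256 gives
// 4 blocks covering entries 0-255, 256-511, 512-767 and 768-999. Blocks are then
// submitted batchSize (100) at a time, with a short pause between batches, so a
// single large log does not flood the scheduler queues all at once.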
412 | numBlocks := (treeSize + blockSize - 1) / blockSize 413 | log.Printf("Log %s: TreeSize=%d, BlockSize=%d, NumBlocks=%d (Download)", 414 | ctlog.URL, treeSize, blockSize, numBlocks) 415 | 416 | // Track total entries 417 | dm.stats.TotalEntries.Add(treeSize) 418 | 419 | // Distribution strategy - submit in batches to allow better parallelism 420 | var submittedBlocks, droppedBlocks int64 421 | 422 | for i := int64(0); i < numBlocks; i += batchSize { 423 | // Check for context cancellation between batches 424 | if dm.ctx.Err() != nil { 425 | return ErrDownloadCancelled 426 | } 427 | 428 | end := i + batchSize 429 | if end > numBlocks { 430 | end = numBlocks 431 | } 432 | 433 | // Submit blocks in this batch 434 | for j := i; j < end; j++ { 435 | if dm.ctx.Err() != nil { 436 | return ErrDownloadCancelled 437 | } 438 | 439 | start := j * blockSize 440 | endEntry := start + blockSize - 1 441 | if endEntry >= treeSize { 442 | endEntry = treeSize - 1 443 | } 444 | 445 | // Use log-specific context for the work item 446 | err := dm.submitDownloadBlock(logCtx, ctlog, start, endEntry) 447 | if err != nil { 448 | if errors.Is(err, ErrQueueFull) { 449 | // Adjust total entries for dropped blocks 450 | entriesInBlock := endEntry - start + 1 451 | dm.stats.TotalEntries.Add(-entriesInBlock) 452 | droppedBlocks++ 453 | } else if errors.Is(err, ErrDownloadCancelled) { 454 | return err 455 | } else { 456 | log.Printf("Error submitting block %d-%d for %s: %v", 457 | start, endEntry, ctlog.URL, err) 458 | } 459 | } else { 460 | submittedBlocks++ 461 | } 462 | } 463 | 464 | // Small sleep between batches to avoid overwhelming scheduler 465 | if end < numBlocks { 466 | time.Sleep(250 * time.Millisecond) 467 | } 468 | } 469 | 470 | // Report submission stats 471 | if droppedBlocks > 0 { 472 | log.Printf("Log %s: Submitted %d blocks, dropped %d blocks due to backpressure", 473 | ctlog.URL, submittedBlocks, droppedBlocks) 474 | } else { 475 | log.Printf("Successfully submitted all %d download blocks for %s", 476 | submittedBlocks, ctlog.URL) 477 | } 478 | 479 | return nil 480 | } 481 | 482 | // submitDownloadBlock attempts to submit a work block with retries 483 | func (dm *DownloadManager) submitDownloadBlock(ctx context.Context, ctlog *certlib.CTLogInfo, start, end int64) error { 484 | // Determine target worker based on log URL (consistent sharding) 485 | hash := xxh3.HashString(ctlog.URL) 486 | shardIndex := int(hash % uint64(dm.scheduler.numWorkers)) 487 | targetWorker := dm.scheduler.workers[shardIndex] 488 | 489 | // Wait on rate limiter 490 | waitStart := time.Now() 491 | if err := targetWorker.limiter.Wait(ctx); err != nil { 492 | if errors.Is(err, context.Canceled) { 493 | return ErrDownloadCancelled 494 | } 495 | return fmt.Errorf("rate limiter wait failed: %w", err) 496 | } 497 | 498 | waitDuration := time.Since(waitStart) 499 | if waitDuration > 100*time.Millisecond { 500 | log.Printf("Worker %d rate limit caused %v wait for log %s (%d-%d), limit: %.2f req/s", 501 | targetWorker.id, waitDuration, ctlog.URL, start, end, 502 | float64(targetWorker.limiter.Limit())) 503 | } 504 | 505 | // Attempt submission with retry for transient full queue 506 | maxRetries := MaxSubmitRetries 507 | retryDelay := 1000 * time.Millisecond 508 | 509 | for attempt := 0; attempt < maxRetries; attempt++ { 510 | if ctx.Err() != nil { 511 | return ErrDownloadCancelled 512 | } 513 | 514 | err := dm.scheduler.SubmitWork(ctx, ctlog, start, end, dm.downloadCallback) 515 | if err == nil { 516 | return nil // Success 517 | } 
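// At this point err is non-nil; the classification below treats a full queue as
// transient (back off with jitter and retry) and anything else as permanent.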
518 | 519 | // Handle specific error types 520 | if errors.Is(err, ErrQueueFull) || strings.Contains(err.Error(), "queue full") { 521 | // Exponential backoff with jitter 522 | jitter := time.Duration(float64(retryDelay) * (0.5 + rand.Float64())) 523 | select { 524 | case <-time.After(jitter): 525 | retryDelay = retryDelay * 2 526 | if retryDelay > 500*time.Millisecond { 527 | retryDelay = 500 * time.Millisecond 528 | } 529 | continue // Retry submission 530 | case <-ctx.Done(): 531 | return ErrDownloadCancelled 532 | } 533 | } 534 | 535 | // Non-retriable error 536 | log.Printf("Permanent error submitting download work for %s (%d-%d): %v", 537 | ctlog.URL, start, end, err) 538 | return err 539 | } 540 | 541 | // All retries exhausted 542 | log.Printf("Dropped download block %s (%d-%d) after %d retries (queue full).", 543 | ctlog.URL, start, end, maxRetries) 544 | return ErrQueueFull 545 | } 546 | 547 | // downloadCallback fetches entries and writes raw data to the output file. 548 | // It's called by the worker for each block to be downloaded. 549 | func (dm *DownloadManager) downloadCallback(item *WorkItem) error { 550 | logInfo := item.LogInfo 551 | if logInfo == nil { 552 | return fmt.Errorf("internal error: WorkItem missing LogInfo (download)") 553 | } 554 | 555 | // Extract context from the work item 556 | ctx := item.Ctx 557 | if ctx == nil { 558 | ctx = context.Background() 559 | } 560 | 561 | // Track retries 562 | isRetry := item.Attempt > 0 563 | if isRetry { 564 | dm.stats.RetryCount.Add(1) 565 | } 566 | 567 | // Download entries with retry logic already in certlib.DownloadEntries 568 | downloadStart := time.Now() 569 | entriesResponse, err := certlib.DownloadEntries(ctx, logInfo, int(item.Start), int(item.End)) 570 | downloadDuration := time.Since(downloadStart) 571 | 572 | if err != nil { 573 | dm.stats.FailedEntries.Add(item.End - item.Start + 1) 574 | 575 | // Log different error levels based on context 576 | if errors.Is(err, context.Canceled) { 577 | // This is expected during shutdown, don't log as error 578 | return err 579 | } 580 | 581 | return fmt.Errorf("failed to download entries %d-%d for %s (attempt %d): %w", 582 | item.Start, item.End, item.LogURL, item.Attempt+1, err) 583 | } 584 | 585 | // Get the locked writer for this log 586 | writerUntyped, ok := dm.outputMap.Load(item.LogURL) 587 | if !ok { 588 | dm.stats.FailedEntries.Add(int64(len(entriesResponse.Entries))) 589 | return fmt.Errorf("output writer not found for log %s (download)", item.LogURL) 590 | } 591 | 592 | lw, ok := writerUntyped.(*lockedWriter) 593 | if !ok || lw == nil { 594 | return fmt.Errorf("invalid writer type found in map for log %s (download)", item.LogURL) 595 | } 596 | 597 | // Process entries in batches to minimize lock contention 598 | entriesCount := len(entriesResponse.Entries) 599 | 600 | // Get a string builder from the pool 601 | sbInterface := dm.stringPool.Get() 602 | sb := sbInterface.(*strings.Builder) 603 | sb.Reset() 604 | sb.Grow(entriesCount * 512) // Pre-allocate approximate space 605 | 606 | // Build output in memory first 607 | for i, entry := range entriesResponse.Entries { 608 | certIndex := item.Start + int64(i) 609 | fmt.Fprintf(sb, "%d,%s,%s\n", certIndex, entry.LeafInput, entry.ExtraData) 610 | } 611 | 612 | // Get the built string 613 | outputData := sb.String() 614 | 615 | // Reset and return the builder to the pool 616 | sb.Reset() 617 | dm.stringPool.Put(sb) 618 | 619 | // Lock once for the entire write 620 | lw.mu.Lock() 621 | n, err := 
lw.writer.WriteString(outputData) 622 | lw.mu.Unlock() 623 | 624 | if err != nil { 625 | dm.stats.FailedEntries.Add(int64(entriesCount)) 626 | return fmt.Errorf("error writing to output buffer for %s: %w", item.LogURL, err) 627 | } 628 | 629 | // Update stats 630 | dm.stats.ProcessedEntries.Add(int64(entriesCount)) 631 | dm.stats.OutputBytesWritten.Add(int64(n)) 632 | 633 | // Track first-attempt success 634 | if !isRetry { 635 | dm.stats.SuccessFirstTry.Add(1) 636 | } 637 | 638 | // Performance logging for slow blocks 639 | if downloadDuration > 2*time.Second { 640 | entriesPerSec := float64(entriesCount) / downloadDuration.Seconds() 641 | log.Printf("Slow download: %s (%d-%d): %.2f entries/sec, %d bytes written", 642 | item.LogURL, item.Start, item.End, entriesPerSec, n) 643 | } 644 | 645 | return nil 646 | } 647 | 648 | // Shutdown gracefully closes resources. 649 | func (dm *DownloadManager) Shutdown() { 650 | if dm.ctx.Err() != nil { 651 | // Already shut down 652 | return 653 | } 654 | 655 | log.Println("Shutting down Download Manager...") 656 | dm.cancel() // Cancel context 657 | 658 | // Shutdown scheduler (this will wait for worker queues to empty) 659 | if dm.scheduler != nil { 660 | dm.scheduler.Shutdown() 661 | } 662 | 663 | log.Println("Flushing and closing download writers...") 664 | 665 | // Close and rename all writers 666 | var successCount, errorCount int 667 | 668 | dm.outputMap.Range(func(key, value interface{}) bool { 669 | if value == nil { 670 | return true 671 | } 672 | 673 | lw, ok := value.(*lockedWriter) 674 | if !ok || lw == nil { 675 | log.Printf("Warning: Invalid writer type in map during download shutdown for key %v", key) 676 | return true 677 | } 678 | 679 | // Lock, flush, close and rename 680 | func() { 681 | lw.mu.Lock() 682 | defer lw.mu.Unlock() 683 | 684 | closeErr := false 685 | 686 | // Flush buffers 687 | if lw.writer != nil { 688 | if err := lw.writer.Flush(); err != nil { 689 | log.Printf("Error flushing download writer for %s: %v", key.(string), err) 690 | closeErr = true 691 | } 692 | } 693 | 694 | // Close gzip writer if present 695 | if lw.gzWriter != nil { 696 | if err := lw.gzWriter.Close(); err != nil { 697 | log.Printf("Error closing gzip download writer for %s: %v", key.(string), err) 698 | closeErr = true 699 | } 700 | } 701 | 702 | // Close file 703 | if lw.file != nil { 704 | if err := lw.file.Close(); err != nil { 705 | log.Printf("Error closing file for download %s: %v", key.(string), err) 706 | closeErr = true 707 | } 708 | } 709 | 710 | // Rename temp file to final name if we're fully set up 711 | if dm.setupComplete.Load() && !closeErr && lw.filePath != "" && lw.finalPath != "" { 712 | if err := os.Rename(lw.filePath, lw.finalPath); err != nil { 713 | log.Printf("Error renaming temp file %s to %s: %v", 714 | lw.filePath, lw.finalPath, err) 715 | errorCount++ 716 | } else { 717 | successCount++ 718 | } 719 | } else if closeErr { 720 | errorCount++ 721 | } 722 | }() 723 | return true 724 | }) 725 | 726 | log.Printf("Download Manager shutdown complete. Finalized %d files with %d errors.", 727 | successCount, errorCount) 728 | } 729 | 730 | // GetStats returns the current statistics. 
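// A polling sketch (the interval mirrors StatsReportInterval and is illustrative):
//
//	stats := dm.GetStats()
//	for range time.Tick(10 * time.Second) {
//		log.Printf("entries ok=%d failed=%d bytes=%d",
//			stats.GetProcessedEntries(), stats.GetFailedEntries(), stats.GetOutputBytesWritten())
//	}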
731 | func (dm *DownloadManager) GetStats() *DownloadStats { 732 | return dm.stats 733 | } 734 | -------------------------------------------------------------------------------- /internal/core/error.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | */ 5 | package core 6 | 7 | // customError is an error type that includes a retryable flag. 8 | // This allows components to determine if an operation that resulted in this error 9 | // should be retried. 10 | // It implements the standard `error` interface. 11 | type customError struct { 12 | message string // The error message. 13 | retryable bool // True if the error indicates a condition that might be resolved by retrying. 14 | } 15 | 16 | // NewError creates a new customError with the given message and retryable status. 17 | // 18 | // Parameters: 19 | // msg: The textual description of the error. 20 | // retryable: A boolean indicating if the error condition is potentially transient 21 | // and the operation could succeed on a subsequent attempt. 22 | // 23 | // Returns: 24 | // An error of type *customError. 25 | func NewError(msg string, retryable bool) error { 26 | return &customError{ 27 | message: msg, 28 | retryable: retryable, 29 | } 30 | } 31 | 32 | // Error implements the standard Go `error` interface. 33 | // It returns the textual message associated with the customError. 34 | func (e *customError) Error() string { 35 | return e.message 36 | } 37 | 38 | // IsRetryable returns true if the error is designated as retryable, false otherwise. 39 | // This method allows consuming code to check the retryable nature of the error 40 | // without needing to type-assert to the concrete `customError` type if they 41 | // are working with a standard `error` interface variable. 42 | func (e *customError) IsRetryable() bool { 43 | return e.retryable 44 | } 45 | 46 | // IsRetryable is a helper function to check if a given error is of type *customError 47 | // and if its retryable flag is set. 48 | // If the error is nil, it returns false. 49 | // If the error is not a *customError, it defaults to false (non-retryable). 50 | // 51 | // Parameters: 52 | // err: The error to check. 53 | // 54 | // Returns: 55 | // True if the error is a retryable *customError, false otherwise. 56 | func IsRetryable(err error) bool { 57 | if err == nil { 58 | return false 59 | } 60 | 61 | // Type assert to *customError to access the IsRetryable method. 62 | if e, ok := err.(*customError); ok { 63 | return e.IsRetryable() 64 | } 65 | 66 | // If not a *customError, assume not retryable by default for unknown error types. 67 | return false 68 | } 69 | 70 | // Common error constants used within the core package. 71 | // These provide standardized error values for frequent conditions like full queues 72 | // or worker shutdowns, facilitating consistent error handling and checking. 73 | var ( 74 | // ErrQueueFull indicates that a worker's queue is at capacity and cannot accept new work items. 75 | // This error is typically considered retryable, as the queue might free up later. 76 | ErrQueueFull = NewError("queue full", true) 77 | // ErrWorkerShutdown indicates that a worker or the scheduler is in the process of shutting down 78 | // and can no longer process new work items. 
This is generally not a retryable error 79 | // in the context of the current operation, as the component is terminating. 80 | ErrWorkerShutdown = NewError("worker shutdown", false) 81 | ) 82 | -------------------------------------------------------------------------------- /internal/core/list.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | */ 5 | package core 6 | 7 | /* 8 | rxtls — fast tool in Go for working with Certificate Transparency logs 9 | Copyright (C) 2025 Pepijn van der Stap 10 | 11 | This program is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU Affero General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | This program is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU Affero General Public License for more details. 20 | 21 | You should have received a copy of the GNU Affero General Public License 22 | along with this program. If not, see . 23 | */ 24 | 25 | import ( 26 | "fmt" 27 | 28 | "github.com/x-stp/rxtls/internal/certlib" 29 | ) 30 | 31 | // ListCTLogs retrieves the list of available Certificate Transparency (CT) logs. 32 | // It serves as a simple wrapper around `certlib.GetCTLogs`, which encapsulates the logic 33 | // for fetching the log list either from a remote source (e.g., Google's JSON list) or 34 | // from a local file cache, depending on the `certlib.UseLocalLogs` global setting. 35 | // 36 | // This function is primarily used by command-line interface (CLI) commands that need to 37 | // display available logs or allow users to select logs for processing. 38 | // 39 | // Performance Note: This function itself does not perform detailed STH (Signed Tree Head) 40 | // fetching for each log to determine its size or state, as that would be too slow for 41 | // a simple listing operation. The `certlib.GetCTLogs` function focuses on retrieving the 42 | // basic log metadata (URL, description, operator). 43 | // 44 | // Returns: 45 | // - A slice of `certlib.CTLogInfo` structs, each representing a known CT log. 46 | // - An error if retrieving or parsing the log list fails. 47 | func ListCTLogs() ([]certlib.CTLogInfo, error) { 48 | ctlogs, err := certlib.GetCTLogs() 49 | if err != nil { 50 | // Wrap the error from certlib to provide more context. 51 | return nil, fmt.Errorf("error retrieving CT logs list: %w", err) 52 | } 53 | return ctlogs, nil 54 | } 55 | -------------------------------------------------------------------------------- /internal/core/ratelimiter.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | */ 5 | package core 6 | 7 | import ( 8 | "math" 9 | "sync/atomic" 10 | "time" 11 | ) 12 | 13 | // Rate limiting constants defining the behavior of the adaptive rate limiter. 14 | const ( 15 | // MinRate is the minimum allowed rate in requests per second (RPS). 
16 | // The rate limiter will not decrease the rate below this value. 17 | MinRate = 2.0 18 | // MaxRate is the maximum allowed rate in requests per second (RPS). 19 | // The rate limiter will not increase the rate above this value. 20 | MaxRate = 1000.0 21 | // RateIncreaseStep is the additive amount by which the rate is increased upon a successful operation. 22 | RateIncreaseStep = 20.0 23 | // RateDecreaseStep is the subtractive amount by which the rate is decreased upon a failed operation 24 | // or when backpressure is detected. 25 | RateDecreaseStep = 50.0 26 | ) 27 | 28 | // RateLimiter implements a simple adaptive rate limiting mechanism. 29 | // It adjusts the rate based on success/failure of operations and can respond to backpressure signals. 30 | // The current rate is stored as a float64 manipulated via atomic operations on its uint64 bit representation 31 | // to ensure thread-safe updates without locks for `getRate` and `setRate` hot paths. 32 | // 33 | // This rate limiter is a basic token bucket variant where tokens are implicitly refilled based on elapsed time 34 | // and the current rate. 35 | // 36 | // Concurrency: The `currentRate` is accessed atomically. `successCount`, `failureCount`, 37 | // and `backpressure` are also atomic, making most operations non-blocking. 38 | // `lastAdjustment` is not atomic but primarily used for calculating elapsed time in `Allow`, 39 | // where its exact precision is less critical than overall rate control. 40 | type RateLimiter struct { 41 | // currentRate stores the bit representation of the current float64 rate limit. 42 | // This allows for atomic load/store of the rate. 43 | currentRate uint64 44 | // successCount tracks the number of successful operations recorded. 45 | successCount atomic.Uint64 46 | // failureCount tracks the number of failed operations recorded. 47 | failureCount atomic.Uint64 48 | // lastAdjustment records the time of the last `Allow` call that consumed a token. 49 | // It's used to calculate token replenishment. 50 | lastAdjustment time.Time 51 | // backpressure, if true, forces the Allow method to return false, effectively halting 52 | // operations. This can be triggered externally (e.g., by a full queue). 53 | backpressure atomic.Bool 54 | } 55 | 56 | // NewRateLimiter creates a new RateLimiter instance with the specified initial rate. 57 | // 58 | // Parameters: 59 | // 60 | // initialRate: The starting rate limit in requests per second (RPS). 61 | // 62 | // Returns: 63 | // 64 | // A pointer to the newly created RateLimiter. 65 | func NewRateLimiter(initialRate float64) *RateLimiter { 66 | rl := &RateLimiter{ 67 | lastAdjustment: time.Now(), // Initialize lastAdjustment to current time. 68 | } 69 | rl.setRate(initialRate) // Set the initial rate atomically. 70 | return rl 71 | } 72 | 73 | // Allow determines if an operation should be permitted based on the current rate limit. 74 | // It implements a simple token bucket logic: tokens are replenished over time based on `currentRate`. 75 | // If backpressure is active, Allow will always return false. 76 | // If the rate is zero or negative, Allow will also return false. 77 | // 78 | // Returns: 79 | // 80 | // True if the operation is allowed, false otherwise. 81 | // 82 | // Hot Path: This method is expected to be called frequently and should be highly performant. 83 | // It primarily involves atomic reads and time calculations. 
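// Worked example (illustrative, following directly from the token arithmetic below):
// with a current rate of 2 RPS, tokens accrue at 2 per second, so a call made
// 400ms after the last allowed call sees 0.8 tokens and is rejected, while a call
// made 600ms after it sees 1.2 tokens and is allowed.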
84 | func (rl *RateLimiter) Allow() bool { 85 | if rl.backpressure.Load() { 86 | return false // Backpressure is active, disallow operation. 87 | } 88 | 89 | rate := rl.getRate() 90 | if rate <= 0 { 91 | return false // Rate is zero or negative, no operations allowed. 92 | } 93 | 94 | // Simple token bucket: Calculate tokens accrued since last allowed operation. 95 | now := time.Now() 96 | elapsed := now.Sub(rl.lastAdjustment).Seconds() // Time since last token consumption. 97 | tokens := elapsed * rate // Tokens generated during elapsed time. 98 | 99 | if tokens >= 1.0 { 100 | rl.lastAdjustment = now // Consume one token by updating lastAdjustment. 101 | return true // Enough tokens, allow operation. 102 | } 103 | 104 | return false // Not enough tokens. 105 | } 106 | 107 | // RecordSuccess is called to indicate that an operation controlled by this rate limiter was successful. 108 | // It increments the success counter and may trigger an increase in the rate limit. 109 | func (rl *RateLimiter) RecordSuccess() { 110 | rl.successCount.Add(1) 111 | rl.adjustRate(true) // Attempt to increase rate. 112 | } 113 | 114 | // RecordFailure is called to indicate that an operation controlled by this rate limiter failed. 115 | // It increments the failure counter and may trigger a decrease in the rate limit. 116 | func (rl *RateLimiter) RecordFailure() { 117 | rl.failureCount.Add(1) 118 | rl.adjustRate(false) // Attempt to decrease rate. 119 | } 120 | 121 | // UpdateBackpressure sets the backpressure state of the rate limiter. 122 | // If `hasBackpressure` is true, the `Allow` method will subsequently return false until 123 | // backpressure is cleared by calling UpdateBackpressure(false). 124 | // This provides a mechanism for external components (e.g., a queue monitor) to signal the 125 | // rate limiter to pause operations. 126 | func (rl *RateLimiter) UpdateBackpressure(hasBackpressure bool) { 127 | rl.backpressure.Store(hasBackpressure) 128 | } 129 | 130 | // GetCurrentRate returns the current effective rate limit in requests per second. 131 | func (rl *RateLimiter) GetCurrentRate() float64 { 132 | return rl.getRate() 133 | } 134 | 135 | // adjustRate dynamically modifies the rate limit based on the success or failure of an operation. 136 | // If `success` is true, it attempts to increase the rate by `RateIncreaseStep`, 137 | // capped at `MaxRate`. 138 | // If `success` is false, it attempts to decrease the rate by `RateDecreaseStep`, 139 | // floored at `MinRate`. 140 | // 141 | // This method is called internally by RecordSuccess and RecordFailure. 142 | func (rl *RateLimiter) adjustRate(success bool) { 143 | current := rl.getRate() 144 | var newRate float64 145 | 146 | if success { 147 | newRate = current + RateIncreaseStep 148 | if newRate > MaxRate { 149 | newRate = MaxRate // Cap at maximum allowed rate. 150 | } 151 | } else { 152 | newRate = current - RateDecreaseStep 153 | if newRate < MinRate { 154 | newRate = MinRate // Floor at minimum allowed rate. 155 | } 156 | } 157 | 158 | rl.setRate(newRate) // Atomically update the rate. 159 | } 160 | 161 | // GetStats returns a map containing current statistics of the rate limiter. 162 | // This is useful for monitoring and debugging the rate limiter's behavior. 163 | // The returned map includes the current rate, total success/failure counts, 164 | // backpressure state, and the timestamp of the last rate adjustment (token consumption). 
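// Note: lastAdjustment is read here without synchronization (it is not atomic),
// so the reported timestamp is a best-effort snapshot when Allow is being called
// concurrently from other goroutines.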
165 | func (rl *RateLimiter) GetStats() map[string]interface{} { 166 | return map[string]interface{}{ 167 | "current_rate": rl.getRate(), 168 | "success_count": rl.successCount.Load(), 169 | "failure_count": rl.failureCount.Load(), 170 | "backpressure": rl.backpressure.Load(), 171 | "last_adjustment": rl.lastAdjustment, 172 | } 173 | } 174 | 175 | // Reset reinitializes the rate limiter to a given initial rate and clears its statistics. 176 | // Success/failure counts are reset, backpressure is turned off, and lastAdjustment is set to now. 177 | // 178 | // Parameters: 179 | // 180 | // initialRate: The new initial rate limit in requests per second (RPS). 181 | func (rl *RateLimiter) Reset(initialRate float64) { 182 | rl.setRate(initialRate) 183 | rl.successCount.Store(0) 184 | rl.failureCount.Store(0) 185 | rl.backpressure.Store(false) 186 | rl.lastAdjustment = time.Now() 187 | } 188 | 189 | // getRate atomically retrieves the current rate limit as a float64. 190 | // It reads the uint64 bits and converts them to a float64. 191 | func (rl *RateLimiter) getRate() float64 { 192 | bits := atomic.LoadUint64(&rl.currentRate) 193 | return math.Float64frombits(bits) 194 | } 195 | 196 | // setRate atomically sets the current rate limit. 197 | // It converts the float64 rate to its uint64 bit representation for atomic storage. 198 | func (rl *RateLimiter) setRate(rate float64) { 199 | bits := math.Float64bits(rate) 200 | atomic.StoreUint64(&rl.currentRate, bits) 201 | } 202 | -------------------------------------------------------------------------------- /internal/core/scheduler_stub.go: -------------------------------------------------------------------------------- 1 | //go:build !linux 2 | // +build !linux 3 | 4 | /* 5 | rxtls — fast tool in Go for working with Certificate Transparency logs 6 | Copyright (C) 2025 Pepijn van der Stap 7 | 8 | This program is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU Affero General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | This program is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU Affero General Public License for more details. 17 | 18 | You should have received a copy of the GNU Affero General Public License 19 | along with this program. If not, see . 20 | */ 21 | 22 | // This file provides a stub implementation of the scheduler for non-Linux platforms 23 | // where CPU affinity setting is not available or not implemented via x/sys/unix. 24 | 25 | package core 26 | 27 | import ( 28 | "context" 29 | "fmt" 30 | "log" 31 | "runtime" 32 | "sync" 33 | "sync/atomic" 34 | "time" 35 | 36 | "github.com/x-stp/rxtls/internal/certlib" 37 | 38 | "github.com/zeebo/xxh3" // Consistent hashing 39 | "golang.org/x/time/rate" 40 | ) 41 | 42 | // Scheduler definition MUST be identical across builds. 43 | // Manages workers and dispatch, but without affinity. 44 | type Scheduler struct { 45 | numWorkers int 46 | workers []*Worker 47 | ctx context.Context 48 | cancel context.CancelFunc 49 | shutdown atomic.Bool 50 | workItemPool sync.Pool 51 | activeWork sync.WaitGroup // Tracks active work 52 | } 53 | 54 | // Worker definition MUST be identical, cpuAffinity field is present but unused. 
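// Keeping the field set and layout identical across build tags means code shared
// between the Linux and non-Linux builds compiles against the same type.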
55 | type Worker struct { 56 | // Immutable fields 57 | id int 58 | ctx context.Context 59 | cancel context.CancelFunc 60 | scheduler *Scheduler 61 | queue chan *WorkItem 62 | limiter *rate.Limiter 63 | cpuAffinity int 64 | 65 | // Metrics 66 | processed atomic.Int64 67 | errors atomic.Int64 68 | panics atomic.Int64 69 | busy atomic.Bool 70 | lastActive atomic.Int64 71 | } 72 | 73 | // NewScheduler creates and starts the scheduler (stub version without affinity). 74 | // Operation: Blocking (at startup), allocates worker/channel resources. 75 | func NewScheduler(parentCtx context.Context) (*Scheduler, error) { 76 | numWorkers := runtime.NumCPU() * WorkerMultiplier 77 | if numWorkers <= 0 { 78 | numWorkers = 1 79 | } 80 | 81 | sctx, cancel := context.WithCancel(parentCtx) 82 | 83 | s := &Scheduler{ 84 | numWorkers: numWorkers, 85 | workers: make([]*Worker, numWorkers), 86 | ctx: sctx, 87 | cancel: cancel, 88 | workItemPool: sync.Pool{ 89 | New: func() interface{} { 90 | return &WorkItem{ 91 | CreatedAt: time.Now(), 92 | } 93 | }, 94 | }, 95 | } 96 | 97 | initialRate := rate.Limit(1000) 98 | burstSize := MaxShardQueueSize 99 | 100 | for i := 0; i < numWorkers; i++ { 101 | w := &Worker{ 102 | id: i, 103 | cpuAffinity: -1, // Mark as unused 104 | queue: make(chan *WorkItem, MaxShardQueueSize), 105 | scheduler: s, 106 | ctx: sctx, 107 | limiter: rate.NewLimiter(initialRate, burstSize), // Init limiter 108 | } 109 | s.workers[i] = w 110 | go w.run() // Start the worker goroutine 111 | } 112 | 113 | fmt.Printf("Scheduler initialized with %d workers (CPU affinity disabled).\n", numWorkers) 114 | return s, nil 115 | } 116 | 117 | // run is the main loop for a worker goroutine (stub version without affinity setup). 118 | // Hot Path: Yes. Must be zero-GC, non-blocking (except on queue read). 119 | func (w *Worker) run() { 120 | // No LockOSThread or affinity setting needed/possible on non-Linux. 121 | for { 122 | select { 123 | case <-w.ctx.Done(): 124 | return 125 | case item := <-w.queue: 126 | if item == nil { 127 | continue 128 | } 129 | 130 | // Mark work as done when the callback finishes or panics 131 | func() { 132 | defer w.scheduler.activeWork.Done() // Signal completion via WaitGroup 133 | defer func() { 134 | if r := recover(); r != nil { 135 | log.Printf("Panic recovered in worker %d processing item for %s (%d-%d): %v", w.id, item.LogURL, item.Start, item.End, r) 136 | // TODO: Increment failure counter 137 | } 138 | }() 139 | 140 | err := item.Callback(item) 141 | if err != nil { 142 | // Basic error logging. 143 | // TODO: Implement retry mechanism using item.Attempt. 144 | fmt.Printf("Error processing item for %s (%d-%d): %v\n", item.LogURL, item.Start, item.End, err) 145 | } 146 | }() 147 | 148 | // Return item to pool, resetting fields. 149 | item.Callback = nil 150 | item.LogURL = "" 151 | item.LogInfo = nil 152 | item.Ctx = nil 153 | item.Error = nil 154 | w.scheduler.workItemPool.Put(item) 155 | } 156 | } 157 | } 158 | 159 | // setAffinity is a no-op stub on non-Linux platforms. 160 | func setAffinity(workerID, cpuID int) { 161 | // Affinity not supported/implemented on this OS. 162 | } 163 | 164 | // SubmitWork definition MUST be identical across builds. 165 | // Hot Path: Yes. Non-blocking, low allocation. 
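// Routing note: work for a given log URL is always hashed (xxh3) to the same
// worker index (hash % numWorkers), so entries for one log are handled in
// submission order by a single worker's queue.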
166 | func (s *Scheduler) SubmitWork(ctx context.Context, logInfo *certlib.CTLogInfo, start, end int64, callback WorkCallback) error { 167 | if s.shutdown.Load() { 168 | return fmt.Errorf("scheduler is shutting down") 169 | } 170 | 171 | logURL := logInfo.URL 172 | hash := xxh3.HashString(logURL) 173 | shardIndex := int(hash % uint64(s.numWorkers)) 174 | targetWorker := s.workers[shardIndex] 175 | 176 | // NOTE: Rate limiting handled by caller 177 | 178 | item := s.workItemPool.Get().(*WorkItem) 179 | item.LogURL = logURL 180 | item.LogInfo = logInfo 181 | item.Start = start 182 | item.End = end 183 | item.Attempt = 0 184 | item.Callback = callback 185 | item.Ctx = ctx 186 | item.CreatedAt = time.Now() 187 | s.activeWork.Add(1) 188 | 189 | select { 190 | case targetWorker.queue <- item: 191 | // Optional: Increase rate limit on success 192 | return nil 193 | default: 194 | // Backpressure: Queue full. 195 | s.activeWork.Done() 196 | s.workItemPool.Put(item) 197 | // Optional: Decrease rate limit 198 | return fmt.Errorf("worker %d for log %s: %w", targetWorker.id, logURL, ErrQueueFull) 199 | } 200 | } 201 | 202 | // Wait definition MUST be identical across builds. 203 | func (s *Scheduler) Wait() { 204 | s.activeWork.Wait() 205 | } 206 | 207 | // Shutdown definition MUST be identical across builds. 208 | func (s *Scheduler) Shutdown() { 209 | s.shutdown.Store(true) 210 | s.cancel() 211 | s.Wait() 212 | } 213 | -------------------------------------------------------------------------------- /internal/core/work.go: -------------------------------------------------------------------------------- 1 | // Package core provides the central logic for rxtls, including the scheduler, download manager, 2 | // and domain extractor. It defines common data structures and constants used across these components. 3 | package core 4 | 5 | import ( 6 | "context" 7 | "time" 8 | ) 9 | 10 | // Work defines an interface for a unit of work that can be processed. 11 | // This interface allows for different types of tasks to be handled by a generic 12 | // processing system (like a scheduler or worker pool) as long as they conform to this contract. 13 | // 14 | // Implementations of Work should encapsulate all necessary data and logic for their execution. 15 | type Work interface { 16 | // Process executes the primary logic of the work unit. 17 | // It takes a context that can be used for cancellation or deadlines. 18 | // An error is returned if processing fails. 19 | Process(ctx context.Context) error 20 | // GetID returns a unique identifier for this work unit. 21 | // This ID can be used for logging, tracking, or sharding purposes. 22 | GetID() string 23 | // GetCreatedAt returns the timestamp when this work unit was created. 24 | // This can be useful for metrics, priority queuing, or staleness checks. 25 | GetCreatedAt() time.Time 26 | } 27 | 28 | // Task is a concrete implementation of the Work interface. 29 | // It provides a flexible way to define a work unit by associating arbitrary data 30 | // with a specific processing function (ProcessFn). 31 | // 32 | // Fields: 33 | // 34 | // ID: A string identifier for the task. 35 | // CreatedAt: The time the task was created. 36 | // Data: An interface{} to hold any data required by the ProcessFn. 37 | // ProcessFn: The function that encapsulates the actual processing logic for this task. 
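// Example (illustrative; payload and ctx are placeholders):
//
//	t := NewTask("example-id", payload, func(ctx context.Context, data interface{}) error {
//		// handle data here
//		return nil
//	})
//	err := t.Process(ctx)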
38 | type Task struct { 39 | ID string 40 | CreatedAt time.Time 41 | Data interface{} 42 | ProcessFn func(ctx context.Context, data interface{}) error 43 | } 44 | 45 | // Process executes the task by calling its ProcessFn with the associated context and data. 46 | // It conforms to the Work interface. 47 | func (t *Task) Process(ctx context.Context) error { 48 | return t.ProcessFn(ctx, t.Data) 49 | } 50 | 51 | // GetID returns the unique identifier of the task. 52 | // It conforms to the Work interface. 53 | func (t *Task) GetID() string { 54 | return t.ID 55 | } 56 | 57 | // GetCreatedAt returns the creation timestamp of the task. 58 | // It conforms to the Work interface. 59 | func (t *Task) GetCreatedAt() time.Time { 60 | return t.CreatedAt 61 | } 62 | 63 | // NewTask creates and returns a new Task instance. 64 | // 65 | // Parameters: 66 | // 67 | // id: The unique string identifier for the new task. 68 | // data: The data payload to be associated with the task. 69 | // processFn: The function that will be called to process this task's data. 70 | // This function must match the signature `func(ctx context.Context, data interface{}) error`. 71 | // 72 | // Returns: 73 | // 74 | // A pointer to the newly created Task. 75 | func NewTask(id string, data interface{}, processFn func(ctx context.Context, data interface{}) error) *Task { 76 | return &Task{ 77 | ID: id, 78 | CreatedAt: time.Now(), // Set creation time to now. 79 | Data: data, 80 | ProcessFn: processFn, 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /internal/io/buffer.go: -------------------------------------------------------------------------------- 1 | package io 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 
19 | */ 20 | 21 | import ( 22 | "bufio" 23 | "compress/gzip" 24 | "context" 25 | "errors" 26 | "fmt" 27 | "os" 28 | "path/filepath" 29 | "runtime" 30 | "sync" 31 | "sync/atomic" 32 | "time" 33 | ) 34 | 35 | const ( 36 | // DefaultBufferSize is the default buffer size for disk I/O 37 | DefaultBufferSize = 256 * 1024 // 256KB 38 | 39 | // PageSize is the OS page size for aligned writes 40 | PageSize = 4096 // 4KB, typical OS page size 41 | 42 | // FlushInterval is how often to flush buffers automatically 43 | FlushInterval = 2 * time.Second 44 | 45 | // BackpressureThreshold is the percentage of buffer capacity that triggers backpressure 46 | BackpressureThreshold = 0.8 // 80% 47 | ) 48 | 49 | var ( 50 | // ErrBufferFull is returned when the buffer is full and backpressure is applied 51 | ErrBufferFull = errors.New("write buffer full, applying backpressure") 52 | 53 | // ErrBufferClosed is returned when attempting to write to a closed buffer 54 | ErrBufferClosed = errors.New("write buffer closed") 55 | 56 | // ErrFlushTimeout is returned when a flush operation times out 57 | ErrFlushTimeout = errors.New("flush operation timed out") 58 | ) 59 | 60 | // BufferMetrics holds metrics for a buffer 61 | type BufferMetrics struct { 62 | BytesWritten atomic.Int64 63 | BytesFlushed atomic.Int64 64 | FlushCount atomic.Int64 65 | WriteCount atomic.Int64 66 | BackpressureHits atomic.Int64 67 | ErrorCount atomic.Int64 68 | LastFlushTime atomic.Int64 // Unix timestamp in nanoseconds 69 | LastWriteTime atomic.Int64 // Unix timestamp in nanoseconds 70 | LastErrorTime atomic.Int64 // Unix timestamp in nanoseconds 71 | } 72 | 73 | // AsyncBuffer is a high-performance buffer for disk I/O with async flushing 74 | type AsyncBuffer struct { 75 | // Immutable after creation 76 | file *os.File 77 | gzWriter *gzip.Writer 78 | bufWriter *bufio.Writer 79 | flushInterval time.Duration 80 | bufferSize int 81 | alignWrites bool 82 | compressed bool 83 | flushThreshold float64 84 | fileDescriptor int 85 | identifier string // For logging/metrics 86 | 87 | // Mutable state protected by mutex 88 | mu sync.Mutex 89 | closed bool 90 | lastFlushTime time.Time 91 | flushInProgress bool 92 | writeQueue [][]byte // Pending writes that couldn't fit in buffer 93 | 94 | // Context for cancellation 95 | ctx context.Context 96 | cancel context.CancelFunc 97 | 98 | // Wait group for flush operations 99 | flushWg sync.WaitGroup 100 | 101 | // Metrics (atomic) 102 | metrics BufferMetrics 103 | 104 | // Signaling channels 105 | flushComplete chan struct{} // Signals when a flush is complete 106 | backpressure chan struct{} // Signals when backpressure is applied/released 107 | } 108 | 109 | // AsyncBufferOptions configures an AsyncBuffer 110 | type AsyncBufferOptions struct { 111 | BufferSize int 112 | FlushInterval time.Duration 113 | AlignWrites bool 114 | Compressed bool 115 | FlushThreshold float64 116 | Identifier string 117 | } 118 | 119 | // DefaultAsyncBufferOptions returns the default options for AsyncBuffer 120 | func DefaultAsyncBufferOptions() *AsyncBufferOptions { 121 | return &AsyncBufferOptions{ 122 | BufferSize: DefaultBufferSize, 123 | FlushInterval: FlushInterval, 124 | AlignWrites: true, 125 | Compressed: false, 126 | FlushThreshold: BackpressureThreshold, 127 | Identifier: "", 128 | } 129 | } 130 | 131 | // NewAsyncBuffer creates a new AsyncBuffer 132 | func NewAsyncBuffer(ctx context.Context, path string, options *AsyncBufferOptions) (*AsyncBuffer, error) { 133 | if options == nil { 134 | options = 
DefaultAsyncBufferOptions() 135 | } 136 | 137 | // Ensure directory exists 138 | dir := filepath.Dir(path) 139 | if err := os.MkdirAll(dir, 0755); err != nil { 140 | return nil, fmt.Errorf("failed to create directory %s: %w", dir, err) 141 | } 142 | 143 | // Open file with direct I/O if supported and requested 144 | flag := os.O_CREATE | os.O_WRONLY | os.O_TRUNC 145 | if options.AlignWrites && runtime.GOOS == "linux" { 146 | // oDirect is only available on Linux 147 | // Use a constant value instead of syscall.O_DIRECT to avoid build errors on other platforms 148 | const oDirect = 0x4000 // Linux specific 149 | flag |= oDirect 150 | } 151 | 152 | file, err := os.OpenFile(path, flag, 0644) 153 | if err != nil { 154 | return nil, fmt.Errorf("failed to open file %s: %w", path, err) 155 | } 156 | 157 | // Get file descriptor for direct operations 158 | fd := int(file.Fd()) 159 | 160 | // Create buffer context 161 | bufCtx, bufCancel := context.WithCancel(ctx) 162 | 163 | // Create the buffer 164 | ab := &AsyncBuffer{ 165 | file: file, 166 | bufferSize: options.BufferSize, 167 | alignWrites: options.AlignWrites, 168 | compressed: options.Compressed, 169 | flushInterval: options.FlushInterval, 170 | flushThreshold: options.FlushThreshold, 171 | fileDescriptor: fd, 172 | identifier: options.Identifier, 173 | lastFlushTime: time.Now(), 174 | ctx: bufCtx, 175 | cancel: bufCancel, 176 | flushComplete: make(chan struct{}, 1), 177 | backpressure: make(chan struct{}, 1), 178 | } 179 | 180 | // Set up the writer chain 181 | if options.Compressed { 182 | gzw, err := gzip.NewWriterLevel(file, gzip.BestSpeed) 183 | if err != nil { 184 | file.Close() 185 | bufCancel() 186 | return nil, fmt.Errorf("failed to create gzip writer: %w", err) 187 | } 188 | ab.gzWriter = gzw 189 | ab.bufWriter = bufio.NewWriterSize(gzw, options.BufferSize) 190 | } else { 191 | ab.bufWriter = bufio.NewWriterSize(file, options.BufferSize) 192 | } 193 | 194 | // Start background flusher 195 | ab.startBackgroundFlusher() 196 | 197 | return ab, nil 198 | } 199 | 200 | // startBackgroundFlusher starts a goroutine that periodically flushes the buffer 201 | func (ab *AsyncBuffer) startBackgroundFlusher() { 202 | ticker := time.NewTicker(ab.flushInterval) 203 | 204 | go func() { 205 | defer ticker.Stop() 206 | for { 207 | select { 208 | case <-ticker.C: 209 | if err := ab.Flush(); err != nil && !errors.Is(err, ErrFlushTimeout) { 210 | ab.metrics.ErrorCount.Add(1) 211 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 212 | // TODO: Log error 213 | } 214 | case <-ab.ctx.Done(): 215 | return 216 | } 217 | } 218 | }() 219 | } 220 | 221 | // Write writes data to the buffer 222 | func (ab *AsyncBuffer) Write(data []byte) (int, error) { 223 | ab.mu.Lock() 224 | defer ab.mu.Unlock() 225 | 226 | if ab.closed { 227 | return 0, ErrBufferClosed 228 | } 229 | 230 | // Check if we need to apply backpressure 231 | if float64(ab.bufWriter.Buffered())/float64(ab.bufferSize) >= ab.flushThreshold { 232 | // Signal backpressure 233 | select { 234 | case ab.backpressure <- struct{}{}: 235 | default: 236 | // Channel already has a value 237 | } 238 | 239 | ab.metrics.BackpressureHits.Add(1) 240 | 241 | // If we have too many pending writes, return error 242 | if len(ab.writeQueue) > 100 { 243 | return 0, ErrBufferFull 244 | } 245 | 246 | // Queue the write for later 247 | dataCopy := make([]byte, len(data)) 248 | copy(dataCopy, data) 249 | ab.writeQueue = append(ab.writeQueue, dataCopy) 250 | 251 | // Trigger a flush 252 | go ab.Flush() 253 | 254 | return 
len(data), nil 255 | } 256 | 257 | // Write to buffer 258 | n, err := ab.bufWriter.Write(data) 259 | if err != nil { 260 | ab.metrics.ErrorCount.Add(1) 261 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 262 | return n, fmt.Errorf("failed to write to buffer: %w", err) 263 | } 264 | 265 | ab.metrics.BytesWritten.Add(int64(n)) 266 | ab.metrics.WriteCount.Add(1) 267 | ab.metrics.LastWriteTime.Store(time.Now().UnixNano()) 268 | 269 | // Process queued writes if buffer has space 270 | if len(ab.writeQueue) > 0 && float64(ab.bufWriter.Buffered())/float64(ab.bufferSize) < ab.flushThreshold { 271 | // Process some queued writes 272 | processed := 0 273 | for i, queuedData := range ab.writeQueue { 274 | if float64(ab.bufWriter.Buffered()+len(queuedData))/float64(ab.bufferSize) >= ab.flushThreshold { 275 | break 276 | } 277 | 278 | n, err := ab.bufWriter.Write(queuedData) 279 | if err != nil { 280 | ab.metrics.ErrorCount.Add(1) 281 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 282 | break 283 | } 284 | 285 | ab.metrics.BytesWritten.Add(int64(n)) 286 | ab.metrics.WriteCount.Add(1) 287 | processed = i + 1 288 | } 289 | 290 | // Remove processed items from queue 291 | if processed > 0 { 292 | ab.writeQueue = ab.writeQueue[processed:] 293 | } 294 | 295 | // If queue is empty, release backpressure 296 | if len(ab.writeQueue) == 0 { 297 | // Clear backpressure signal 298 | select { 299 | case <-ab.backpressure: 300 | default: 301 | } 302 | } 303 | } 304 | 305 | return n, nil 306 | } 307 | 308 | // Flush flushes the buffer to disk 309 | func (ab *AsyncBuffer) Flush() error { 310 | ab.mu.Lock() 311 | 312 | if ab.closed { 313 | ab.mu.Unlock() 314 | return ErrBufferClosed 315 | } 316 | 317 | if ab.flushInProgress { 318 | // Another flush is already in progress 319 | ab.mu.Unlock() 320 | 321 | // Wait for it to complete with timeout 322 | select { 323 | case <-ab.flushComplete: 324 | return nil 325 | case <-time.After(5 * time.Second): 326 | return ErrFlushTimeout 327 | case <-ab.ctx.Done(): 328 | return ab.ctx.Err() 329 | } 330 | } 331 | 332 | // Nothing to flush 333 | if ab.bufWriter.Buffered() == 0 { 334 | ab.mu.Unlock() 335 | return nil 336 | } 337 | 338 | // Mark flush in progress 339 | ab.flushInProgress = true 340 | ab.flushWg.Add(1) 341 | ab.mu.Unlock() 342 | 343 | // Perform the flush in a separate goroutine to avoid blocking 344 | go func() { 345 | defer ab.flushWg.Done() 346 | defer func() { 347 | ab.mu.Lock() 348 | ab.flushInProgress = false 349 | ab.lastFlushTime = time.Now() 350 | ab.mu.Unlock() 351 | 352 | // Signal flush complete 353 | select { 354 | case ab.flushComplete <- struct{}{}: 355 | default: 356 | } 357 | }() 358 | 359 | // Flush the buffer 360 | if err := ab.bufWriter.Flush(); err != nil { 361 | ab.metrics.ErrorCount.Add(1) 362 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 363 | return 364 | } 365 | 366 | // If compressed, flush the gzip writer 367 | if ab.compressed && ab.gzWriter != nil { 368 | if err := ab.gzWriter.Flush(); err != nil { 369 | ab.metrics.ErrorCount.Add(1) 370 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 371 | return 372 | } 373 | } 374 | 375 | // Sync to disk 376 | if err := ab.file.Sync(); err != nil { 377 | ab.metrics.ErrorCount.Add(1) 378 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 379 | return 380 | } 381 | 382 | // Update metrics 383 | ab.metrics.FlushCount.Add(1) 384 | ab.metrics.BytesFlushed.Add(int64(ab.bufWriter.Buffered())) 385 | ab.metrics.LastFlushTime.Store(time.Now().UnixNano()) 386 | }() 387 | 388 | 
return nil 389 | } 390 | 391 | // Close flushes and closes the buffer 392 | func (ab *AsyncBuffer) Close() error { 393 | ab.mu.Lock() 394 | 395 | if ab.closed { 396 | ab.mu.Unlock() 397 | return nil 398 | } 399 | 400 | ab.closed = true 401 | ab.mu.Unlock() 402 | 403 | // Cancel context to stop background flusher 404 | ab.cancel() 405 | 406 | // Wait for any in-progress flushes to complete 407 | ab.flushWg.Wait() 408 | 409 | // Final flush 410 | if err := ab.bufWriter.Flush(); err != nil { 411 | return fmt.Errorf("failed to flush buffer on close: %w", err) 412 | } 413 | 414 | // Close gzip writer if used 415 | if ab.compressed && ab.gzWriter != nil { 416 | if err := ab.gzWriter.Close(); err != nil { 417 | return fmt.Errorf("failed to close gzip writer: %w", err) 418 | } 419 | } 420 | 421 | // Close file 422 | if err := ab.file.Close(); err != nil { 423 | return fmt.Errorf("failed to close file: %w", err) 424 | } 425 | 426 | return nil 427 | } 428 | 429 | // WaitForBackpressure waits for backpressure to be applied 430 | func (ab *AsyncBuffer) WaitForBackpressure(ctx context.Context) error { 431 | select { 432 | case <-ab.backpressure: 433 | return nil 434 | case <-ctx.Done(): 435 | return ctx.Err() 436 | } 437 | } 438 | 439 | // GetMetrics returns the current metrics for the buffer 440 | func (ab *AsyncBuffer) GetMetrics() *BufferMetrics { 441 | return &ab.metrics 442 | } 443 | 444 | // BufferPool manages a pool of AsyncBuffers 445 | type BufferPool struct { 446 | mu sync.RWMutex 447 | buffers map[string]*AsyncBuffer 448 | ctx context.Context 449 | cancel context.CancelFunc 450 | options *AsyncBufferOptions 451 | } 452 | 453 | // NewBufferPool creates a new BufferPool 454 | func NewBufferPool(ctx context.Context, options *AsyncBufferOptions) *BufferPool { 455 | poolCtx, poolCancel := context.WithCancel(ctx) 456 | 457 | return &BufferPool{ 458 | buffers: make(map[string]*AsyncBuffer), 459 | ctx: poolCtx, 460 | cancel: poolCancel, 461 | options: options, 462 | } 463 | } 464 | 465 | // GetBuffer returns a buffer for the given path, creating it if necessary 466 | func (bp *BufferPool) GetBuffer(path string) (*AsyncBuffer, error) { 467 | // First check if buffer exists with read lock 468 | bp.mu.RLock() 469 | buffer, exists := bp.buffers[path] 470 | bp.mu.RUnlock() 471 | 472 | if exists { 473 | return buffer, nil 474 | } 475 | 476 | // Create new buffer with write lock 477 | bp.mu.Lock() 478 | defer bp.mu.Unlock() 479 | 480 | // Check again in case another goroutine created it 481 | buffer, exists = bp.buffers[path] 482 | if exists { 483 | return buffer, nil 484 | } 485 | 486 | // Create new buffer 487 | options := *bp.options // Copy options 488 | options.Identifier = path 489 | 490 | buffer, err := NewAsyncBuffer(bp.ctx, path, &options) 491 | if err != nil { 492 | return nil, err 493 | } 494 | 495 | bp.buffers[path] = buffer 496 | return buffer, nil 497 | } 498 | 499 | // Close closes all buffers in the pool 500 | func (bp *BufferPool) Close() error { 501 | bp.cancel() // Cancel context to stop all background operations 502 | 503 | bp.mu.Lock() 504 | defer bp.mu.Unlock() 505 | 506 | var lastErr error 507 | for path, buffer := range bp.buffers { 508 | if err := buffer.Close(); err != nil { 509 | lastErr = fmt.Errorf("failed to close buffer %s: %w", path, err) 510 | } 511 | } 512 | 513 | return lastErr 514 | } 515 | 516 | // Flush flushes all buffers in the pool 517 | func (bp *BufferPool) Flush() error { 518 | bp.mu.RLock() 519 | defer bp.mu.RUnlock() 520 | 521 | var lastErr error 522 | for 
path, buffer := range bp.buffers { 523 | if err := buffer.Flush(); err != nil { 524 | lastErr = fmt.Errorf("failed to flush buffer %s: %w", path, err) 525 | } 526 | } 527 | 528 | return lastErr 529 | } 530 | -------------------------------------------------------------------------------- /internal/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "context" 23 | "log" 24 | "net/http" 25 | "strconv" 26 | "sync" 27 | "time" 28 | 29 | "github.com/prometheus/client_golang/prometheus" 30 | "github.com/prometheus/client_golang/prometheus/promauto" 31 | "github.com/prometheus/client_golang/prometheus/promhttp" 32 | ) 33 | 34 | var ( 35 | registry = prometheus.NewRegistry() 36 | defaultRegisterer = promauto.With(registry) 37 | metricsInitialized sync.Once 38 | metricsEnabled bool 39 | metricsServer *http.Server 40 | ) 41 | 42 | // Metrics contains all the Prometheus metrics for the application 43 | type Metrics struct { 44 | // Certificate processing metrics 45 | CertProcessingDuration *prometheus.HistogramVec 46 | CertProcessedTotal *prometheus.CounterVec 47 | CertFailedTotal *prometheus.CounterVec 48 | 49 | // Network metrics 50 | NetworkRequestDuration *prometheus.HistogramVec 51 | NetworkRequestsTotal *prometheus.CounterVec 52 | NetworkErrorsTotal *prometheus.CounterVec 53 | NetworkRetriesTotal *prometheus.CounterVec 54 | TLSHandshakeDuration *prometheus.HistogramVec 55 | 56 | // Queue metrics 57 | QueueSize *prometheus.GaugeVec 58 | QueueLatency *prometheus.HistogramVec 59 | QueuePressure *prometheus.GaugeVec 60 | QueueCapacity *prometheus.GaugeVec 61 | QueueBackpressureHit *prometheus.CounterVec 62 | 63 | // Worker metrics 64 | WorkerBusy *prometheus.GaugeVec 65 | WorkerProcessed *prometheus.CounterVec 66 | WorkerErrors *prometheus.CounterVec 67 | WorkerPanics *prometheus.CounterVec 68 | WorkerIdleDuration *prometheus.HistogramVec 69 | WorkerRateLimit *prometheus.GaugeVec 70 | 71 | // Disk I/O metrics 72 | DiskWriteDuration *prometheus.HistogramVec 73 | DiskWriteBytes *prometheus.HistogramVec 74 | DiskWriteOps *prometheus.CounterVec 75 | DiskErrors *prometheus.CounterVec 76 | DiskBufferSize *prometheus.GaugeVec 77 | 78 | // Scheduler metrics 79 | SchedulerShardsActive *prometheus.GaugeVec 80 | SchedulerWorkSubmitted *prometheus.CounterVec 81 | SchedulerWorkCompleted *prometheus.CounterVec 82 | SchedulerWorkFailed *prometheus.CounterVec 83 | SchedulerRateLimitDelay *prometheus.HistogramVec 84 | SchedulerRetriesRate *prometheus.GaugeVec 85 | } 86 | 87 | // Global instance of metrics 88 | var globalMetrics *Metrics 89 | var metricsOnce sync.Once 90 | 91 | // GetMetrics returns the global metrics instance 92 | func 
GetMetrics() *Metrics { 93 | metricsOnce.Do(func() { 94 | globalMetrics = newMetrics() 95 | }) 96 | return globalMetrics 97 | } 98 | 99 | // EnableMetrics enables metrics collection 100 | func EnableMetrics() { 101 | metricsEnabled = true 102 | } 103 | 104 | // IsMetricsEnabled returns whether metrics collection is enabled 105 | func IsMetricsEnabled() bool { 106 | return metricsEnabled 107 | } 108 | 109 | // newMetrics creates and registers all metrics 110 | func newMetrics() *Metrics { 111 | buckets := []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60} 112 | byteBuckets := []float64{1024, 10 * 1024, 50 * 1024, 100 * 1024, 500 * 1024, 1000 * 1024, 5000 * 1024, 10000 * 1024} 113 | 114 | m := &Metrics{ 115 | // Certificate processing metrics 116 | CertProcessingDuration: defaultRegisterer.NewHistogramVec( 117 | prometheus.HistogramOpts{ 118 | Name: "rxtls_cert_processing_duration_seconds", 119 | Help: "Time spent processing certificates", 120 | Buckets: buckets, 121 | }, 122 | []string{"log_url", "operation"}, 123 | ), 124 | CertProcessedTotal: defaultRegisterer.NewCounterVec( 125 | prometheus.CounterOpts{ 126 | Name: "rxtls_cert_processed_total", 127 | Help: "Total number of certificates processed", 128 | }, 129 | []string{"log_url", "operation", "status"}, 130 | ), 131 | CertFailedTotal: defaultRegisterer.NewCounterVec( 132 | prometheus.CounterOpts{ 133 | Name: "rxtls_cert_failed_total", 134 | Help: "Total number of certificate processing failures", 135 | }, 136 | []string{"log_url", "operation", "error_type"}, 137 | ), 138 | 139 | // Network metrics 140 | NetworkRequestDuration: defaultRegisterer.NewHistogramVec( 141 | prometheus.HistogramOpts{ 142 | Name: "rxtls_network_request_duration_seconds", 143 | Help: "Time spent on network requests", 144 | Buckets: buckets, 145 | }, 146 | []string{"log_url", "endpoint"}, 147 | ), 148 | NetworkRequestsTotal: defaultRegisterer.NewCounterVec( 149 | prometheus.CounterOpts{ 150 | Name: "rxtls_network_requests_total", 151 | Help: "Total number of network requests", 152 | }, 153 | []string{"log_url", "endpoint", "status"}, 154 | ), 155 | NetworkErrorsTotal: defaultRegisterer.NewCounterVec( 156 | prometheus.CounterOpts{ 157 | Name: "rxtls_network_errors_total", 158 | Help: "Total number of network errors", 159 | }, 160 | []string{"log_url", "endpoint", "error_type"}, 161 | ), 162 | NetworkRetriesTotal: defaultRegisterer.NewCounterVec( 163 | prometheus.CounterOpts{ 164 | Name: "rxtls_network_retries_total", 165 | Help: "Total number of network retries", 166 | }, 167 | []string{"log_url", "endpoint"}, 168 | ), 169 | TLSHandshakeDuration: defaultRegisterer.NewHistogramVec( 170 | prometheus.HistogramOpts{ 171 | Name: "rxtls_tls_handshake_duration_seconds", 172 | Help: "Time spent on TLS handshakes", 173 | Buckets: buckets, 174 | }, 175 | []string{"log_url"}, 176 | ), 177 | 178 | // Queue metrics 179 | QueueSize: defaultRegisterer.NewGaugeVec( 180 | prometheus.GaugeOpts{ 181 | Name: "rxtls_queue_size", 182 | Help: "Current size of work queues", 183 | }, 184 | []string{"worker_id", "log_url"}, 185 | ), 186 | QueueLatency: defaultRegisterer.NewHistogramVec( 187 | prometheus.HistogramOpts{ 188 | Name: "rxtls_queue_latency_seconds", 189 | Help: "Time items spend in queue before processing", 190 | Buckets: buckets, 191 | }, 192 | []string{"worker_id", "log_url"}, 193 | ), 194 | QueuePressure: defaultRegisterer.NewGaugeVec( 195 | prometheus.GaugeOpts{ 196 | Name: "rxtls_queue_pressure", 197 | Help: "Queue pressure as a ratio of current size 
to capacity (0-1)", 198 | }, 199 | []string{"worker_id", "log_url"}, 200 | ), 201 | QueueCapacity: defaultRegisterer.NewGaugeVec( 202 | prometheus.GaugeOpts{ 203 | Name: "rxtls_queue_capacity", 204 | Help: "Maximum capacity of work queues", 205 | }, 206 | []string{"worker_id"}, 207 | ), 208 | QueueBackpressureHit: defaultRegisterer.NewCounterVec( 209 | prometheus.CounterOpts{ 210 | Name: "rxtls_queue_backpressure_hits_total", 211 | Help: "Number of times backpressure was applied due to full queue", 212 | }, 213 | []string{"worker_id", "log_url"}, 214 | ), 215 | 216 | // Worker metrics 217 | WorkerBusy: defaultRegisterer.NewGaugeVec( 218 | prometheus.GaugeOpts{ 219 | Name: "rxtls_worker_busy", 220 | Help: "Whether a worker is currently busy (1) or idle (0)", 221 | }, 222 | []string{"worker_id"}, 223 | ), 224 | WorkerProcessed: defaultRegisterer.NewCounterVec( 225 | prometheus.CounterOpts{ 226 | Name: "rxtls_worker_processed_total", 227 | Help: "Total number of items processed by a worker", 228 | }, 229 | []string{"worker_id", "log_url"}, 230 | ), 231 | WorkerErrors: defaultRegisterer.NewCounterVec( 232 | prometheus.CounterOpts{ 233 | Name: "rxtls_worker_errors_total", 234 | Help: "Total number of errors encountered by a worker", 235 | }, 236 | []string{"worker_id", "log_url", "error_type"}, 237 | ), 238 | WorkerPanics: defaultRegisterer.NewCounterVec( 239 | prometheus.CounterOpts{ 240 | Name: "rxtls_worker_panics_total", 241 | Help: "Total number of panics recovered by a worker", 242 | }, 243 | []string{"worker_id"}, 244 | ), 245 | WorkerIdleDuration: defaultRegisterer.NewHistogramVec( 246 | prometheus.HistogramOpts{ 247 | Name: "rxtls_worker_idle_duration_seconds", 248 | Help: "Time workers spend idle waiting for work", 249 | Buckets: buckets, 250 | }, 251 | []string{"worker_id"}, 252 | ), 253 | WorkerRateLimit: defaultRegisterer.NewGaugeVec( 254 | prometheus.GaugeOpts{ 255 | Name: "rxtls_worker_rate_limit", 256 | Help: "Current rate limit for each worker", 257 | }, 258 | []string{"worker_id"}, 259 | ), 260 | 261 | // Disk I/O metrics 262 | DiskWriteDuration: defaultRegisterer.NewHistogramVec( 263 | prometheus.HistogramOpts{ 264 | Name: "rxtls_disk_write_duration_seconds", 265 | Help: "Time spent writing to disk", 266 | Buckets: buckets, 267 | }, 268 | []string{"log_url", "operation"}, 269 | ), 270 | DiskWriteBytes: defaultRegisterer.NewHistogramVec( 271 | prometheus.HistogramOpts{ 272 | Name: "rxtls_disk_write_bytes_total", 273 | Help: "Total number of bytes written to disk", 274 | Buckets: byteBuckets, 275 | }, 276 | []string{"log_url", "operation"}, 277 | ), 278 | DiskWriteOps: defaultRegisterer.NewCounterVec( 279 | prometheus.CounterOpts{ 280 | Name: "rxtls_disk_write_ops_total", 281 | Help: "Total number of write operations to disk", 282 | }, 283 | []string{"log_url", "operation"}, 284 | ), 285 | DiskErrors: defaultRegisterer.NewCounterVec( 286 | prometheus.CounterOpts{ 287 | Name: "rxtls_disk_errors_total", 288 | Help: "Total number of disk errors", 289 | }, 290 | []string{"log_url", "operation", "error_type"}, 291 | ), 292 | DiskBufferSize: defaultRegisterer.NewGaugeVec( 293 | prometheus.GaugeOpts{ 294 | Name: "rxtls_disk_buffer_size_bytes", 295 | Help: "Size of disk write buffers in bytes", 296 | }, 297 | []string{"log_url", "operation"}, 298 | ), 299 | 300 | // Scheduler metrics 301 | SchedulerShardsActive: defaultRegisterer.NewGaugeVec( 302 | prometheus.GaugeOpts{ 303 | Name: "rxtls_scheduler_shards_active", 304 | Help: "Number of active shards in the scheduler", 305 | }, 306 | 
[]string{"operation"}, 307 | ), 308 | SchedulerWorkSubmitted: defaultRegisterer.NewCounterVec( 309 | prometheus.CounterOpts{ 310 | Name: "rxtls_scheduler_work_submitted_total", 311 | Help: "Total number of work items submitted to the scheduler", 312 | }, 313 | []string{"log_url", "operation"}, 314 | ), 315 | SchedulerWorkCompleted: defaultRegisterer.NewCounterVec( 316 | prometheus.CounterOpts{ 317 | Name: "rxtls_scheduler_work_completed_total", 318 | Help: "Total number of work items completed by the scheduler", 319 | }, 320 | []string{"log_url", "operation"}, 321 | ), 322 | SchedulerWorkFailed: defaultRegisterer.NewCounterVec( 323 | prometheus.CounterOpts{ 324 | Name: "rxtls_scheduler_work_failed_total", 325 | Help: "Total number of work items that failed processing", 326 | }, 327 | []string{"log_url", "operation", "error_type"}, 328 | ), 329 | SchedulerRateLimitDelay: defaultRegisterer.NewHistogramVec( 330 | prometheus.HistogramOpts{ 331 | Name: "rxtls_scheduler_rate_limit_delay_seconds", 332 | Help: "Time spent waiting due to rate limiting", 333 | Buckets: buckets, 334 | }, 335 | []string{"log_url", "operation"}, 336 | ), 337 | SchedulerRetriesRate: defaultRegisterer.NewGaugeVec( 338 | prometheus.GaugeOpts{ 339 | Name: "rxtls_scheduler_retries_rate", 340 | Help: "Rate of retries per second", 341 | }, 342 | []string{"log_url", "operation"}, 343 | ), 344 | } 345 | 346 | return m 347 | } 348 | 349 | // StartMetricsServer starts an HTTP server to expose Prometheus metrics 350 | func StartMetricsServer(addr string) error { 351 | if !metricsEnabled { 352 | return nil 353 | } 354 | 355 | // Only start once 356 | var startErr error 357 | metricsInitialized.Do(func() { 358 | mux := http.NewServeMux() 359 | mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) 360 | 361 | metricsServer = &http.Server{ 362 | Addr: addr, 363 | Handler: mux, 364 | } 365 | 366 | go func() { 367 | log.Printf("Starting metrics server on %s", addr) 368 | if err := metricsServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { 369 | log.Printf("Metrics server error: %v", err) 370 | } 371 | }() 372 | }) 373 | 374 | return startErr 375 | } 376 | 377 | // ShutdownMetricsServer gracefully shuts down the metrics server 378 | func ShutdownMetricsServer(ctx context.Context) error { 379 | if metricsServer != nil { 380 | log.Println("Shutting down metrics server...") 381 | return metricsServer.Shutdown(ctx) 382 | } 383 | return nil 384 | } 385 | 386 | // RecordWithLabels is a helper to record metrics with labels 387 | func (m *Metrics) RecordWithLabels(fn func(), labels prometheus.Labels) { 388 | if !metricsEnabled { 389 | fn() 390 | return 391 | } 392 | 393 | start := time.Now() 394 | fn() 395 | _ = time.Since(start) // Record duration if needed 396 | // This is just a placeholder - actual implementation would depend on the metric type 397 | } 398 | 399 | // MeasureDuration is a helper to measure the duration of a function 400 | func MeasureDuration(histogram *prometheus.HistogramVec, labels prometheus.Labels) func() { 401 | if !metricsEnabled { 402 | return func() {} 403 | } 404 | 405 | start := time.Now() 406 | return func() { 407 | duration := time.Since(start) 408 | histogram.With(labels).Observe(duration.Seconds()) 409 | } 410 | } 411 | 412 | // UpdateQueueMetrics updates queue metrics for a worker 413 | func (m *Metrics) UpdateQueueMetrics(workerID int, logURL string, queueSize, queueCapacity int) { 414 | if !metricsEnabled { 415 | return 416 | } 417 | 418 | 
m.QueueSize.WithLabelValues(strconv.Itoa(workerID), logURL).Set(float64(queueSize)) 419 | m.QueueCapacity.WithLabelValues(strconv.Itoa(workerID)).Set(float64(queueCapacity)) 420 | 421 | if queueCapacity > 0 { 422 | pressure := float64(queueSize) / float64(queueCapacity) 423 | m.QueuePressure.WithLabelValues(strconv.Itoa(workerID), logURL).Set(pressure) 424 | } 425 | } 426 | 427 | // UpdateWorkerRateLimit updates the rate limit metric for a worker 428 | func (m *Metrics) UpdateWorkerRateLimit(workerID int, rateLimit float64) { 429 | if !metricsEnabled { 430 | return 431 | } 432 | 433 | m.WorkerRateLimit.WithLabelValues(strconv.Itoa(workerID)).Set(rateLimit) 434 | } 435 | 436 | // UpdateRetriesRate updates the retries rate metric 437 | func (m *Metrics) UpdateRetriesRate(logURL, operation string, retriesPerSecond float64) { 438 | if !metricsEnabled { 439 | return 440 | } 441 | 442 | m.SchedulerRetriesRate.WithLabelValues(logURL, operation).Set(retriesPerSecond) 443 | } 444 | -------------------------------------------------------------------------------- /internal/util/filename.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package util provides miscellaneous utility functions used across the rxtls application. 3 | These functions are typically small, self-contained, and offer common helper functionalities 4 | that don't belong to a more specific package like `core` or `client`. 5 | */ 6 | package util 7 | 8 | /* 9 | rxtls — fast tool in Go for working with Certificate Transparency logs 10 | Copyright (C) 2025 Pepijn van der Stap 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU Affero General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU Affero General Public License for more details. 21 | 22 | You should have received a copy of the GNU Affero General Public License 23 | along with this program. If not, see . 24 | */ 25 | 26 | import "strings" 27 | 28 | // SanitizeFilename takes an input string (typically a URL or a descriptive name) 29 | // and transforms it into a string that is generally safe to use as a filename 30 | // on common operating systems. 31 | // 32 | // The sanitization process involves: 33 | // 1. Replacing characters that are problematic in filenames (e.g., '/', '\', ':', '*', '?', '"', '<', '>', '|') 34 | // with underscores ('_'). 35 | // 2. Limiting the total length of the filename to a predefined maximum (currently 100 characters) 36 | // to prevent issues with OS filename length limits. 37 | // 38 | // This function is primarily used when generating output filenames based on CT log URLs 39 | // to ensure that the resulting names are valid and do not cause filesystem errors. 40 | // 41 | // Performance: For its intended use (generating a few filenames at the start of processing a log), 42 | // the performance of this function is not critical. It uses standard string manipulation functions. 43 | // 44 | // Parameters: 45 | // input: The string to be sanitized into a filename-safe format. 46 | // 47 | // Returns: 48 | // A sanitized string suitable for use as a filename. 
49 | func SanitizeFilename(input string) string { 50 | // Replace common problematic characters with an underscore. 51 | // This set can be expanded if other problematic characters are identified. 52 | replaced := strings.Map(func(r rune) rune { 53 | switch r { 54 | case '/', '\\', ':', '*', '?', '"', '<', '>', '|': // Common invalid filename chars on Windows/Unix. 55 | return '_' 56 | } 57 | return r // Keep other characters as they are. 58 | }, input) 59 | 60 | // Limit filename length to avoid issues with operating system limits. 61 | // A maxLength of 100 is a conservative choice, well within typical FS limits (e.g., 255 bytes). 62 | const maxLength = 100 63 | if len(replaced) > maxLength { 64 | // Truncate the string if it exceeds the maximum length. 65 | // Note: This is a simple truncation. For multi-byte character sets (UTF-8), 66 | // this could potentially cut a character in half if not careful. However, for URLs 67 | // and typical log names, this is often acceptable. More robust truncation would 68 | // require rune-aware iteration. 69 | return replaced[:maxLength] 70 | } 71 | return replaced 72 | } 73 | --------------------------------------------------------------------------------
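As a closing illustration (not part of the repository), the sketch below shows one way the pieces dumped above — the adaptive RateLimiter, the AsyncBuffer/BufferPool writer, and SanitizeFilename — might be wired together from code living inside this module; the internal/ packages are only importable from within the module itself, and the log URL, output directory, and loop bounds are made-up values.

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/x-stp/rxtls/internal/core"
	rxio "github.com/x-stp/rxtls/internal/io"
	"github.com/x-stp/rxtls/internal/util"
)

func main() {
	ctx := context.Background()

	// Adaptive limiter starting at 10 requests per second.
	limiter := core.NewRateLimiter(10)

	// One asynchronously flushed buffer per output file, keyed by path.
	opts := rxio.DefaultAsyncBufferOptions()
	opts.AlignWrites = false // plain buffered writes are enough for this sketch
	pool := rxio.NewBufferPool(ctx, opts)
	defer func() { _ = pool.Close() }()

	logURL := "https://ct.example.org/2025/" // made-up log URL
	path := "output/" + util.SanitizeFilename(logURL) + ".csv"

	buf, err := pool.GetBuffer(path)
	if err != nil {
		log.Fatalf("open buffer: %v", err)
	}

	for i := 0; i < 5; i++ {
		// Spin until the token bucket releases a slot.
		for !limiter.Allow() {
			time.Sleep(10 * time.Millisecond)
		}
		if _, err := buf.Write([]byte(fmt.Sprintf("entry %d\n", i))); err != nil {
			limiter.RecordFailure()
			continue
		}
		limiter.RecordSuccess()
	}
}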