├── .dockerignore ├── .github ├── FUNDING.yml └── workflows │ ├── codacy.yml │ ├── codeql.yml │ ├── go-ossf-slsa3-publish.yml │ ├── power-ci.yaml │ └── release.yml ├── .gitignore ├── .golangci.yml ├── .goreleaser.yaml ├── Dockerfile ├── Dockerfile.goreleaser ├── LICENSE ├── Makefile ├── README.md ├── all_logs_list.json ├── cmd └── rxtls │ └── main.go ├── go.mod ├── go.sum └── internal ├── certlib ├── api.go ├── domain_normalization_test.go ├── models.go └── models_test.go ├── client └── http.go ├── core ├── common.go ├── constants.go ├── domain_extractor.go ├── download_manager.go ├── error.go ├── list.go ├── ratelimiter.go ├── scheduler.go ├── scheduler_stub.go └── work.go ├── io └── buffer.go ├── metrics └── metrics.go └── util └── filename.go /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git files 2 | .git 3 | .gitignore 4 | 5 | # Documentation 6 | *.md 7 | docs/ 8 | LICENSE 9 | 10 | # Build artifacts 11 | dist/ 12 | *.exe 13 | rxtls 14 | 15 | # Development files 16 | .goreleaser.yaml 17 | .golangci.yml 18 | Makefile 19 | .github/ 20 | 21 | # Test files 22 | *_test.go 23 | testdata/ 24 | 25 | # IDE files 26 | .vscode/ 27 | .idea/ 28 | *.swp 29 | *.swo 30 | 31 | # OS files 32 | .DS_Store 33 | Thumbs.db -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: xstp 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: securetheplanet 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: xstp 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 16 | -------------------------------------------------------------------------------- /.github/workflows/codacy.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # This workflow checks out code, performs a Codacy security scan 7 | # and integrates the results with the 8 | # GitHub Advanced Security code scanning feature. For more information on 9 | # the Codacy security scan action usage and parameters, see 10 | # https://github.com/codacy/codacy-analysis-cli-action. 11 | # For more information on Codacy Analysis CLI in general, see 12 | # https://github.com/codacy/codacy-analysis-cli. 
13 | 14 | name: Codacy Security Scan 15 | 16 | on: 17 | push: 18 | branches: [ "main" ] 19 | pull_request: 20 | # The branches below must be a subset of the branches above 21 | branches: [ "main" ] 22 | schedule: 23 | - cron: '32 7 * * 0' 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | codacy-security-scan: 30 | permissions: 31 | contents: read # for actions/checkout to fetch code 32 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 33 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 34 | name: Codacy Security Scan 35 | runs-on: ubuntu-latest 36 | steps: 37 | # Checkout the repository to the GitHub Actions runner 38 | - name: Checkout code 39 | uses: actions/checkout@v4 40 | 41 | # Execute Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis 42 | - name: Run Codacy Analysis CLI 43 | uses: codacy/codacy-analysis-cli-action@d840f886c4bd4edc059706d09c6a1586111c540b 44 | with: 45 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository 46 | # You can also omit the token and run the tools that support default configurations 47 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 48 | verbose: true 49 | output: results.sarif 50 | format: sarif 51 | # Adjust severity of non-security issues 52 | gh-code-scanning-compat: true 53 | # Force 0 exit code to allow SARIF file generation 54 | # This will handover control about PR rejection to the GitHub side 55 | max-allowed-issues: 2147483647 56 | 57 | # Upload the SARIF file generated in the previous step 58 | - name: Upload SARIF results file 59 | uses: github/codeql-action/upload-sarif@v3 60 | with: 61 | sarif_file: results.sarif 62 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL Advanced" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '18 18 * * 1' 21 | 22 | jobs: 23 | analyze: 24 | name: Analyze (${{ matrix.language }}) 25 | # Runner size impacts CodeQL analysis time. To learn more, please see: 26 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 27 | # - https://gh.io/supported-runners-and-hardware-resources 28 | # - https://gh.io/using-larger-runners (GitHub.com only) 29 | # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
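    # Note: the expression on the next line selects the runner per matrix entry: 'macos-latest'
    # only when the language being analyzed is Swift, and 'ubuntu-latest' otherwise. With the
    # languages configured below (actions, go), every job therefore runs on ubuntu-latest.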
30 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 31 | permissions: 32 | # required for all workflows 33 | security-events: write 34 | 35 | # required to fetch internal or private CodeQL packs 36 | packages: read 37 | 38 | # only required for workflows in private repositories 39 | actions: read 40 | contents: read 41 | 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | include: 46 | - language: actions 47 | build-mode: none 48 | - language: go 49 | build-mode: autobuild 50 | # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 51 | # Use `c-cpp` to analyze code written in C, C++ or both 52 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 53 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 54 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 55 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 56 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 57 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 58 | steps: 59 | - name: Checkout repository 60 | uses: actions/checkout@v4 61 | 62 | # Add any setup steps before running the `github/codeql-action/init` action. 63 | # This includes steps like installing compilers or runtimes (`actions/setup-node` 64 | # or others). This is typically only required for manual builds. 65 | # - name: Setup runtime (example) 66 | # uses: actions/setup-example@v1 67 | 68 | # Initializes the CodeQL tools for scanning. 69 | - name: Initialize CodeQL 70 | uses: github/codeql-action/init@v3 71 | with: 72 | languages: ${{ matrix.language }} 73 | build-mode: ${{ matrix.build-mode }} 74 | # If you wish to specify custom queries, you can do so here or in a config file. 75 | # By default, queries listed here will override any specified in a config file. 76 | # Prefix the list here with "+" to use these queries and those in the config file. 77 | 78 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 79 | # queries: security-extended,security-and-quality 80 | 81 | # If the analyze step fails for one of the languages you are analyzing with 82 | # "We were unable to automatically build your code", modify the matrix above 83 | # to set the build mode to "manual" for that language. Then modify this step 84 | # to build your code. 85 | # ℹ️ Command-line programs to run using the OS shell. 
86 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 87 | - if: matrix.build-mode == 'manual' 88 | shell: bash 89 | run: | 90 | echo 'If you are using a "manual" build mode for one or more of the' \ 91 | 'languages you are analyzing, replace this with the commands to build' \ 92 | 'your code, for example:' 93 | echo ' make bootstrap' 94 | echo ' make release' 95 | exit 1 96 | 97 | - name: Perform CodeQL Analysis 98 | uses: github/codeql-action/analyze@v3 99 | with: 100 | category: "/language:${{matrix.language}}" 101 | -------------------------------------------------------------------------------- /.github/workflows/go-ossf-slsa3-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # This workflow lets you compile your Go project using a SLSA3 compliant builder. 7 | # This workflow will generate a so-called "provenance" file describing the steps 8 | # that were performed to generate the final binary. 9 | # The project is an initiative of the OpenSSF (openssf.org) and is developed at 10 | # https://github.com/slsa-framework/slsa-github-generator. 11 | # The provenance file can be verified using https://github.com/slsa-framework/slsa-verifier. 12 | # For more information about SLSA and how it improves the supply-chain, visit slsa.dev. 13 | 14 | name: SLSA Go releaser 15 | on: 16 | workflow_dispatch: 17 | release: 18 | types: [created] 19 | 20 | permissions: read-all 21 | 22 | jobs: 23 | # ======================================================================================================================================== 24 | # Prerequisite: Create a .slsa-goreleaser.yml in the root directory of your project. 25 | # See format in https://github.com/slsa-framework/slsa-github-generator/blob/main/internal/builders/go/README.md#configuration-file 26 | #========================================================================================================================================= 27 | build: 28 | permissions: 29 | id-token: write # To sign. 30 | contents: write # To upload release assets. 31 | actions: read # To read workflow path.
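    # Illustrative sketch only (not a file in this repository): a minimal .slsa-goreleaser.yml for this
    # project might look roughly like the lines below; verify the field names against the builder README
    # linked in the prerequisite note above before relying on them.
    #   version: 1
    #   goos: linux
    #   goarch: amd64
    #   main: ./cmd/rxtls
    #   binary: rxtls
    #   env:
    #     - CGO_ENABLED=0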
32 | uses: slsa-framework/slsa-github-generator/.github/workflows/builder_go_slsa3.yml@v1.4.0 33 | with: 34 | go-version: 1.24 35 | # ============================================================================================================= 36 | # Optional: For more options, see https://github.com/slsa-framework/slsa-github-generator#golang-projects 37 | # ============================================================================================================= 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/power-ci.yaml: -------------------------------------------------------------------------------- 1 | name: PowerPC CI 2 | 3 | permissions: 4 | contents: read 5 | 6 | on: 7 | pull_request: 8 | branches: [ main ] 9 | push: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build-ppc64le: 14 | name: Build on ppc64le 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Go 21 | uses: actions/setup-go@v4 22 | with: 23 | go-version: 1.22 24 | 25 | - name: Install QEMU 26 | run: sudo apt-get install -y qemu-user-static 27 | 28 | - name: Cross-compile to ppc64le 29 | run: | 30 | GOARCH=ppc64le GOOS=linux go build -v ./... 31 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: write 10 | packages: write 11 | id-token: write 12 | 13 | jobs: 14 | release: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Set up Go 23 | uses: actions/setup-go@v5 24 | with: 25 | go-version: '1.24' 26 | 27 | - name: Set up QEMU 28 | uses: docker/setup-qemu-action@v3 29 | 30 | - name: Set up Docker Buildx 31 | uses: docker/setup-buildx-action@v3 32 | 33 | - name: Log in to GitHub Container Registry 34 | uses: docker/login-action@v3 35 | with: 36 | registry: ghcr.io 37 | username: ${{ github.actor }} 38 | password: ${{ secrets.GITHUB_TOKEN }} 39 | 40 | - name: Run GoReleaser 41 | uses: goreleaser/goreleaser-action@v6 42 | with: 43 | version: latest 44 | args: release --clean 45 | env: 46 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 47 | HOMEBREW_TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | output/certs/* 2 | output/domains/* 3 | 4 | # GoReleaser artifacts 5 | dist/ 6 | *.snap 7 | 8 | # Binary 9 | rxtls 10 | 11 | # IDE 12 | .vscode/ 13 | .idea/ 14 | 15 | # Test coverage 16 | *.out 17 | coverage.html 18 | 19 | # Temporary files 20 | *.tmp 21 | *.log 22 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | # golangci-lint configuration for rxtls 2 | # This config enables comprehensive linting with focus on security and code quality 3 | 4 | run: 5 | # Timeout for analysis 6 | timeout: 5m 7 | 8 | # Include test files 9 | tests: true 10 | 11 | # Skip directories 12 | skip-dirs: 13 | - vendor 14 | - third_party 15 | - testdata 16 | - examples 17 | - dist 18 | 19 | # Skip files 20 | skip-files: 21 | - ".*\\.pb\\.go$" 22 | - ".*\\.gen\\.go$" 23 | 24 | # Output 
configuration 25 | output: 26 | # Format of the output 27 | format: colored-line-number 28 | 29 | # Print lines of code with issue 30 | print-issued-lines: true 31 | 32 | # Print linter name in the end of issue text 33 | print-linter-name: true 34 | 35 | linters: 36 | enable: 37 | # Security 38 | - gosec # Security checker 39 | - exportloopref # Checks for pointers to enclosing loop variables 40 | 41 | # Bug detection 42 | - staticcheck # Advanced static analysis 43 | - bodyclose # Checks HTTP response body is closed 44 | - nilerr # Finds code returning nil even if it checks for error 45 | - errcheck # Checks for unchecked errors 46 | - ineffassign # Detects ineffectual assignments 47 | 48 | # Code quality 49 | - revive # Fast, configurable linter 50 | - govet # Reports suspicious constructs 51 | - gofmt # Checks formatting 52 | - goimports # Checks imports formatting 53 | - misspell # Finds misspelled words 54 | - unconvert # Removes unnecessary type conversions 55 | - prealloc # Finds slice declarations that could be preallocated 56 | - nakedret # Finds naked returns in long functions 57 | 58 | # Style 59 | - gocritic # Highly extensible Go source code linter 60 | - gocyclo # Checks cyclomatic complexity 61 | - gocognit # Checks cognitive complexity 62 | - funlen # Checks function length 63 | - lll # Reports long lines 64 | 65 | # Performance 66 | - goconst # Finds repeated strings that could be constants 67 | - gosimple # Simplifies code 68 | 69 | # Best practices 70 | - unparam # Reports unused function parameters 71 | - dogsled # Checks assignments with too many blank identifiers 72 | - dupl # Checks for duplicated code 73 | - godox # Detects TODO/FIXME/etc comments 74 | 75 | disable: 76 | - gomnd # Magic number detector - too noisy for CT log processing 77 | - wsl # Whitespace linter - too strict 78 | - nlreturn # Too strict about newlines 79 | - gochecknoinits # We might need init functions 80 | - gochecknoglobals # We use some globals for configuration 81 | 82 | linters-settings: 83 | # Security 84 | gosec: 85 | severity: "high" 86 | confidence: "medium" 87 | # Rules to exclude 88 | excludes: 89 | - G104 # Unhandled errors - we use errcheck for this 90 | - G304 # File path provided as taint input - we need this for user-specified paths 91 | config: 92 | global: 93 | audit: true 94 | 95 | # Code quality 96 | revive: 97 | severity: warning 98 | enable-all-rules: false 99 | rules: 100 | - name: blank-imports 101 | - name: context-as-argument 102 | - name: context-keys-type 103 | - name: dot-imports 104 | - name: error-return 105 | - name: error-strings 106 | - name: error-naming 107 | - name: exported 108 | - name: if-return 109 | - name: increment-decrement 110 | - name: var-naming 111 | - name: var-declaration 112 | - name: package-comments 113 | - name: range 114 | - name: receiver-naming 115 | - name: time-naming 116 | - name: unexported-return 117 | - name: indent-error-flow 118 | - name: errorf 119 | - name: empty-block 120 | - name: superfluous-else 121 | - name: unused-parameter 122 | - name: unreachable-code 123 | - name: redefines-builtin-id 124 | 125 | # Bug detection 126 | staticcheck: 127 | checks: ["all", "-ST1000", "-ST1003", "-ST1016"] 128 | 129 | errcheck: 130 | # Report about not checking errors in type assertions 131 | check-type-assertions: true 132 | # Report about assignment of errors to blank identifier 133 | check-blank: true 134 | 135 | # Style 136 | gocritic: 137 | enabled-tags: 138 | - diagnostic 139 | - performance 140 | - style 141 | - opinionated 142 | 
disabled-checks: 143 | - dupImport # Already covered by goimports 144 | - ifElseChain # Sometimes if-else is clearer 145 | - octalLiteral # We don't use octal 146 | - whyNoLint # We might need nolint sometimes 147 | - wrapperFunc # Too restrictive 148 | 149 | gocyclo: 150 | # Minimal cyclomatic complexity to report 151 | min-complexity: 15 152 | 153 | gocognit: 154 | # Minimal cognitive complexity to report 155 | min-complexity: 20 156 | 157 | funlen: 158 | lines: 100 159 | statements: 50 160 | 161 | lll: 162 | line-length: 140 # Slightly more than default 120 163 | 164 | # Performance 165 | goconst: 166 | min-len: 3 167 | min-occurrences: 3 168 | 169 | prealloc: 170 | # Report preallocation suggestions only on simple loops 171 | simple: true 172 | range-loops: true 173 | for-loops: true 174 | 175 | # Best practices 176 | dogsled: 177 | # Maximum number of blank identifiers 178 | max-blank-identifiers: 2 179 | 180 | dupl: 181 | # Minimum lines to consider as duplicate 182 | threshold: 150 183 | 184 | godox: 185 | keywords: 186 | - TODO 187 | - FIXME 188 | - BUG 189 | - HACK 190 | - XXX 191 | 192 | issues: 193 | # Excluding configuration per-path, per-linter, per-text and per-source 194 | exclude-rules: 195 | # Exclude some linters from running on tests files 196 | - path: _test\.go 197 | linters: 198 | - gocyclo 199 | - errcheck 200 | - dupl 201 | - gosec 202 | - funlen 203 | 204 | # Exclude known issues in main.go (legacy code) 205 | - path: cmd/rxtls/main.go 206 | linters: 207 | - funlen 208 | - gocyclo 209 | - gocognit 210 | 211 | # Allow TODO comments in certain files 212 | - path: "(.*)?TODO(.*)?go" 213 | linters: 214 | - godox 215 | 216 | # Exclude vendor, if any 217 | - path: vendor/ 218 | linters: [all] 219 | 220 | # Maximum issues count per one linter 221 | max-issues-per-linter: 50 222 | 223 | # Maximum count of issues with the same text 224 | max-same-issues: 3 225 | 226 | # Show only new issues 227 | new: false 228 | 229 | severity: 230 | # Default value is empty string 231 | default-severity: warning 232 | 233 | # If set to true, the severity-rules regular expressions become case-sensitive 234 | case-sensitive: false 235 | 236 | rules: 237 | - linters: 238 | - gosec 239 | severity: error 240 | - linters: 241 | - staticcheck 242 | - errcheck 243 | - bodyclose 244 | severity: error 245 | - linters: 246 | - revive 247 | - govet 248 | severity: warning -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=https://goreleaser.com/static/schema.json 2 | version: 2 3 | 4 | before: 5 | hooks: 6 | - go mod download 7 | - go mod verify 8 | 9 | builds: 10 | - main: ./cmd/rxtls 11 | binary: rxtls 12 | id: rxtls 13 | env: 14 | - CGO_ENABLED=0 15 | goos: [windows, linux, darwin, freebsd] 16 | goarch: [amd64, '386', arm, arm64] 17 | goarm: ['6', '7'] 18 | ignore: 19 | - goos: windows 20 | goarch: arm 21 | - goos: windows 22 | goarch: arm64 23 | - goos: darwin 24 | goarch: '386' 25 | - goos: darwin 26 | goarch: arm 27 | - goos: freebsd 28 | goarch: arm 29 | flags: 30 | - -trimpath 31 | ldflags: 32 | - -s -w 33 | - -X main.version={{.Version}} 34 | - -X main.commit={{.ShortCommit}} 35 | - -X main.date={{.Date}} 36 | - -X main.builtBy=goreleaser 37 | 38 | archives: 39 | - id: rxtls 40 | ids: [rxtls] 41 | name_template: '{{ .ProjectName }}_{{ .Version }}_{{ if eq .Os "darwin" }}macOS{{ else }}{{ .Os }}{{ end }}_{{ .Arch }}{{ if .Arm 
}}v{{ .Arm }}{{ end }}' 42 | format_overrides: 43 | - goos: windows 44 | formats: [zip] 45 | files: 46 | - LICENSE 47 | - README.md 48 | 49 | checksum: 50 | name_template: '{{ .ProjectName }}_{{ .Version }}_checksums.txt' 51 | algorithm: sha256 52 | 53 | snapshot: 54 | version_template: '{{ incpatch .Version }}-dev' 55 | 56 | changelog: 57 | sort: asc 58 | use: github 59 | filters: 60 | exclude: 61 | - '^docs:' 62 | - '^test:' 63 | - '^chore:' 64 | - typo 65 | - Merge pull request 66 | - Merge branch 67 | groups: 68 | - title: 'New Features' 69 | regexp: '^.*?feat(\([[:word:]]+\))??!?:.+$' 70 | order: 0 71 | - title: 'Bug Fixes' 72 | regexp: '^.*?fix(\([[:word:]]+\))??!?:.+$' 73 | order: 1 74 | - title: 'Performance Improvements' 75 | regexp: '^.*?perf(\([[:word:]]+\))??!?:.+$' 76 | order: 2 77 | - title: 'Code Refactoring' 78 | regexp: '^.*?refactor(\([[:word:]]+\))??!?:.+$' 79 | order: 3 80 | - title: Other 81 | order: 999 82 | 83 | release: 84 | github: 85 | owner: x-stp 86 | name: rxtls 87 | prerelease: auto 88 | draft: false 89 | name_template: '{{ .Tag }}' 90 | header: | 91 | ## rxtls {{ .Tag }} 92 | 93 | High-Performance Certificate Transparency Processor 94 | footer: | 95 | ## Installation 96 | 97 | ### Binary 98 | Download the appropriate binary for your platform from the assets below. 99 | 100 | ### Homebrew (macOS/Linux) 101 | ```bash 102 | brew tap x-stp/rxtls 103 | brew install rxtls 104 | ``` 105 | 106 | ### Docker 107 | ```bash 108 | docker pull ghcr.io/x-stp/rxtls:{{ .Tag }} 109 | ``` 110 | 111 | **Full documentation**: https://github.com/x-stp/rxtls#readme 112 | 113 | dockers: 114 | - image_templates: 115 | - 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-amd64' 116 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-amd64' 117 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-amd64' 118 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-amd64' 119 | dockerfile: Dockerfile.goreleaser 120 | use: buildx 121 | build_flag_templates: 122 | - '--pull' 123 | - '--platform=linux/amd64' 124 | - '--label=org.opencontainers.image.created={{ .Date }}' 125 | - '--label=org.opencontainers.image.title={{ .ProjectName }}' 126 | - '--label=org.opencontainers.image.revision={{ .FullCommit }}' 127 | - '--label=org.opencontainers.image.version={{ .Version }}' 128 | - '--label=org.opencontainers.image.source=https://github.com/x-stp/rxtls' 129 | - '--label=org.opencontainers.image.licenses=AGPL-3.0' 130 | goarch: amd64 131 | goos: linux 132 | 133 | - image_templates: 134 | - 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-arm64' 135 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-arm64' 136 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-arm64' 137 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-arm64' 138 | dockerfile: Dockerfile.goreleaser 139 | use: buildx 140 | build_flag_templates: 141 | - '--pull' 142 | - '--platform=linux/arm64' 143 | - '--label=org.opencontainers.image.created={{ .Date }}' 144 | - '--label=org.opencontainers.image.title={{ .ProjectName }}' 145 | - '--label=org.opencontainers.image.revision={{ .FullCommit }}' 146 | - '--label=org.opencontainers.image.version={{ .Version }}' 147 | - '--label=org.opencontainers.image.source=https://github.com/x-stp/rxtls' 148 | - '--label=org.opencontainers.image.licenses=AGPL-3.0' 149 | goarch: arm64 150 | goos: linux 151 | 152 | docker_manifests: 153 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}' 154 | image_templates: 155 | - 'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-amd64' 156 | - 
'ghcr.io/x-stp/{{ .ProjectName }}:{{ .Tag }}-arm64' 157 | 158 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}' 159 | image_templates: 160 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-amd64' 161 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}.{{ .Minor }}-arm64' 162 | 163 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}' 164 | image_templates: 165 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-amd64' 166 | - 'ghcr.io/x-stp/{{ .ProjectName }}:v{{ .Major }}-arm64' 167 | 168 | - name_template: 'ghcr.io/x-stp/{{ .ProjectName }}:latest' 169 | image_templates: 170 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-amd64' 171 | - 'ghcr.io/x-stp/{{ .ProjectName }}:latest-arm64' 172 | 173 | homebrew_casks: 174 | - repository: 175 | owner: x-stp 176 | name: homebrew-rxtls 177 | token: '{{ .Env.HOMEBREW_TAP_TOKEN }}' 178 | name: rxtls 179 | directory: Casks 180 | homepage: 'https://github.com/x-stp/rxtls' 181 | description: 'High-Performance Certificate Transparency Processor' 182 | license: 'AGPL-3.0' 183 | conflicts: 184 | - formula: rxtls 185 | 186 | nfpms: 187 | - id: rxtls 188 | package_name: rxtls 189 | formats: 190 | - deb 191 | - rpm 192 | - apk 193 | vendor: 'x-stp' 194 | homepage: 'https://github.com/x-stp/rxtls' 195 | maintainer: 'Pepijn van der Stap ' 196 | description: 'High-Performance Certificate Transparency Processor' 197 | license: 'AGPL-3.0' 198 | dependencies: 199 | - ca-certificates 200 | section: net 201 | priority: optional 202 | contents: 203 | - src: ./LICENSE 204 | dst: /usr/share/doc/rxtls/LICENSE 205 | - src: ./README.md 206 | dst: /usr/share/doc/rxtls/README.md -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage with multi-arch support 2 | FROM --platform=$BUILDPLATFORM golang:1.24-alpine AS builder 3 | 4 | # Build arguments for cross-compilation 5 | ARG TARGETOS 6 | ARG TARGETARCH 7 | ARG TARGETVARIANT 8 | 9 | # Install git and ca-certificates 10 | RUN apk add --no-cache git ca-certificates 11 | 12 | # Set working directory 13 | WORKDIR /build 14 | 15 | # Copy go mod files 16 | COPY go.mod go.sum ./ 17 | 18 | # Download dependencies 19 | RUN go mod download 20 | 21 | # Copy source code 22 | COPY . . 
23 | 24 | # Build the binary for target architecture 25 | RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \ 26 | -ldflags="-w -s" \ 27 | -o rxtls \ 28 | ./cmd/rxtls 29 | 30 | # Final stage - use scratch for minimal image 31 | FROM scratch 32 | 33 | # Copy ca-certificates from builder 34 | COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ 35 | 36 | # Copy the binary 37 | COPY --from=builder /build/rxtls /usr/local/bin/rxtls 38 | 39 | # Set entrypoint 40 | 41 | ENTRYPOINT ["/usr/local/bin/rxtls"] 42 | -------------------------------------------------------------------------------- /Dockerfile.goreleaser: -------------------------------------------------------------------------------- 1 | # Dockerfile for GoReleaser 2 | # This is a minimal Dockerfile that only copies the pre-built binary 3 | FROM scratch 4 | 5 | # Copy ca-certificates for HTTPS connections 6 | COPY --from=alpine:latest /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ 7 | 8 | # Copy the pre-built binary from GoReleaser 9 | COPY rxtls /usr/local/bin/rxtls 10 | 11 | # Set entrypoint 12 | ENTRYPOINT ["/usr/local/bin/rxtls"] -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for rxtls 2 | 3 | # Variables 4 | BINARY_NAME := rxtls 5 | MAIN_PATH := ./cmd/rxtls 6 | BUILD_DIR := ./dist 7 | COVERAGE_FILE := coverage.out 8 | 9 | # Go parameters 10 | GOCMD := go 11 | GOBUILD := $(GOCMD) build 12 | GOCLEAN := $(GOCMD) clean 13 | GOTEST := $(GOCMD) test 14 | GOGET := $(GOCMD) get 15 | GOMOD := $(GOCMD) mod 16 | GOFMT := gofmt 17 | GOVET := $(GOCMD) vet 18 | 19 | # Build flags 20 | LDFLAGS := -s -w 21 | BUILD_FLAGS := -trimpath -ldflags "$(LDFLAGS)" 22 | 23 | # Linting 24 | GOLANGCI_LINT_VERSION := v1.54.2 25 | GOLANGCI_LINT := $(shell which golangci-lint 2> /dev/null) 26 | 27 | .PHONY: all build clean test lint lint-install lint-fix security fmt vet tidy help 28 | 29 | # Default target 30 | all: lint test build 31 | 32 | # Build the binary 33 | build: 34 | @echo "Building $(BINARY_NAME)..." 35 | @$(GOBUILD) $(BUILD_FLAGS) -o $(BINARY_NAME) $(MAIN_PATH) 36 | @echo "Build complete: ./$(BINARY_NAME)" 37 | 38 | # Clean build artifacts 39 | clean: 40 | @echo "Cleaning..." 41 | @$(GOCLEAN) 42 | @rm -f $(BINARY_NAME) 43 | @rm -rf $(BUILD_DIR) 44 | @rm -f $(COVERAGE_FILE) 45 | @echo "Clean complete" 46 | 47 | # Run tests 48 | test: 49 | @echo "Running tests..." 50 | @$(GOTEST) -v -race -cover ./... 51 | 52 | # Run tests with coverage 53 | test-coverage: 54 | @echo "Running tests with coverage..." 55 | @$(GOTEST) -v -race -coverprofile=$(COVERAGE_FILE) -covermode=atomic ./... 56 | @$(GOCMD) tool cover -html=$(COVERAGE_FILE) -o coverage.html 57 | @echo "Coverage report generated: coverage.html" 58 | 59 | # Install golangci-lint if not present 60 | lint-install: 61 | ifndef GOLANGCI_LINT 62 | @echo "Installing golangci-lint $(GOLANGCI_LINT_VERSION)..." 63 | @curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin $(GOLANGCI_LINT_VERSION) 64 | else 65 | @echo "golangci-lint is already installed at $(GOLANGCI_LINT)" 66 | endif 67 | 68 | # Run linters 69 | lint: lint-install 70 | @echo "Running linters..." 71 | @golangci-lint run --timeout=5m ./... 72 | 73 | # Run linters and fix issues where possible 74 | lint-fix: lint-install 75 | @echo "Running linters with auto-fix..." 
76 | @golangci-lint run --fix --timeout=5m ./... 77 | 78 | # Run security-focused linters only 79 | security: lint-install 80 | @echo "Running security checks..." 81 | @golangci-lint run --disable-all --enable=gosec,exportloopref,bodyclose --timeout=5m ./... 82 | 83 | # Run gosec directly with more detailed output 84 | gosec: 85 | @echo "Running gosec security scanner..." 86 | @gosec -fmt=json -out=gosec-report.json -stdout -verbose=text -severity=medium ./... || true 87 | @echo "Security report saved to gosec-report.json" 88 | 89 | # Format code 90 | fmt: 91 | @echo "Formatting code..." 92 | @$(GOFMT) -s -w . 93 | @$(GOCMD) fmt ./... 94 | 95 | # Run go vet 96 | vet: 97 | @echo "Running go vet..." 98 | @$(GOVET) ./... 99 | 100 | # Tidy dependencies 101 | tidy: 102 | @echo "Tidying dependencies..." 103 | @$(GOMOD) tidy 104 | @$(GOMOD) verify 105 | 106 | # Quick check - format, vet, and lint 107 | check: fmt vet lint 108 | 109 | # CI/CD oriented target - strict checking 110 | ci: tidy fmt vet lint test 111 | 112 | # Install all development dependencies 113 | dev-deps: lint-install 114 | @echo "Installing development dependencies..." 115 | @$(GOGET) github.com/securego/gosec/v2/cmd/gosec@latest 116 | @echo "Development dependencies installed" 117 | 118 | # Show help 119 | help: 120 | @echo "Available targets:" 121 | @echo " make build - Build the binary" 122 | @echo " make test - Run tests" 123 | @echo " make test-coverage - Run tests with coverage report" 124 | @echo " make lint - Run all linters" 125 | @echo " make lint-fix - Run linters with auto-fix" 126 | @echo " make security - Run security-focused linters" 127 | @echo " make gosec - Run gosec security scanner" 128 | @echo " make fmt - Format code" 129 | @echo " make vet - Run go vet" 130 | @echo " make tidy - Tidy go modules" 131 | @echo " make check - Quick check (fmt, vet, lint)" 132 | @echo " make ci - Full CI check" 133 | @echo " make clean - Clean build artifacts" 134 | @echo " make dev-deps - Install development dependencies" 135 | @echo " make help - Show this help message" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rxtls - High-Performance Certificate Transparency Processor 2 | 3 | rxtls is a high-throughput, fault-tolerant Certificate Transparency log processor designed for hyperscale environments. It provides efficient processing of CT logs with dynamic backpressure handling, adaptive rate limiting, and comprehensive observability. 
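
For a first run, the flow typically looks like this (an illustrative session; the exact commands and flags are documented under Usage below):

```bash
# See which CT logs are available
rxtls list

# Extract domains from interactively selected logs into CSV files
rxtls domains --output output/domains --concurrency 4
```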
4 | 5 | ## Features 6 | 7 | - **High Throughput**: Process CT logs efficiently with a worker pool architecture 8 | - **Fault Tolerance**: Automatic retries, backpressure handling, and graceful shutdown 9 | - **Dynamic Rate Limiting**: Adaptive rate control based on success/failure patterns 10 | - **Observability**: Prometheus metrics for monitoring and alerting 11 | - **Configurable**: CLI flags for customizing behavior and CT log sources 12 | - **Versatile**: Download raw certificates or extract domains from CT logs 13 | 14 | ## Architecture 15 | 16 | The system consists of several key components: 17 | 18 | ### Scheduler 19 | - Manages a pool of workers 20 | - Distributes work using least-loaded worker selection 21 | - Implements graceful shutdown 22 | - Provides statistics and metrics 23 | 24 | ### Workers 25 | - Process work items from their queues 26 | - Implement backpressure handling 27 | - Track success/failure metrics 28 | - Support CPU affinity for optimal performance 29 | 30 | ### Rate Limiter 31 | - Dynamic rate adjustment based on success/failure 32 | - Token bucket implementation for smooth rate limiting 33 | - Backpressure integration 34 | - Atomic operations for thread safety 35 | 36 | ### Metrics 37 | - Prometheus integration for monitoring 38 | - Queue pressure tracking 39 | - Success/failure rate monitoring 40 | - Resource utilization metrics 41 | 42 | ## Usage 43 | 44 | The tool provides several subcommands: 45 | 46 | ```bash 47 | # List available CT logs 48 | rxtls list 49 | 50 | # Download certificates from CT logs 51 | rxtls download 52 | 53 | # Extract domains from certificates in CT logs 54 | rxtls domains 55 | 56 | # Fetch and save the CT logs list to a local file 57 | rxtls fetch-logs 58 | 59 | # Direct processing with URI (legacy mode) 60 | rxtls --ct-uri https://ct.example.com/log 61 | ``` 62 | 63 | ### Global Flags 64 | 65 | ```bash 66 | # Use local logs list instead of fetching from internet 67 | rxtls --local-logs [command] 68 | 69 | # Customize worker pool size 70 | rxtls --workers 8 71 | 72 | # Set initial rate limit 73 | rxtls --rate-limit 1000 74 | 75 | # Enable debug logging 76 | rxtls --debug 77 | 78 | # Configure Prometheus metrics port 79 | rxtls --metrics-port 9090 80 | ``` 81 | 82 | ### Download Command 83 | 84 | ```bash 85 | # Basic download with interactive log selection 86 | rxtls download 87 | 88 | # Specify output directory 89 | rxtls download --output /path/to/output 90 | 91 | # Configure concurrency 92 | rxtls download --concurrency 10 93 | 94 | # Adjust buffer size 95 | rxtls download --buffer 262144 96 | 97 | # Enable compression 98 | rxtls download --compress 99 | 100 | # Enable high-speed mode 101 | rxtls download --turbo 102 | ``` 103 | 104 | ### Domains Command 105 | 106 | ```bash 107 | # Basic domain extraction with interactive log selection 108 | rxtls domains 109 | 110 | # Specify output directory 111 | rxtls domains --output /path/to/domains 112 | 113 | # Configure concurrency 114 | rxtls domains --concurrency 10 115 | 116 | # Adjust buffer size 117 | rxtls domains --buffer 32768 118 | 119 | # Enable compression 120 | rxtls domains --compress 121 | 122 | # Enable high-speed mode 123 | rxtls domains --turbo 124 | ``` 125 | 126 | ## Configuration 127 | 128 | ### CLI Flags 129 | 130 | - `--ct-uri`: CT log URI to process (default: from config) 131 | - `--workers`: Number of worker goroutines (default: runtime.NumCPU()) 132 | - `--rate-limit`: Initial rate limit in requests/second (default: 100) 133 | - `--debug`: Enable debug logging 
134 | - `--metrics-port`: Prometheus metrics port (default: 9090) 135 | - `--local-logs`: Use local logs list instead of fetching from internet 136 | 137 | ### Environment Variables 138 | 139 | - `RXTLS_CONFIG`: Path to config file 140 | - `RXTLS_LOG_LEVEL`: Log level (debug, info, warn, error) 141 | - `RXTLS_METRICS_PORT`: Prometheus metrics port 142 | 143 | ## Metrics 144 | 145 | The following Prometheus metrics are exposed: 146 | 147 | - `rxtls_worker_queue_size`: Current size of worker queues 148 | - `rxtls_worker_queue_pressure`: Queue pressure (0-1) 149 | - `rxtls_worker_processed_total`: Total processed items 150 | - `rxtls_worker_errors_total`: Total errors 151 | - `rxtls_rate_limit_current`: Current rate limit 152 | - `rxtls_rate_limit_success_total`: Total successful requests 153 | - `rxtls_rate_limit_failure_total`: Total failed requests 154 | 155 | ## Development 156 | 157 | ### Prerequisites 158 | 159 | - Go 1.24 or later 160 | 161 | ### Building 162 | 163 | ```bash 164 | # Build binary 165 | go build 166 | 167 | # Run tests 168 | go test ./... 169 | 170 | # Run benchmarks 171 | go test -bench=. ./... 172 | ``` 173 | 174 | ### Testing 175 | 176 | The codebase includes comprehensive tests: 177 | 178 | - Unit tests for all components 179 | - Integration tests for the full pipeline 180 | - Benchmarks for performance testing 181 | - Race condition detection enabled 182 | 183 | ## License 184 | 185 | GNU Affero General Public License v3 - see LICENSE file for details 186 | 187 | -------------------------------------------------------------------------------- /cmd/rxtls/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package main is the entry point for the rxtls command-line application. 3 | 4 | rxtls is a tool designed for interacting with Certificate Transparency (CT) logs. 5 | Its primary functionalities include: 6 | - Listing available CT logs. 7 | - Downloading raw certificate entries (as base64 blobs) from specified CT logs. 8 | - Extracting domain names (Common Name and Subject Alternative Names) from certificate entries 9 | and saving them to CSV files. 10 | - Fetching and caching the official list of CT logs. 11 | 12 | The application uses the Cobra library for command-line interface structure and flag parsing. 13 | It leverages several internal packages: 14 | - `internal/certlib`: For CT log interaction logic, data models, and parsing certificate entries. 15 | - `internal/client`: For a configurable HTTP client used for network requests. 16 | - `internal/core`: For the core processing engine, including a concurrent scheduler, download manager, 17 | and domain extractor. 18 | - `internal/metrics`: For exposing Prometheus metrics for monitoring application performance. 19 | 20 | Global flags allow users to specify options like using a local log list cache. 21 | Subcommands (`list`, `download`, `domains`, `fetch-logs`) provide access to different functionalities, 22 | each with its own set of specific flags for configuration (e.g., output directory, concurrency). 23 | 24 | The main function initializes a Prometheus metrics server and then either processes a single CT log URI 25 | (if provided directly as a flag without a subcommand) or executes the appropriate Cobra subcommand. 26 | Graceful shutdown is handled via context cancellation triggered by OS signals (SIGINT, SIGTERM). 
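
An illustrative invocation (the flags shown are defined on the subcommands, with defaults documented there):

	rxtls domains --output output/domains --concurrency 4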
27 | */ 28 | package main 29 | 30 | /* 31 | rxtls — fast tool in Go for working with Certificate Transparency logs 32 | Copyright (C) 2025 Pepijn van der Stap 33 | 34 | This program is free software: you can redistribute it and/or modify 35 | it under the terms of the GNU Affero General Public License as published by 36 | the Free Software Foundation, either version 3 of the License, or 37 | (at your option) any later version. 38 | 39 | This program is distributed in the hope that it will be useful, 40 | but WITHOUT ANY WARRANTY; without even the implied warranty of 41 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 42 | GNU Affero General Public License for more details. 43 | 44 | You should have received a copy of the GNU Affero General Public License 45 | along with this program. If not, see . 46 | */ 47 | 48 | import ( 49 | "bufio" 50 | "context" 51 | "errors" 52 | "flag" 53 | "fmt" 54 | "io" 55 | "log" 56 | "net/http" 57 | "os" 58 | "os/signal" 59 | "runtime" 60 | "strconv" 61 | "strings" 62 | "sync" 63 | "syscall" 64 | "time" 65 | 66 | "github.com/spf13/cobra" 67 | "github.com/x-stp/rxtls/internal/certlib" 68 | "github.com/x-stp/rxtls/internal/client" 69 | "github.com/x-stp/rxtls/internal/core" 70 | "github.com/x-stp/rxtls/internal/metrics" 71 | ) 72 | 73 | // Global flags (persistent across commands) 74 | var useLocalLogs bool 75 | 76 | // Flags specific to the download command 77 | var ( 78 | outputDir string 79 | maxConcurrentLogs int 80 | bufferSize int 81 | showStats bool 82 | turbo bool 83 | compress bool 84 | logsFile string // Added for fetch-logs command 85 | ctURI = flag.String("ct-uri", "", "CT log URI to process (overrides config)") 86 | workers = flag.Int("workers", runtime.NumCPU(), "Number of worker goroutines") 87 | rateLimit = flag.Float64("rate-limit", 100, "Initial rate limit in requests/second") 88 | debug = flag.Bool("debug", false, "Enable debug logging") 89 | metricsPort = flag.Int("metrics-port", 9090, "Prometheus metrics port") 90 | ) 91 | 92 | var rootCmd = &cobra.Command{ 93 | Use: "rxtls", 94 | Short: "rxtls - A Certificate Transparency Log (domain/b64 blob) downloader and processor", 95 | PersistentPreRun: func(cmd *cobra.Command, args []string) { 96 | // Enable local logs if requested (applies to all commands) 97 | if useLocalLogs { 98 | certlib.UseLocalLogs = true 99 | log.Println("Using local logs list enabled.") 100 | } 101 | }, 102 | } 103 | 104 | var listCmd = &cobra.Command{ 105 | Use: "list", 106 | Short: "List all available Certificate Transparency logs", 107 | Run: func(cmd *cobra.Command, args []string) { 108 | listLogs() 109 | }, 110 | } 111 | 112 | var downloadCmd = &cobra.Command{ 113 | Use: "download", 114 | Short: "Download certificates (full B64 blob) from selected CT logs", 115 | Run: func(cmd *cobra.Command, args []string) { 116 | // Flags are parsed by Cobra and available via the variables 117 | downloadLogs(outputDir, maxConcurrentLogs, bufferSize, showStats, compress, turbo) 118 | }, 119 | } 120 | 121 | var domainsCmd = &cobra.Command{ 122 | Use: "domains", 123 | Short: "Extract domains from selected CT logs and save to CSV", 124 | Long: `Extracts domains (CN and SANs) from certificates found in selected CT logs. 
Output is a CSV file per log with format: offset,cn,primary_domain,all_domains_json,country,org,issuer_cn,domain_org_hash`, 125 | Run: func(cmd *cobra.Command, args []string) { 126 | // Call the new core function for domain extraction 127 | extractDomains(outputDir, maxConcurrentLogs, bufferSize, showStats, turbo, compress) 128 | }, 129 | } 130 | 131 | var fetchLogsCmd = &cobra.Command{ 132 | Use: "fetch-logs", 133 | Short: "Fetch and save the CT logs list to a local file", 134 | Run: func(cmd *cobra.Command, args []string) { 135 | fetchAndSaveLogs() 136 | }, 137 | } 138 | 139 | func init() { 140 | // Persistent flags (available for all commands) 141 | rootCmd.PersistentFlags().BoolVar(&useLocalLogs, "local-logs", false, "Use local all_logs_list.json instead of fetching from internet") 142 | 143 | // Flags for the download command 144 | downloadCmd.Flags().StringVarP(&outputDir, "output", "o", "output/certs", "Output directory for certificate blobs") 145 | downloadCmd.Flags().IntVarP(&maxConcurrentLogs, "concurrency", "c", 0, "Maximum number of concurrent logs to process (0 for auto based on CPU)") 146 | downloadCmd.Flags().IntVarP(&bufferSize, "buffer", "b", core.DefaultDiskBufferSize, "Internal buffer size in bytes for disk I/O") 147 | downloadCmd.Flags().BoolVarP(&showStats, "stats", "s", true, "Show statistics during processing") 148 | downloadCmd.Flags().BoolVar(&compress, "compress", false, "Compress output CSV files") 149 | downloadCmd.Flags().BoolVar(&turbo, "turbo", false, "Enable high-speed mode (DNS prewarm, persistent connections)") 150 | 151 | // Flags for the domains command (sharing some with download) 152 | domainsCmd.Flags().StringVarP(&outputDir, "output", "o", "output/domains", "Output directory for domain CSV files") // Default to subfolder 153 | domainsCmd.Flags().IntVarP(&maxConcurrentLogs, "concurrency", "c", 0, "Maximum number of concurrent logs to process (0 for auto based on CPU)") 154 | domainsCmd.Flags().IntVarP(&bufferSize, "buffer", "b", 32768, "Internal buffer size in bytes") 155 | domainsCmd.Flags().BoolVarP(&showStats, "stats", "s", true, "Show statistics during processing") 156 | domainsCmd.Flags().BoolVar(&turbo, "turbo", false, "Enable high-speed mode (DNS prewarm, persistent connections)") // Added turbo flag 157 | domainsCmd.Flags().BoolVar(&compress, "compress", false, "Compress output CSV files") 158 | 159 | // Flags for the fetch-logs command 160 | fetchLogsCmd.Flags().StringVarP(&logsFile, "output", "o", certlib.LocalLogsFile, "Output file for CT logs list") 161 | 162 | // Add subcommands to the root command 163 | rootCmd.AddCommand(listCmd) 164 | rootCmd.AddCommand(downloadCmd) 165 | rootCmd.AddCommand(domainsCmd) 166 | rootCmd.AddCommand(fetchLogsCmd) 167 | } 168 | 169 | func main() { 170 | flag.Parse() 171 | 172 | // Initialize metrics 173 | metrics.EnableMetrics() 174 | if err := metrics.StartMetricsServer(fmt.Sprintf(":%d", *metricsPort)); err != nil { 175 | log.Fatalf("Failed to start metrics server: %v", err) 176 | } 177 | 178 | // Only process -ct-uri directly if specified and no cobra command is used 179 | if *ctURI != "" && len(os.Args) == 1 { 180 | // Create output directory 181 | if err := os.MkdirAll(outputDir, 0755); err != nil { 182 | log.Fatalf("Failed to create output directory: %v", err) 183 | } 184 | 185 | // Create scheduler 186 | ctx := context.Background() 187 | scheduler, err := core.NewScheduler(ctx) 188 | if err != nil { 189 | log.Fatalf("Failed to create scheduler: %v", err) 190 | } 191 | defer scheduler.Shutdown() 192 | 
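		// processCTLog (below) walks the log in fixed-size batches of 1,000 entries, submitting one
		// work item per batch to the scheduler; scheduler.Wait() afterwards blocks until all batches drain.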
193 | // Process CT log 194 | if err := processCTLog(ctx, *ctURI, scheduler); err != nil { 195 | log.Fatalf("Error processing CT log: %v", err) 196 | } 197 | 198 | // Wait for all work to complete 199 | scheduler.Wait() 200 | } else { 201 | // Execute cobra command 202 | if err := rootCmd.Execute(); err != nil { 203 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) 204 | os.Exit(1) 205 | } 206 | } 207 | } 208 | 209 | func processCTLog(ctx context.Context, uri string, scheduler *core.Scheduler) error { 210 | // Create log info 211 | logInfo := &certlib.CTLogInfo{ 212 | URL: uri, 213 | } 214 | 215 | // Get log info 216 | if err := certlib.GetLogInfo(logInfo); err != nil { 217 | return err 218 | } 219 | 220 | // Process entries in batches 221 | batchSize := 1000 222 | for start := 0; start < int(logInfo.TreeSize); start += batchSize { 223 | end := min(start+batchSize, int(logInfo.TreeSize)) 224 | 225 | // Submit work for this batch 226 | err := scheduler.SubmitWork(ctx, logInfo, int64(start), int64(end), func(item *core.WorkItem) error { 227 | // Process entries in this batch 228 | entries, err := certlib.DownloadEntries(ctx, logInfo, int(item.Start), int(item.End)) 229 | if err != nil { 230 | return err 231 | } 232 | 233 | // Process each entry 234 | for _, entry := range entries.Entries { 235 | // Parse certificate data 236 | certData, err := certlib.ParseCertificateEntry(entry.LeafInput, entry.ExtraData, logInfo.URL) 237 | if err != nil { 238 | log.Printf("Error parsing certificate entry: %v", err) 239 | continue 240 | } 241 | 242 | // Write domains to file 243 | if len(certData.AllDomains) > 0 { 244 | // Create domains file for this batch 245 | domainsFile := outputDir + "/domains_" + logInfo.URL + "_" + strconv.FormatInt(item.Start, 10) + "_" + strconv.FormatInt(item.End, 10) + ".txt" 246 | f, err := os.OpenFile(domainsFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 247 | if err != nil { 248 | log.Printf("Error opening domains file: %v", err) 249 | continue 250 | } 251 | 252 | // Write domains 253 | for _, domain := range certData.AllDomains { 254 | if _, err := f.WriteString(domain + "\n"); err != nil { 255 | log.Printf("Error writing domain: %v", err) 256 | } 257 | } 258 | 259 | f.Close() 260 | } 261 | 262 | // Write certificate data 263 | if certData.AsDER != "" { 264 | // Create certificates file for this batch 265 | certsFile := outputDir + "/certs_" + logInfo.URL + "_" + strconv.FormatInt(item.Start, 10) + "_" + strconv.FormatInt(item.End, 10) + ".pem" 266 | f, err := os.OpenFile(certsFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 267 | if err != nil { 268 | log.Printf("Error opening certificates file: %v", err) 269 | continue 270 | } 271 | 272 | // Write certificate 273 | if _, err := f.WriteString("-----BEGIN CERTIFICATE-----\n"); err != nil { 274 | log.Printf("Error writing certificate header: %v", err) 275 | } 276 | if _, err := f.WriteString(certData.AsDER); err != nil { 277 | log.Printf("Error writing certificate data: %v", err) 278 | } 279 | if _, err := f.WriteString("\n-----END CERTIFICATE-----\n"); err != nil { 280 | log.Printf("Error writing certificate footer: %v", err) 281 | } 282 | 283 | f.Close() 284 | } 285 | } 286 | 287 | return nil 288 | }) 289 | 290 | if err != nil { 291 | log.Printf("Error submitting work for batch %d-%d: %v", start, end, err) 292 | continue 293 | } 294 | } 295 | 296 | return nil 297 | } 298 | 299 | func listLogs() { 300 | logs, err := core.ListCTLogs() 301 | if err != nil { 302 | log.Fatalf("Error listing CT logs: %v", err) 303 | } 304 | 305 | // 
Display each log 306 | for _, logEntry := range logs { // Renamed log to logEntry to avoid conflict with log package 307 | fmt.Printf("%s\n", logEntry.Description) 308 | fmt.Printf(" \\- URL: %s\n", logEntry.URL) 309 | fmt.Printf(" \\- Owner: %s\n", logEntry.OperatedBy) 310 | fmt.Printf(" \\- State: %s\n", getLogState(logEntry)) 311 | fmt.Println() 312 | } 313 | 314 | // Print final count 315 | fmt.Printf("Found %d Certificate Transparency Logs\n", len(logs)) 316 | } 317 | 318 | func getLogState(logInfo certlib.CTLogInfo) string { // Renamed log to logInfo 319 | // Get log info to determine state 320 | if err := certlib.GetLogInfo(&logInfo); err != nil { 321 | return "Unknown (error getting info)" 322 | } 323 | 324 | if logInfo.TreeSize == 0 { 325 | return "Empty" 326 | } 327 | 328 | return fmt.Sprintf("Active (%d certificates)", logInfo.TreeSize) 329 | } 330 | 331 | // downloadLogs is the handler for the 'download' command. 332 | func downloadLogs(outputDir string, maxConcurrentLogs int, bufferSize int, showStats bool, compress bool, turbo bool) { 333 | log.Printf("Starting certificate download: output='%s', concurrency=%d, buffer=%d, stats=%t, compress=%t, turbo=%t", 334 | outputDir, maxConcurrentLogs, bufferSize, showStats, compress, turbo) 335 | 336 | // Initialize HTTP client with turbo mode if requested 337 | if turbo { 338 | log.Println("Enabling turbo mode for HTTP client") 339 | client.ConfigureTurboMode() 340 | } 341 | 342 | // 1. List logs for selection 343 | allLogs, err := core.ListCTLogs() 344 | if err != nil { 345 | log.Fatalf("Error listing CT logs for selection: %v", err) 346 | } 347 | if len(allLogs) == 0 { 348 | log.Fatalf("No CT logs found to select from.") 349 | } 350 | 351 | // 2. Display and prompt for selection 352 | fmt.Println("Available Certificate Transparency Logs:") 353 | for i, lg := range allLogs { 354 | fmt.Printf(" [%d] %s (%s)\n", i+1, lg.Description, lg.URL) 355 | } 356 | fmt.Println(" [all] Download from all logs") 357 | fmt.Print("Enter log number(s) to download from (e.g., 1,3,5 or all): ") 358 | reader := bufio.NewReader(os.Stdin) 359 | input, _ := reader.ReadString('\n') 360 | input = strings.TrimSpace(input) 361 | var selectedLogs []certlib.CTLogInfo 362 | if strings.ToLower(input) == "all" { 363 | selectedLogs = allLogs 364 | fmt.Println("Selected all logs for download.") 365 | } else { 366 | parts := strings.Split(input, ",") 367 | selectedIndices := make(map[int]bool) 368 | for _, part := range parts { 369 | indexStr := strings.TrimSpace(part) 370 | if indexStr == "" { 371 | continue 372 | } 373 | index, err := strconv.Atoi(indexStr) 374 | if err != nil || index < 1 || index > len(allLogs) { 375 | log.Fatalf("Invalid input: %q is not a valid number in the range 1-%d", indexStr, len(allLogs)) 376 | } 377 | if !selectedIndices[index-1] { 378 | selectedLogs = append(selectedLogs, allLogs[index-1]) 379 | selectedIndices[index-1] = true 380 | } 381 | } 382 | if len(selectedLogs) == 0 { 383 | log.Fatalf("No valid logs selected.") 384 | } 385 | fmt.Printf("Selected %d log(s) for download.\n", len(selectedLogs)) 386 | } 387 | // ---------------------------------------------- 388 | 389 | // 3. 
Create and run the download manager 390 | log.Printf("Starting download for %d selected logs...", len(selectedLogs)) 391 | ctx, cancel := context.WithCancel(context.Background()) 392 | defer cancel() 393 | 394 | // Setup signal handling for graceful shutdown 395 | signalChan := make(chan os.Signal, 1) 396 | signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) 397 | go func() { 398 | <-signalChan 399 | log.Println("Interrupt received, initiating graceful shutdown...") 400 | cancel() 401 | }() 402 | 403 | // Create the download manager 404 | config := &core.DownloadConfig{ 405 | OutputDir: outputDir, 406 | BufferSize: bufferSize, 407 | MaxConcurrentLogs: maxConcurrentLogs, 408 | CompressOutput: compress, 409 | } 410 | 411 | // 4. Create and Run the Download Manager 412 | downloader, errManager := core.NewDownloadManager(ctx, config) // Renamed err to errManager 413 | if errManager != nil { 414 | log.Fatalf("Failed to create download manager: %v", errManager) 415 | } 416 | 417 | // 5. Launch Stats Display Goroutine (if enabled) 418 | var statsWg sync.WaitGroup 419 | if showStats { 420 | statsWg.Add(1) 421 | go func() { 422 | defer statsWg.Done() 423 | displayDownloadStats(ctx, downloader) // Swapped order 424 | }() 425 | } 426 | 427 | // 6. Start Download Process (BLOCKING) 428 | if err := downloader.DownloadCertificates(selectedLogs); err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, core.ErrDownloadCancelled) { 429 | log.Printf("Error during certificate download: %v", err) 430 | } 431 | log.Println("Main download process finished or cancelled.") 432 | 433 | // 7. Ensure stats goroutine finishes 434 | if showStats { 435 | log.Println("Waiting for statistics display to finish...") 436 | cancel() // Ensure context is cancelled 437 | statsWg.Wait() 438 | } 439 | 440 | // 8. Display Final Stats 441 | displayFinalDownloadStats(downloader) 442 | log.Println("Certificate download command complete.") 443 | } 444 | 445 | // displayDownloadStats periodically shows download progress. 446 | // ctx should be the first parameter for consistency with Go conventions. 447 | func displayDownloadStats(ctx context.Context, downloader *core.DownloadManager) { 448 | ticker := time.NewTicker(time.Second * 2) 449 | defer ticker.Stop() 450 | startTime := downloader.GetStats().StartTime 451 | log.Println("Starting download statistics display...") 452 | for { 453 | select { 454 | case <-ticker.C: 455 | stats := downloader.GetStats() 456 | elapsed := time.Since(startTime).Seconds() 457 | if elapsed < 0.1 { 458 | elapsed = 0.1 459 | } 460 | processedEntries := stats.ProcessedEntries.Load() 461 | totalEntries := stats.TotalEntries.Load() 462 | failedEntries := stats.FailedEntries.Load() 463 | entriesPerSec := float64(processedEntries) / elapsed 464 | percentDone := 0.0 465 | if totalEntries > 0 { 466 | percentDone = float64(processedEntries+failedEntries) / float64(totalEntries) * 100 467 | } 468 | fmt.Printf("\rProcessed: %d/%d logs | Entries: %d / ~%d (%.1f%%) | Failed: %d | Rate: %.0f ent/s | Written: %.2fMB | Retries: %.2f%%", 469 | stats.ProcessedLogs.Load(), 470 | stats.TotalLogs.Load(), 471 | processedEntries, 472 | totalEntries, 473 | percentDone, 474 | failedEntries, 475 | entriesPerSec, 476 | float64(stats.OutputBytesWritten.Load())/(1024*1024), 477 | stats.GetRetryRate()*100, 478 | ) 479 | case <-ctx.Done(): 480 | fmt.Println("\nDownload stats display stopping.") 481 | return 482 | } 483 | } 484 | } 485 | 486 | // displayFinalDownloadStats shows the summary download statistics. 
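// The overall rate is processed entries divided by elapsed wall-clock time; the first-try percentage
// divides SuccessFirstTry by processedEntries+1 so that a run with zero processed entries does not divide by zero.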
487 | func displayFinalDownloadStats(downloader *core.DownloadManager) { 488 | stats := downloader.GetStats() 489 | elapsed := time.Since(stats.StartTime) 490 | processedEntries := stats.ProcessedEntries.Load() 491 | rate := 0.0 492 | if elapsed.Seconds() > 0 { 493 | rate = float64(processedEntries) / elapsed.Seconds() 494 | } 495 | fmt.Println() // Ensure stats start on a new line 496 | fmt.Printf("\n--- Final Download Statistics ---\n") 497 | fmt.Printf(" Processing Time: %v\n", elapsed.Round(time.Millisecond)) 498 | fmt.Printf(" Total Logs: %d\n", stats.TotalLogs.Load()) 499 | fmt.Printf(" Processed Logs: %d\n", stats.ProcessedLogs.Load()) 500 | fmt.Printf(" Failed Logs: %d\n", stats.FailedLogs.Load()) 501 | fmt.Printf(" Total Entries: ~%d\n", stats.TotalEntries.Load()) 502 | fmt.Printf("Processed Entries: %d (%.2f%% first try)\n", 503 | processedEntries, 504 | float64(stats.SuccessFirstTry.Load())/float64(processedEntries+1)*100) // +1 to avoid div by zero if no entries 505 | fmt.Printf(" Failed Entries: %d\n", stats.FailedEntries.Load()) 506 | fmt.Printf(" Overall Rate: %.0f entries/sec\n", rate) 507 | fmt.Printf(" Retry Rate: %.2f%% (Total Retries: %d)\n", 508 | stats.GetRetryRate()*100, stats.RetryCount.Load()) 509 | fmt.Printf(" Output Written: %.2f MB\n", float64(stats.OutputBytesWritten.Load())/(1024*1024)) 510 | fmt.Printf("-------------------------------\n") 511 | } 512 | 513 | // extractDomains is the handler for the 'domains' command. 514 | func extractDomains(outputDir string, maxConcurrentLogs int, bufferSize int, showStats bool, turbo bool, compress bool) { 515 | log.Printf("Starting domain extraction: output='%s', concurrency=%d, buffer=%d, stats=%t, turbo=%t, compress=%t", 516 | outputDir, maxConcurrentLogs, bufferSize, showStats, turbo, compress) 517 | 518 | // Initialize HTTP client with turbo mode if requested 519 | if turbo { 520 | log.Println("Enabling turbo mode for HTTP client") 521 | client.ConfigureTurboMode() 522 | } 523 | 524 | // 1. List logs for selection (Could be made non-interactive with flags/args later) 525 | allLogs, err := core.ListCTLogs() 526 | if err != nil { 527 | log.Fatalf("Error listing CT logs for selection: %v", err) 528 | } 529 | if len(allLogs) == 0 { 530 | log.Fatalf("No CT logs found to select from.") 531 | } 532 | 533 | // 2. 
Display and prompt for selection 534 | fmt.Println("Available Certificate Transparency Logs:") 535 | for i, lg := range allLogs { 536 | fmt.Printf(" [%d] %s (%s)\n", i+1, lg.Description, lg.URL) 537 | } 538 | fmt.Println(" [all] Extract from all logs") 539 | fmt.Print("Enter log number(s) to extract domains from (e.g., 1,3,5 or all): ") 540 | reader := bufio.NewReader(os.Stdin) 541 | input, _ := reader.ReadString('\n') 542 | input = strings.TrimSpace(input) 543 | var selectedLogs []certlib.CTLogInfo 544 | if strings.ToLower(input) == "all" { 545 | selectedLogs = allLogs 546 | fmt.Println("Selected all logs for domain extraction.") 547 | } else { 548 | parts := strings.Split(input, ",") 549 | selectedIndices := make(map[int]bool) 550 | for _, part := range parts { 551 | indexStr := strings.TrimSpace(part) 552 | if indexStr == "" { 553 | continue 554 | } 555 | index, err := strconv.Atoi(indexStr) 556 | if err != nil || index < 1 || index > len(allLogs) { 557 | log.Fatalf("Invalid input: %q is not a valid number in the range 1-%d", indexStr, len(allLogs)) 558 | } 559 | if !selectedIndices[index-1] { 560 | selectedLogs = append(selectedLogs, allLogs[index-1]) 561 | selectedIndices[index-1] = true 562 | } 563 | } 564 | if len(selectedLogs) == 0 { 565 | log.Fatalf("No valid logs selected.") 566 | } 567 | fmt.Printf("Selected %d log(s) for domain extraction.\n", len(selectedLogs)) 568 | } 569 | // ----------------------------------------------------- 570 | 571 | // 3. Create DomainExtractor Configuration 572 | config := &core.DomainExtractorConfig{ 573 | OutputDir: outputDir, 574 | BufferSize: bufferSize, 575 | MaxConcurrentLogs: maxConcurrentLogs, 576 | Turbo: turbo, 577 | CompressOutput: compress, 578 | } 579 | 580 | // 4. Setup Context and Signal Handling for graceful shutdown 581 | ctx, cancel := context.WithCancel(context.Background()) 582 | defer cancel() // Ensure context is cancelled on exit 583 | sigChan := make(chan os.Signal, 1) 584 | signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) 585 | 586 | // Goroutine to listen for signals and trigger shutdown 587 | go func() { 588 | sig := <-sigChan 589 | log.Printf("Received signal %v, initiating shutdown...", sig) 590 | cancel() // Cancel context first 591 | }() 592 | 593 | // 5. Create the Domain Extractor 594 | extractor, errManager := core.NewDomainExtractor(ctx, config) // Renamed err 595 | if errManager != nil { 596 | log.Fatalf("Failed to create domain extractor: %v", errManager) 597 | } 598 | 599 | // 6. Launch Stats Display Goroutine (if enabled) 600 | var statsWg sync.WaitGroup 601 | if showStats { 602 | statsWg.Add(1) 603 | go func() { 604 | defer statsWg.Done() 605 | displayDomainStats(ctx, extractor) // Swapped order 606 | }() 607 | } 608 | 609 | // 7. Start Domain Extraction Process (BLOCKING CALL) 610 | log.Printf("Starting extraction for %d selected logs...", len(selectedLogs)) 611 | if err := extractor.ExtractDomainsToCSV(selectedLogs); err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, core.ErrDownloadCancelled) { 612 | // Log error unless it was just context cancellation 613 | log.Printf("Error during domain extraction: %v", err) 614 | } 615 | 616 | // Extraction finished or was cancelled 617 | log.Println("Main extraction process finished or cancelled.") 618 | 619 | // 8. 
Ensure stats goroutine finishes (if started) 620 | if showStats { 621 | log.Println("Waiting for statistics display to finish...") 622 | // If context wasn't cancelled by signal, cancel it now to stop stats 623 | cancel() // Ensure context is cancelled 624 | statsWg.Wait() 625 | } 626 | 627 | // 9. Display Final Stats 628 | displayFinalDomainStats(extractor) 629 | log.Println("Domain extraction command complete.") 630 | } 631 | 632 | // displayDomainStats periodically shows domain extraction progress. 633 | // ctx should be the first parameter for consistency with Go conventions. 634 | func displayDomainStats(ctx context.Context, extractor *core.DomainExtractor) { 635 | ticker := time.NewTicker(time.Second * 2) // Update every 2 seconds 636 | defer ticker.Stop() 637 | startTime := extractor.GetStats().StartTime 638 | 639 | log.Println("Starting statistics display...") 640 | 641 | for { 642 | select { 643 | case <-ticker.C: 644 | stats := extractor.GetStats() 645 | elapsed := time.Since(startTime).Seconds() 646 | if elapsed < 0.1 { 647 | elapsed = 0.1 648 | } // Avoid division by zero initially 649 | 650 | processedEntries := stats.ProcessedEntries.Load() 651 | totalEntries := stats.TotalEntries.Load() 652 | failedEntries := stats.FailedEntries.Load() 653 | entriesPerSec := float64(processedEntries) / elapsed 654 | percentDone := 0.0 655 | if totalEntries > 0 { 656 | // Calculate percentage based on processed + failed vs. total 657 | percentDone = float64(processedEntries+failedEntries) / float64(totalEntries) * 100 658 | } 659 | 660 | // Use carriage return to update the line in place 661 | fmt.Printf("\rProcessed: %d/%d logs | Entries: %d / ~%d (%.1f%%) | Failed: %d | Rate: %.0f ent/s | Domains: %d | Retries: %.2f%%", 662 | stats.ProcessedLogs.Load(), 663 | stats.TotalLogs.Load(), 664 | processedEntries, 665 | totalEntries, 666 | percentDone, 667 | failedEntries, 668 | entriesPerSec, 669 | stats.TotalDomainsFound.Load(), 670 | stats.GetRetryRate()*100, // Assuming DomainExtractorStats also gets GetRetryRate 671 | ) 672 | case <-ctx.Done(): // Use the passed context 673 | fmt.Println("\nStats display stopping due to context cancellation.") 674 | return 675 | } 676 | } 677 | } 678 | 679 | // displayFinalDomainStats shows the summary statistics at the end. 
680 | func displayFinalDomainStats(extractor *core.DomainExtractor) { 681 | stats := extractor.GetStats() 682 | elapsed := time.Since(stats.StartTime) 683 | processedEntries := stats.ProcessedEntries.Load() 684 | rate := 0.0 685 | if elapsed.Seconds() > 0 { 686 | rate = float64(processedEntries) / elapsed.Seconds() 687 | } 688 | 689 | // Ensure the final stats appear on a new line after the progress indicator 690 | fmt.Println() 691 | fmt.Printf("\n--- Final Domain Extraction Statistics ---\n") 692 | fmt.Printf(" Processing Time: %v\n", elapsed.Round(time.Millisecond)) 693 | fmt.Printf(" Total Logs: %d\n", stats.TotalLogs.Load()) 694 | fmt.Printf(" Processed Logs: %d\n", stats.ProcessedLogs.Load()) 695 | fmt.Printf(" Failed Logs: %d\n", stats.FailedLogs.Load()) 696 | fmt.Printf(" Total Entries: ~%d\n", stats.TotalEntries.Load()) 697 | fmt.Printf("Processed Entries: %d (%.2f%% first try)\n", 698 | processedEntries, 699 | float64(stats.SuccessFirstTry.Load())/float64(processedEntries+1)*100) // Assuming DomainExtractorStats has SuccessFirstTry 700 | fmt.Printf(" Failed Entries: %d\n", stats.FailedEntries.Load()) 701 | fmt.Printf(" Total Domains: %d\n", stats.TotalDomainsFound.Load()) 702 | fmt.Printf(" Overall Rate: %.0f entries/sec\n", rate) 703 | fmt.Printf(" Retry Rate: %.2f%% (Total Retries: %d)\n", 704 | stats.GetRetryRate()*100, stats.RetryCount.Load()) // Assuming DomainExtractorStats has GetRetryRate and RetryCount 705 | fmt.Printf(" Output Written: %.2f MB\n", float64(stats.OutputBytesWritten.Load())/(1024*1024)) 706 | fmt.Printf("----------------------------------------\n") 707 | } 708 | 709 | // fetchAndSaveLogs fetches the CT logs list and saves it to a local file. 710 | func fetchAndSaveLogs() { 711 | log.Printf("Fetching CT logs list to %s...", logsFile) 712 | 713 | // Temporarily disable UseLocalLogs to force fetching from remote 714 | oldUseLocalLogs := certlib.UseLocalLogs 715 | certlib.UseLocalLogs = false 716 | defer func() { certlib.UseLocalLogs = oldUseLocalLogs }() // Ensure it's restored 717 | 718 | // Use the client package to fetch the logs list directly 719 | httpClient := client.GetHTTPClient() 720 | resp, err := httpClient.Get(certlib.CTLListsURL) 721 | if err != nil { 722 | log.Fatalf("Error fetching CT logs list: %v", err) 723 | } 724 | defer resp.Body.Close() 725 | 726 | if resp.StatusCode != http.StatusOK { 727 | log.Fatalf("HTTP error %d fetching log list (%s)", resp.StatusCode, certlib.CTLListsURL) 728 | } 729 | 730 | body, err := io.ReadAll(resp.Body) 731 | if err != nil { 732 | log.Fatalf("Error reading CT logs list body: %v", err) 733 | } 734 | 735 | // Save the response to the specified file 736 | if err := os.WriteFile(logsFile, body, 0644); err != nil { 737 | log.Fatalf("Error saving logs to file '%s': %v", logsFile, err) 738 | } 739 | 740 | log.Printf("Successfully saved CT logs list to %s", logsFile) 741 | 742 | // Now try to parse and count the logs from the newly saved file. 743 | // This also serves as a basic validation of the saved file content. 744 | tempOriginalLocalLogsFile := certlib.LocalLogsFile // Save original for restoration 745 | certlib.LocalLogsFile = logsFile // Temporarily point certlib to the new file 746 | certlib.UseLocalLogs = true // Force use of this local file 747 | 748 | logs, err := core.ListCTLogs() // This will now use the new file. 
749 | if err != nil { 750 | log.Printf("Warning: Saved logs file to '%s' but encountered an error parsing it: %v", logsFile, err) 751 | } else { 752 | log.Printf("Successfully parsed %d CT logs from the saved file '%s'.", len(logs), logsFile) 753 | } 754 | 755 | // Restore the original certlib settings. 756 | certlib.LocalLogsFile = tempOriginalLocalLogsFile 757 | certlib.UseLocalLogs = oldUseLocalLogs // Restore original UseLocalLogs setting 758 | } 759 | 760 | // Helper to find min of two integers (for batching end index calculation) 761 | func min(a, b int) int { 762 | if a < b { 763 | return a 764 | } 765 | return b 766 | } 767 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/x-stp/rxtls 2 | 3 | go 1.24.2 4 | 5 | require ( 6 | github.com/prometheus/client_golang v1.22.0 7 | github.com/spf13/cobra v1.9.1 8 | github.com/zeebo/xxh3 v1.0.2 9 | golang.org/x/sys v0.33.0 10 | golang.org/x/time v0.11.0 11 | ) 12 | 13 | require ( 14 | github.com/beorn7/perks v1.0.1 // indirect 15 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 16 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 17 | github.com/klauspost/cpuid/v2 v2.0.9 // indirect 18 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 19 | github.com/prometheus/client_model v0.6.1 // indirect 20 | github.com/prometheus/common v0.62.0 // indirect 21 | github.com/prometheus/procfs v0.15.1 // indirect 22 | github.com/spf13/pflag v1.0.6 // indirect 23 | google.golang.org/protobuf v1.36.5 // indirect 24 | ) 25 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 2 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 3 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 4 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 5 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= 6 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 7 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 9 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 10 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 11 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 12 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 13 | github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 14 | github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= 15 | github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= 16 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 17 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 18 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 19 | github.com/munnerz/goautoneg 
v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 20 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 21 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 22 | github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= 23 | github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= 24 | github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= 25 | github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= 26 | github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= 27 | github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= 28 | github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= 29 | github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= 30 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 31 | github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= 32 | github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= 33 | github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= 34 | github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 35 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 36 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 37 | github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= 38 | github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= 39 | github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= 40 | github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= 41 | golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= 42 | golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 43 | golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= 44 | golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= 45 | google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= 46 | google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= 47 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 48 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 49 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 50 | -------------------------------------------------------------------------------- /internal/certlib/api.go: -------------------------------------------------------------------------------- 1 | package certlib 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "bytes" 23 | "context" 24 | "crypto/x509" 25 | "encoding/base64" 26 | "encoding/binary" 27 | "encoding/json" 28 | "errors" 29 | "fmt" 30 | "io" 31 | "log" 32 | "net/http" 33 | "os" 34 | "strings" 35 | "time" 36 | 37 | "github.com/x-stp/rxtls/internal/client" // Import shared client package 38 | ) 39 | 40 | // CTLResponse represents the structure of the JSON log list. 41 | // Used for unmarshalling JSON (allocates). 42 | type CTLResponse struct { 43 | Operators []struct { 44 | ID int `json:"id"` 45 | Name string `json:"name"` 46 | } `json:"operators"` 47 | Logs []struct { 48 | Description string `json:"description"` 49 | Key string `json:"key"` 50 | URL string `json:"url"` 51 | MMD int `json:"mmd"` 52 | State struct { 53 | Timestamp string `json:"timestamp"` 54 | } `json:"state"` 55 | OperatedBy []int `json:"operated_by"` 56 | DNSAPIEndpoint string `json:"dns_api_endpoint,omitempty"` 57 | } `json:"logs"` 58 | } 59 | 60 | // TreeSizeResponse represents the JSON structure from the get-sth endpoint. 61 | // Used for unmarshalling JSON (allocates). 62 | type TreeSizeResponse struct { 63 | TreeSize int `json:"tree_size"` 64 | Timestamp int64 `json:"timestamp"` 65 | SHA256RootHash string `json:"sha256_root_hash"` 66 | TreeHeadSignature string `json:"tree_head_signature"` 67 | } 68 | 69 | // EntriesResponse represents the JSON structure from the get-entries endpoint. 70 | // Used for unmarshalling JSON (allocates). 71 | type EntriesResponse struct { 72 | Entries []struct { 73 | LeafInput string `json:"leaf_input"` // Base64 encoded MerkleTreeLeaf 74 | ExtraData string `json:"extra_data"` // Base64 encoded cert chain 75 | } `json:"entries"` 76 | } 77 | 78 | // GetCTLogs retrieves the list of known CT logs, either from a remote URL or a local file. 79 | // Operation: Network or Disk I/O bound. Allocates during HTTP fetch and JSON parsing. 80 | func GetCTLogs() ([]CTLogInfo, error) { 81 | if UseLocalLogs { 82 | log.Printf("Using local logs list from %s\n", LocalLogsFile) 83 | ctlogs, err := loadLocalCTLogs(LocalLogsFile) 84 | // If local file load fails, DO NOT fall back to network. 
85 | if err != nil { 86 | return nil, fmt.Errorf("failed to load local logs file '%s': %w", LocalLogsFile, err) 87 | } 88 | return ctlogs, nil 89 | } 90 | 91 | // Network fetch using shared client 92 | log.Println("Fetching CT log list from", CTLListsURL) 93 | httpClient := client.GetHTTPClient() 94 | 95 | resp, err := httpClient.Get(CTLListsURL) 96 | if err != nil { 97 | return nil, fmt.Errorf("error retrieving CT logs list: %w", err) 98 | } 99 | defer resp.Body.Close() 100 | if resp.StatusCode != http.StatusOK { 101 | return nil, fmt.Errorf("HTTP error %d fetching log list", resp.StatusCode) 102 | } 103 | body, err := io.ReadAll(resp.Body) 104 | if err != nil { 105 | return nil, fmt.Errorf("error reading CT log list body: %w", err) 106 | } 107 | 108 | // Try to parse as V3 format first (same as in loadLocalCTLogs) 109 | var v3Response struct { 110 | Operators []struct { 111 | Name string `json:"name"` 112 | Logs []struct { 113 | Description string `json:"description"` 114 | URL string `json:"url"` 115 | State map[string]interface{} `json:"state"` 116 | } `json:"logs"` 117 | } `json:"operators"` 118 | } 119 | 120 | if err := json.Unmarshal(body, &v3Response); err == nil { 121 | // Process V3 format 122 | var ctlogs []CTLogInfo 123 | for _, operator := range v3Response.Operators { 124 | for _, logEntry := range operator.Logs { 125 | if logEntry.URL == "" { 126 | continue 127 | } 128 | url := cleanLogURL(logEntry.URL) 129 | if isLogUsable(logEntry.State) { 130 | ctlogs = append(ctlogs, CTLogInfo{ 131 | URL: url, 132 | Description: logEntry.Description, 133 | OperatedBy: operator.Name, 134 | BlockSize: 64, // Default 135 | }) 136 | } 137 | } 138 | } 139 | log.Printf("Found %d usable CT logs from remote (V3 format)", len(ctlogs)) 140 | return ctlogs, nil 141 | } 142 | 143 | // Fallback to V2/older format 144 | log.Printf("Failed to parse remote logs as V3, trying older format") 145 | var ctlResponse CTLResponse 146 | if errFallback := json.Unmarshal(body, &ctlResponse); errFallback != nil { 147 | // Save the response to a file for debugging 148 | debugFile := "debug_ct_logs_response.json" 149 | if err := os.WriteFile(debugFile, body, 0644); err == nil { 150 | log.Printf("Saved problematic response to %s for debugging", debugFile) 151 | } 152 | return nil, fmt.Errorf("error parsing CT logs list JSON with known formats: %w", errFallback) 153 | } 154 | 155 | // Process response using old format 156 | logs, err := processOldFormat(&ctlResponse) 157 | if err != nil { 158 | return nil, fmt.Errorf("error processing old format logs: %w", err) 159 | } 160 | 161 | // If we got logs successfully, save them to the local file for future use 162 | if len(logs) > 0 { 163 | if err := os.WriteFile(LocalLogsFile, body, 0644); err != nil { 164 | log.Printf("Warning: Failed to save logs to local file: %v", err) 165 | } else { 166 | log.Printf("Saved logs to %s for future use", LocalLogsFile) 167 | } 168 | } 169 | 170 | return logs, nil 171 | } 172 | 173 | // loadLocalCTLogs reads and parses the log list from a local JSON file. 174 | // Operation: Disk I/O bound, allocates for file read and JSON parsing. 
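// The file is expected to follow the same v3 log_list.json shape that GetCTLogs
// parses above. A minimal sketch of that shape (only the fields read here; real
// lists carry many more fields, and the state key shown is illustrative):
//
//	{
//	  "operators": [
//	    {
//	      "name": "Example Operator",
//	      "logs": [
//	        {"description": "Example Log 2025", "url": "https://ct.example.invalid/2025/", "state": {"usable": {}}}
//	      ]
//	    }
//	  ]
//	}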
175 | func loadLocalCTLogs(filename string) ([]CTLogInfo, error) { 176 | data, err := os.ReadFile(filename) 177 | if err != nil { 178 | return nil, fmt.Errorf("error reading local logs file: %w", err) 179 | } 180 | // Attempt V3 format parse first 181 | var v3Response struct { 182 | Operators []struct { 183 | Name string `json:"name"` 184 | Logs []struct { 185 | Description string `json:"description"` 186 | URL string `json:"url"` 187 | State map[string]interface{} `json:"state"` 188 | } `json:"logs"` 189 | } `json:"operators"` 190 | } 191 | if err := json.Unmarshal(data, &v3Response); err == nil { 192 | // Process V3 format 193 | var ctlogs []CTLogInfo 194 | for _, operator := range v3Response.Operators { 195 | for _, logEntry := range operator.Logs { 196 | if logEntry.URL == "" { 197 | continue 198 | } 199 | url := cleanLogURL(logEntry.URL) 200 | if isLogUsable(logEntry.State) { 201 | ctlogs = append(ctlogs, CTLogInfo{ 202 | URL: url, 203 | Description: logEntry.Description, 204 | OperatedBy: operator.Name, 205 | BlockSize: 64, // Default 206 | }) 207 | } 208 | } 209 | } 210 | log.Printf("Found %d usable CT logs in local file (V3 format)", len(ctlogs)) 211 | return ctlogs, nil 212 | } 213 | // Fallback to V2/older format 214 | log.Printf("Failed to parse local logs as V3, trying older format: %v", err) 215 | var ctlResponse CTLResponse 216 | if errFallback := json.Unmarshal(data, &ctlResponse); errFallback != nil { 217 | return nil, fmt.Errorf("error parsing local logs file with known formats: %w (primary V3 err) / %w (fallback V2 err)", err, errFallback) 218 | } 219 | return processOldFormat(&ctlResponse) 220 | } 221 | 222 | // cleanLogURL helper 223 | func cleanLogURL(rawURL string) string { 224 | url := rawURL 225 | if strings.HasPrefix(url, "https://") { 226 | url = url[8:] 227 | } else if strings.HasPrefix(url, "http://") { 228 | url = url[7:] 229 | } 230 | 231 | return strings.TrimSuffix(url, "/") 232 | } 233 | 234 | // isLogUsable helper 235 | func isLogUsable(state map[string]interface{}) bool { 236 | if _, ok := state["rejected"]; ok { 237 | return false 238 | } 239 | if _, ok := state["retired"]; ok { 240 | return false 241 | } 242 | logType, _ := state["log_type"].(string) 243 | return logType != "test" 244 | } 245 | 246 | // processOldFormat handles the fallback parsing scenario. 247 | // Operation: Similar allocation patterns to the main processing loop (slice append, string ops). 248 | func processOldFormat(ctlResponse *CTLResponse) ([]CTLogInfo, error) { 249 | operatorNames := make(map[int]string) 250 | for _, operator := range ctlResponse.Operators { 251 | operatorNames[operator.ID] = operator.Name 252 | } 253 | var ctlogs []CTLogInfo 254 | for _, logEntry := range ctlResponse.Logs { 255 | if logEntry.URL == "" { 256 | continue 257 | } 258 | url := cleanLogURL(logEntry.URL) 259 | operatedBy := "" 260 | if len(logEntry.OperatedBy) > 0 { 261 | operatedBy = operatorNames[logEntry.OperatedBy[0]] 262 | } 263 | ctlog := CTLogInfo{ 264 | URL: url, 265 | Description: logEntry.Description, 266 | OperatedBy: operatedBy, 267 | BlockSize: 64, 268 | } 269 | if ctlog.IsResolvable() { // Simple parse check 270 | ctlogs = append(ctlogs, ctlog) 271 | } 272 | } 273 | log.Printf("Found %d usable CT logs in local file (Fallback format)", len(ctlogs)) 274 | 275 | if len(ctlogs) == 0 { 276 | return nil, fmt.Errorf("no usable CT logs found in fallback format") 277 | } 278 | 279 | return ctlogs, nil 280 | } 281 | 282 | // GetLogInfo retrieves the tree size from a CT log. 
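// A minimal usage sketch (the host below is a placeholder, not a real log):
//
//	ctlog := &CTLogInfo{URL: "ct.example.invalid/2025"}
//	if err := GetLogInfo(ctlog); err == nil {
//		fmt.Printf("tree size: %d\n", ctlog.TreeSize)
//	}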
283 | // Operation: Network bound. Allocates during HTTP fetch and JSON parsing. 284 | func GetLogInfo(ctlog *CTLogInfo) error { 285 | // Use shared HTTP client 286 | httpClient := client.GetHTTPClient() 287 | 288 | // Construct URL 289 | url := fmt.Sprintf("https://%s/ct/v1/get-sth", ctlog.URL) 290 | 291 | // Make the request with retry logic 292 | var resp *http.Response 293 | var err error 294 | maxRetries := 3 295 | retryDelay := 100 * time.Millisecond 296 | 297 | for attempt := range maxRetries { 298 | resp, err = httpClient.Get(url) 299 | if err == nil && resp.StatusCode == http.StatusOK { 300 | break 301 | } 302 | 303 | if resp != nil { 304 | resp.Body.Close() 305 | } 306 | 307 | if attempt < maxRetries-1 { 308 | log.Printf("Retrying GetLogInfo for %s after error: %v (attempt %d/%d)", 309 | ctlog.URL, err, attempt+1, maxRetries) 310 | time.Sleep(retryDelay) 311 | retryDelay *= 2 // Exponential backoff 312 | } 313 | } 314 | 315 | if err != nil { 316 | return fmt.Errorf("error retrieving log info after %d attempts: %w", maxRetries, err) 317 | } 318 | if resp.StatusCode != http.StatusOK { 319 | resp.Body.Close() 320 | return fmt.Errorf("HTTP error %d fetching log info for %s", resp.StatusCode, ctlog.URL) 321 | } 322 | defer resp.Body.Close() 323 | 324 | body, err := io.ReadAll(resp.Body) 325 | if err != nil { 326 | return fmt.Errorf("error reading log info body: %w", err) 327 | } 328 | 329 | var treeSize TreeSizeResponse 330 | if err := json.Unmarshal(body, &treeSize); err != nil { 331 | return fmt.Errorf("error parsing log info JSON: %w", err) 332 | } 333 | 334 | ctlog.TreeSize = treeSize.TreeSize 335 | return nil 336 | } 337 | 338 | // DownloadEntries retrieves a range of entries from a CT log. 339 | // Operation: Network bound. Allocates during HTTP fetch and JSON parsing. 
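// A minimal usage sketch (placeholder host; start and end are inclusive,
// zero-based indices, as in the RFC 6962 get-entries endpoint):
//
//	ctx := context.Background()
//	ctlog := &CTLogInfo{URL: "ct.example.invalid/2025"}
//	entries, err := DownloadEntries(ctx, ctlog, 0, 63)
//	if err == nil {
//		fmt.Printf("fetched %d entries\n", len(entries.Entries))
//	}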
340 | func DownloadEntries(ctx context.Context, ctlog *CTLogInfo, start, end int) (*EntriesResponse, error) { 341 | // Use shared HTTP client 342 | httpClient := client.GetHTTPClient() 343 | 344 | // Construct URL 345 | url := fmt.Sprintf("https://%s/ct/v1/get-entries?start=%d&end=%d", ctlog.URL, start, end) 346 | 347 | // Create request with context 348 | req, err := http.NewRequestWithContext(ctx, "GET", url, nil) 349 | if err != nil { 350 | return nil, fmt.Errorf("error creating request: %w", err) 351 | } 352 | req.Header.Set("User-Agent", "rxtls (+https://github.com/x-stp/rxtls)") 353 | 354 | // Make the request with retry logic 355 | var resp *http.Response 356 | maxRetries := 3 357 | retryDelay := 500 * time.Millisecond 358 | 359 | for attempt := range maxRetries { 360 | resp, err = httpClient.Do(req) 361 | if err == nil && resp.StatusCode == http.StatusOK { 362 | break 363 | } 364 | 365 | if resp != nil { 366 | resp.Body.Close() 367 | } 368 | 369 | // Check if context is cancelled before retrying 370 | if ctx.Err() != nil { 371 | return nil, ctx.Err() 372 | } 373 | 374 | if attempt < maxRetries-1 { 375 | log.Printf("Retrying DownloadEntries for %s (%d-%d) after error: %v (attempt %d/%d)", 376 | ctlog.URL, start, end, err, attempt+1, maxRetries) 377 | 378 | // Use context-aware sleep 379 | select { 380 | case <-time.After(retryDelay): 381 | retryDelay *= 2 // Exponential backoff 382 | case <-ctx.Done(): 383 | return nil, ctx.Err() 384 | } 385 | } 386 | } 387 | 388 | if err != nil { 389 | return nil, fmt.Errorf("error downloading entries after %d attempts: %w", maxRetries, err) 390 | } 391 | if resp.StatusCode != http.StatusOK { 392 | resp.Body.Close() 393 | return nil, fmt.Errorf("HTTP error %d fetching entries for %s (%d-%d)", resp.StatusCode, ctlog.URL, start, end) 394 | } 395 | defer resp.Body.Close() 396 | 397 | body, err := io.ReadAll(resp.Body) 398 | if err != nil { 399 | return nil, fmt.Errorf("error reading entries body: %w", err) 400 | } 401 | 402 | var entries EntriesResponse 403 | if err := json.Unmarshal(body, &entries); err != nil { 404 | return nil, fmt.Errorf("error parsing entries JSON: %w", err) 405 | } 406 | 407 | return &entries, nil 408 | } 409 | 410 | // ParseCertificateEntry decodes the MerkleTreeLeaf framing and parses the inner certificate data. 411 | // Handles Version 0, LeafType 0 (TimestampedEntry) containing X.509 or Precert. 
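// Layout consumed below, after base64-decoding leaf_input:
//
//	[0]     version     (must be 0)
//	[1]     leaf_type   (must be 0, timestamped_entry)
//	[2:10]  timestamp   (uint64, big-endian)
//	[10:12] entry_type  (0 = x509_entry, 1 = precert_entry)
//	        x509_entry:    3-byte length + DER certificate
//	        precert_entry: 32-byte issuer key hash + 3-byte length + TBS certificate
//	        then a 2-byte extensions length + extensions (consumed, not parsed)
//
// A sketch of combining it with DownloadEntries (error handling elided):
//
//	for i, e := range entries.Entries {
//		if cd, err := ParseCertificateEntry(e.LeafInput, e.ExtraData, ctlog.URL); err == nil {
//			fmt.Print(cd.ToCSVLine(i))
//		}
//	}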
412 | func ParseCertificateEntry(leafInput, extraData, logURL string) (*CertificateData, error) { 413 | leafBytes, err := base64.StdEncoding.DecodeString(leafInput) 414 | if err != nil { 415 | return nil, fmt.Errorf("failed to decode leaf input base64: %w", err) 416 | } 417 | 418 | // --- Check CT Framing Prefix --- 419 | if len(leafBytes) < 2 { 420 | return nil, fmt.Errorf("leaf input too short for CT framing (len %d)", len(leafBytes)) 421 | } 422 | version := uint8(leafBytes[0]) 423 | leafType := uint8(leafBytes[1]) 424 | if version != 0 { 425 | return nil, fmt.Errorf("unsupported MerkleTreeLeaf version: %d", version) 426 | } 427 | if leafType != 0 { 428 | return nil, fmt.Errorf("unsupported MerkleLeafType: %d", leafType) 429 | } 430 | // -------------------------------- 431 | 432 | // --- Manually Parse TimestampedEntry --- 433 | r := bytes.NewReader(leafBytes[2:]) // Reader for the payload after framing 434 | 435 | var timestamp uint64 436 | if err := binary.Read(r, binary.BigEndian, ×tamp); err != nil { 437 | return nil, fmt.Errorf("failed to read timestamp: %w", err) 438 | } 439 | 440 | var entryTypeUint16 uint16 441 | if err := binary.Read(r, binary.BigEndian, &entryTypeUint16); err != nil { 442 | return nil, fmt.Errorf("failed to read entry type: %w", err) 443 | } 444 | entryTypeString := "Unknown" 445 | 446 | var certDER []byte 447 | 448 | switch entryTypeUint16 { 449 | case 0: // x509_entry 450 | entryTypeString = "X509LogEntry" 451 | // Read the 3-byte length field for the certificate 452 | var certLenBytes [3]byte 453 | if _, err := io.ReadFull(r, certLenBytes[:]); err != nil { 454 | return nil, fmt.Errorf("failed to read x509 entry length: %w", err) 455 | } 456 | certLen := uint32(certLenBytes[0])<<16 | uint32(certLenBytes[1])<<8 | uint32(certLenBytes[2]) 457 | 458 | // Check for unreasonable length 459 | if certLen > uint32(r.Len()) { 460 | return nil, fmt.Errorf("x509 entry length (%d) exceeds remaining data (%d)", certLen, r.Len()) 461 | } 462 | 463 | // Read the certificate bytes 464 | certDER = make([]byte, certLen) 465 | if _, err := io.ReadFull(r, certDER); err != nil { 466 | return nil, fmt.Errorf("failed to read x509 entry data: %w", err) 467 | } 468 | 469 | case 1: // precert_entry 470 | entryTypeString = "PrecertLogEntry" 471 | // Read Issuer Key Hash (32 bytes) - we don't use it currently, but need to consume it. 472 | var issuerKeyHash [32]byte 473 | if _, err := io.ReadFull(r, issuerKeyHash[:]); err != nil { 474 | return nil, fmt.Errorf("failed to read precert issuer key hash: %w", err) 475 | } 476 | 477 | // Read the 3-byte length field for the TBS certificate 478 | var tbsCertLenBytes [3]byte 479 | if _, err := io.ReadFull(r, tbsCertLenBytes[:]); err != nil { 480 | return nil, fmt.Errorf("failed to read precert TBS length: %w", err) 481 | } 482 | tbsCertLen := uint32(tbsCertLenBytes[0])<<16 | uint32(tbsCertLenBytes[1])<<8 | uint32(tbsCertLenBytes[2]) 483 | 484 | // Check length 485 | if tbsCertLen > uint32(r.Len()) { 486 | return nil, fmt.Errorf("precert TBS length (%d) exceeds remaining data (%d)", tbsCertLen, r.Len()) 487 | } 488 | 489 | // Read the TBS certificate bytes 490 | certDER = make([]byte, tbsCertLen) 491 | if _, err := io.ReadFull(r, certDER); err != nil { 492 | return nil, fmt.Errorf("failed to read precert TBS data: %w", err) 493 | } 494 | 495 | default: 496 | return nil, fmt.Errorf("unknown TimestampedEntry.EntryType: %d", entryTypeUint16) 497 | } 498 | 499 | // Extensions follow the signed_entry; read their length (2 bytes) and consume them. 
500 | // We don't parse extensions in this version, but must read past them. 501 | var extensionsLen uint16 502 | if err := binary.Read(r, binary.BigEndian, &extensionsLen); err != nil { 503 | // Allow EOF here if extensions are truly absent, although spec implies length should be present. 504 | if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) { 505 | log.Printf("Warning: Failed to read extensions length for %s (%d-%d): %v. Remaining bytes: %d", logURL, 0, 0, err, r.Len()) // Need index context here if possible 506 | } 507 | } else if extensionsLen > 0 { 508 | if extensionsLen > uint16(r.Len()) { 509 | return nil, fmt.Errorf("extensions length (%d) exceeds remaining data (%d)", extensionsLen, r.Len()) 510 | } 511 | // Consume extension bytes 512 | extensionBytes := make([]byte, extensionsLen) 513 | if _, err := io.ReadFull(r, extensionBytes); err != nil { 514 | return nil, fmt.Errorf("failed to read extensions data: %w", err) 515 | } 516 | } 517 | // -------------------------------------- 518 | 519 | if len(certDER) == 0 { 520 | return nil, fmt.Errorf("no certificate DER data extracted for entry type %d", entryTypeUint16) 521 | } 522 | 523 | // --- Parse the final DER bytes --- 524 | cert, err := x509.ParseCertificate(certDER) 525 | if err != nil { 526 | if entryTypeString == "PrecertLogEntry" { 527 | // Known failure mode for TBS certs 528 | return nil, fmt.Errorf("skipped parsing Precert TBS: %w", err) 529 | } 530 | return nil, fmt.Errorf("failed to parse certificate DER (type %s): %w", entryTypeString, err) 531 | } 532 | 533 | // Convert to our internal struct 534 | cd := CertificateFromX509(cert, logURL) 535 | cd.Type = entryTypeString // Set the correct type 536 | return cd, nil 537 | } -------------------------------------------------------------------------------- /internal/certlib/domain_normalization_test.go: -------------------------------------------------------------------------------- 1 | package certlib 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "strings" 23 | "testing" 24 | ) 25 | 26 | // TestNormalizeDomain provides table-driven tests for various domain formats and edge cases. 27 | // Goal: Ensure NormalizeDomain behaves correctly for diverse inputs. 28 | // Uses t.Parallel() to allow tests within this function to run concurrently. 29 | func TestNormalizeDomain(t *testing.T) { 30 | t.Parallel() // Mark this test function as safe to run in parallel with others. 
31 | testCases := []struct { 32 | name string 33 | input string 34 | expected string 35 | }{ 36 | {"Simple domain", "example.com", "example.com"}, 37 | {"Subdomain", "www.example.com", "www.example.com"}, 38 | {"Uppercase", "EXAMPLE.COM", "example.com"}, 39 | {"Mixed case", "Www.Example.Com", "www.example.com"}, 40 | {"Trailing dot", "example.com.", "example.com"}, 41 | {"Multiple trailing dots", "example.com...", "example.com"}, 42 | {"Leading dot", ".example.com", "example.com"}, 43 | {"Leading/Trailing dots", ".example.com.", "example.com"}, 44 | {"Leading/Trailing spaces", " example.com ", "example.com"}, 45 | {"Wildcard", "*.example.com", "*.example.com"}, 46 | {"Wildcard uppercase", "*.EXAMPLE.COM", "*.example.com"}, 47 | {"Wildcard trailing dot", "*.example.com.", "*.example.com"}, 48 | {"Multiple wildcards", "*.*.example.com", "*.*.example.com"}, // Assuming this is valid/desired 49 | {"Punycode", "xn--bcher-kva.example.com", "xn--bcher-kva.example.com"}, // bücher.example.com 50 | {"Punycode uppercase", "XN--BCHER-KVA.EXAMPLE.COM", "xn--bcher-kva.example.com"}, 51 | {"Empty string", "", ""}, 52 | {"Just spaces", " ", ""}, 53 | {"Just dots", "...", ""}, 54 | {"IP Address v4", "192.168.1.1", "192.168.1.1"}, // Should probably remain unchanged or be identified 55 | {"IP Address v6", "::1", "::1"}, // Should probably remain unchanged or be identified 56 | {"Domain with port", "example.com:443", "example.com:443"}, // Should likely remain unchanged 57 | {"Internal spaces", "example test.com", "example test.com"}, // Junk, expect no change or specific handling 58 | {"Leading dash", "-example.com", "-example.com"}, // Technically invalid label, expect no change 59 | {"Trailing dash", "example-.com", "example-.com"}, // Technically invalid label, expect no change 60 | {"Very long domain", strings.Repeat("a.", 100) + "com", strings.Repeat("a.", 100) + "com"}, // Keep as is 61 | } 62 | 63 | for _, tc := range testCases { 64 | // Capture range variable for parallel execution. 65 | tc := tc 66 | // Run each test case as a parallel subtest. 67 | t.Run(tc.name, func(t *testing.T) { 68 | t.Parallel() 69 | actual := NormalizeDomain(tc.input) 70 | if actual != tc.expected { 71 | t.Errorf("NormalizeDomain(%q) = %q; want %q", tc.input, actual, tc.expected) 72 | } 73 | }) 74 | } 75 | } 76 | 77 | // BenchmarkNormalizeDomainSimple measures performance for a common, simple domain. 78 | // Goal: Establish baseline performance. 79 | // Operation: Runs NormalizeDomain repeatedly in a loop. 80 | func BenchmarkNormalizeDomainSimple(b *testing.B) { 81 | domain := "www.example.com" 82 | // b.N is adjusted by the testing framework to achieve stable measurements. 83 | for i := 0; i < b.N; i++ { 84 | _ = NormalizeDomain(domain) // Assign to blank identifier to prevent optimization removal. 85 | } 86 | } 87 | 88 | // BenchmarkNormalizeDomainMixedCaseTrailingDot measures performance for domains needing case and dot normalization. 89 | func BenchmarkNormalizeDomainMixedCaseTrailingDot(b *testing.B) { 90 | domain := "Www.Example.COM." 91 | for i := 0; i < b.N; i++ { 92 | _ = NormalizeDomain(domain) 93 | } 94 | } 95 | 96 | // BenchmarkNormalizeDomainWildcard measures performance for wildcard domains needing normalization. 97 | func BenchmarkNormalizeDomainWildcard(b *testing.B) { 98 | domain := "*.SubDomain.Example.COM." 
99 | for i := 0; i < b.N; i++ { 100 | _ = NormalizeDomain(domain) 101 | } 102 | } 103 | 104 | // BenchmarkSortedNormalizedDomains (Placeholder) 105 | // Goal: Measure performance of getting unique, sorted, normalized domains from a CertificateData struct. 106 | // Constraints: Would depend heavily on the number of domains in AllDomains and the sorting algorithm. 107 | // TODO: Implement this benchmark once the corresponding function (e.g., CertificateData.SortedNormalizedDomains) is optimized (uses sort.Strings). 108 | /* 109 | func BenchmarkSortedNormalizedDomains(b *testing.B) { 110 | // Setup: Create a CertificateData with a large, diverse list of domains. 111 | size := 1000 // Example size 112 | allDomains := make([]string, size) 113 | for i := 0; i < size; i++ { 114 | // Generate realistic domain variations (mixed case, dots, wildcards, duplicates) 115 | allDomains[i] = fmt.Sprintf("sub%d.EXAMPLE%d.com.", i%10, i%50) 116 | } 117 | certData := &certlib.CertificateData{ 118 | AllDomains: allDomains, 119 | Subject: certlib.SubjectData{O: "Test Org"}, // Needed for DomainOrgHash if testing that 120 | } 121 | 122 | b.ResetTimer() // Start timing after setup 123 | for i := 0; i < b.N; i++ { 124 | _ = certData.SortedNormalizedDomains() // Call the function under test 125 | } 126 | } 127 | */ 128 | -------------------------------------------------------------------------------- /internal/certlib/models.go: -------------------------------------------------------------------------------- 1 | package certlib 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "crypto/tls" 23 | "crypto/x509" 24 | "encoding/base64" 25 | "fmt" 26 | "net/url" 27 | "sort" 28 | "strings" 29 | 30 | "github.com/zeebo/xxh3" 31 | ) 32 | 33 | // Constants related to CT log interaction. 34 | const ( 35 | CTLListsURL = "https://www.gstatic.com/ct/log_list/v3/log_list.json" 36 | CTLInfoURLTemplate = "https://%s/ct/v1/get-sth" 37 | DownloadURLTemplate = "https://%s/ct/v1/get-entries?start=%d&end=%d" 38 | HTTPTimeout = 30 // seconds 39 | ) 40 | 41 | // Global settings influencing certlib behavior. 42 | var ( 43 | UseLocalLogs = false 44 | LocalLogsFile = "./all_logs_list.json" 45 | ) 46 | 47 | // CTLogInfo holds metadata about a single Certificate Transparency log. 48 | type CTLogInfo struct { 49 | URL string `json:"url"` 50 | Description string `json:"description"` 51 | OperatedBy string `json:"operated_by"` 52 | TreeSize int `json:"tree_size"` 53 | BlockSize int `json:"block_size"` 54 | } 55 | 56 | // IsCloudflare checks if the log URL suggests it's operated by Cloudflare. 57 | func (c *CTLogInfo) IsCloudflare() bool { 58 | return strings.Contains(c.URL, "cloudflare.com") 59 | } 60 | 61 | // IsDigiCert checks if the log URL suggests it's operated by DigiCert. 
62 | func (c *CTLogInfo) IsDigiCert() bool { 63 | return strings.Contains(c.URL, "digicert.com") || 64 | strings.Contains(c.URL, "wyvern") || 65 | strings.Contains(c.URL, "nessie") 66 | } 67 | 68 | // Host extracts the hostname part from the log URL. 69 | func (c *CTLogInfo) Host() string { 70 | parts := strings.Split(c.URL, "/") 71 | return parts[0] 72 | } 73 | 74 | // IsResolvable checks if the log's hostname can be parsed. 75 | func (c *CTLogInfo) IsResolvable() bool { 76 | _, err := url.Parse("https://" + c.URL) 77 | return err == nil 78 | } 79 | 80 | // GetTLSConfig provides a TLS configuration optimized for performance. 81 | func (c *CTLogInfo) GetTLSConfig() *tls.Config { 82 | return &tls.Config{ 83 | MinVersion: tls.VersionTLS12, 84 | MaxVersion: tls.VersionTLS13, 85 | CipherSuites: []uint16{ 86 | tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, 87 | tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, 88 | tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, 89 | tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384, 90 | tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, 91 | tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305, 92 | }, 93 | NextProtos: []string{"http/1.1"}, 94 | } 95 | } 96 | 97 | // SubjectData holds components of an X.509 Subject or Issuer Name. 98 | type SubjectData struct { 99 | Aggregated string `json:"aggregated"` 100 | C string `json:"C,omitempty"` 101 | ST string `json:"ST,omitempty"` 102 | L string `json:"L,omitempty"` 103 | O string `json:"O,omitempty"` 104 | OU string `json:"OU,omitempty"` 105 | CN string `json:"CN,omitempty"` 106 | } 107 | 108 | // Extensions simplified storage. 109 | type Extensions struct { 110 | SubjectAltName string `json:"subjectAltName,omitempty"` 111 | } 112 | 113 | // CertificateData represents the parsed data from a single certificate entry. 114 | type CertificateData struct { 115 | Subject SubjectData 116 | Issuer SubjectData 117 | Extensions map[string]string // Simplified 118 | NotBefore int64 119 | NotAfter int64 120 | AsDER string // Base64 DER 121 | AllDomains []string 122 | Type string 123 | Source map[string]string 124 | } 125 | 126 | // Chain calculates a NON-CRYPTOGRAPHIC hash (xxh3) of the base64 DER string. 127 | func (c *CertificateData) Chain() string { 128 | h := xxh3.HashString(c.AsDER) 129 | return fmt.Sprintf("%x", h) 130 | } 131 | 132 | // NormalizedDomainsSet returns a set (map[string]struct{}) of normalized domains. 133 | func (c *CertificateData) NormalizedDomainsSet() map[string]struct{} { 134 | result := make(map[string]struct{}, len(c.AllDomains)) 135 | for _, domain := range c.AllDomains { 136 | normalized := NormalizeDomain(domain) 137 | if normalized != "" { 138 | result[normalized] = struct{}{} 139 | } 140 | } 141 | return result 142 | } 143 | 144 | // SortedNormalizedDomains returns sorted, unique, normalized domains. 145 | func (c *CertificateData) SortedNormalizedDomains() []string { 146 | domainSet := c.NormalizedDomainsSet() 147 | domains := make([]string, 0, len(domainSet)) 148 | for domain := range domainSet { 149 | domains = append(domains, domain) 150 | } 151 | sort.Strings(domains) 152 | return domains 153 | } 154 | 155 | // calculateDomainOrgHash uses xxh3 hash. 
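// The hash preimage is "<domain1>,<domain2>,...|<org>", built from the already
// sorted, unique, normalized domains. For example:
//
//	calculateDomainOrgHash([]string{"example.com", "www.example.com"}, "Example Org")
//	// hashes the string "example.com,www.example.com|Example Org" with xxh3 and hex-encodes it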
156 | func calculateDomainOrgHash(sortedUniqueNormalizedDomains []string, org string) string { 157 | estimatedLen := len(org) + 1 158 | for _, d := range sortedUniqueNormalizedDomains { 159 | estimatedLen += len(d) + 1 160 | } 161 | var sb strings.Builder 162 | sb.Grow(estimatedLen) 163 | for i, domain := range sortedUniqueNormalizedDomains { 164 | if i > 0 { 165 | sb.WriteByte(',') 166 | } 167 | sb.WriteString(domain) 168 | } 169 | sb.WriteByte('|') 170 | sb.WriteString(org) 171 | h := xxh3.HashString(sb.String()) 172 | return fmt.Sprintf("%x", h) 173 | } 174 | 175 | // DomainOrgHash calculates the xxh3 hash based on sorted, unique, normalized domains and Org. 176 | func (c *CertificateData) DomainOrgHash() string { 177 | return calculateDomainOrgHash(c.SortedNormalizedDomains(), c.Subject.O) 178 | } 179 | 180 | // ToCSVLine creates a simple CSV for raw certificate download output. 181 | func (c *CertificateData) ToCSVLine(certIndex int) string { 182 | return fmt.Sprintf("%s,%d,%s,%s,%s,%d,%d\n", 183 | c.Source["url"], 184 | certIndex, 185 | c.Chain(), 186 | c.AsDER, 187 | strings.Join(c.AllDomains, " "), 188 | c.NotBefore, 189 | c.NotAfter, 190 | ) 191 | } 192 | 193 | // ToDomainsCSVLine creates the specific CSV format for the 'domains' command. 194 | func (c *CertificateData) ToDomainsCSVLine(certIndex int) string { 195 | normalizedCN := NormalizeDomain(c.Subject.CN) 196 | normalizedDomains := c.SortedNormalizedDomains() 197 | outputDomains := make([]string, len(normalizedDomains)) 198 | for i, d := range normalizedDomains { 199 | if strings.HasPrefix(d, "*.") { 200 | outputDomains[i] = d[2:] 201 | } else { 202 | outputDomains[i] = d 203 | } 204 | } 205 | outputDomainsStr := strings.Join(outputDomains, ",") 206 | primaryDomain := "" 207 | if len(normalizedDomains) > 0 { 208 | primaryDomain = normalizedDomains[0] 209 | } 210 | hash := calculateDomainOrgHash(normalizedDomains, c.Subject.O) 211 | return fmt.Sprintf("%d,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",%s\n", 212 | certIndex, 213 | normalizedCN, 214 | primaryDomain, 215 | outputDomainsStr, 216 | c.Subject.C, 217 | c.Subject.ST, 218 | c.Subject.L, 219 | c.Subject.O, 220 | c.Issuer.CN, 221 | hash, 222 | ) 223 | } 224 | 225 | // CertificateFromX509 creates a CertificateData from an x509 Certificate. 
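// A minimal usage sketch (mirrors what ParseCertificateEntry does with the DER
// bytes it extracts; the log URL is a placeholder):
//
//	cert, err := x509.ParseCertificate(derBytes)
//	if err == nil {
//		cd := CertificateFromX509(cert, "ct.example.invalid/2025")
//		fmt.Println(cd.SortedNormalizedDomains())
//	}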
226 | func CertificateFromX509(cert *x509.Certificate, source string) *CertificateData {
227 | cd := &CertificateData{
228 | Type: "X509LogEntry",
229 | Subject: SubjectData{
230 | Aggregated: cert.Subject.String(),
231 | CN: cert.Subject.CommonName,
232 | },
233 | Issuer: SubjectData{
234 | Aggregated: cert.Issuer.String(),
235 | CN: cert.Issuer.CommonName,
236 | },
237 | NotBefore: cert.NotBefore.Unix(),
238 | NotAfter: cert.NotAfter.Unix(),
239 | Source: map[string]string{"url": source},
240 | Extensions: make(map[string]string),
241 | }
242 | if len(cert.Subject.Country) > 0 {
243 | cd.Subject.C = cert.Subject.Country[0]
244 | }
245 | if len(cert.Subject.Organization) > 0 {
246 | cd.Subject.O = cert.Subject.Organization[0]
247 | }
248 | if len(cert.Subject.OrganizationalUnit) > 0 {
249 | cd.Subject.OU = cert.Subject.OrganizationalUnit[0]
250 | }
251 | if len(cert.Subject.Locality) > 0 {
252 | cd.Subject.L = cert.Subject.Locality[0]
253 | }
254 | if len(cert.Subject.Province) > 0 {
255 | cd.Subject.ST = cert.Subject.Province[0]
256 | }
257 | if len(cert.Issuer.Country) > 0 {
258 | cd.Issuer.C = cert.Issuer.Country[0]
259 | }
260 | if len(cert.Issuer.Organization) > 0 {
261 | cd.Issuer.O = cert.Issuer.Organization[0]
262 | }
263 | derBytes := cert.Raw
264 | cd.AsDER = base64.StdEncoding.EncodeToString(derBytes)
265 | domains := make([]string, 0, len(cert.DNSNames)+1)
266 | if cert.Subject.CommonName != "" {
267 | domains = append(domains, cert.Subject.CommonName)
268 | }
269 | domains = append(domains, cert.DNSNames...)
270 | seenDomains := make(map[string]bool, len(domains))
271 | cd.AllDomains = make([]string, 0, len(domains))
272 | for _, domain := range domains {
273 | if !seenDomains[domain] {
274 | seenDomains[domain] = true
275 | cd.AllDomains = append(cd.AllDomains, domain)
276 | }
277 | }
278 | return cd
279 | }
280 | 
281 | // NormalizeDomain standardizes domain names.
282 | func NormalizeDomain(domain string) string {
283 | domain = strings.TrimSpace(domain)
284 | if domain == "" || strings.ContainsAny(domain, " \t\n") {
285 | if strings.ContainsAny(domain, " :/") || domain == "::1" || strings.HasPrefix(domain, "-") {
286 | return domain
287 | }
288 | return ""
289 | }
290 | domain = strings.ToLower(domain)
291 | for strings.HasPrefix(domain, ".") {
292 | domain = domain[1:]
293 | }
294 | for strings.HasSuffix(domain, ".") {
295 | domain = domain[:len(domain)-1]
296 | }
297 | if domain == "" {
298 | return ""
299 | }
300 | // Keep a leading wildcard label ("*.") intact: ToDomainsCSVLine strips it for its output column,
301 | // and the normalization tests expect wildcards to survive normalization unchanged
302 | // (see the "Wildcard" cases in TestNormalizeDomain).
303 | parts := strings.SplitSeq(domain, ".")
304 | for part := range parts {
305 | if strings.HasPrefix(part, "-") || strings.HasSuffix(part, "-") || strings.HasPrefix(part, "*") {
306 | return domain // Structurally odd labels are returned unchanged
307 | }
308 | }
309 | return domain
310 | }
311 | -------------------------------------------------------------------------------- /internal/certlib/models_test.go: --------------------------------------------------------------------------------
1 | package certlib
2 | 
3 | /*
4 | rxtls — fast tool in Go for working with Certificate Transparency logs
5 | Copyright (C) 2025 Pepijn van der Stap
6 | 
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU Affero General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "fmt" 23 | "sort" 24 | "strings" 25 | "testing" 26 | 27 | "github.com/zeebo/xxh3" 28 | ) 29 | 30 | // calculateExpectedDomainOrgHash is a test helper using xxh3. 31 | func calculateExpectedDomainOrgHash(domains []string, org string) string { 32 | uniqueMap := make(map[string]bool) 33 | var normalizedDomains []string 34 | for _, d := range domains { 35 | n := NormalizeDomain(d) 36 | if n != "" && !uniqueMap[n] { 37 | uniqueMap[n] = true 38 | normalizedDomains = append(normalizedDomains, n) 39 | } 40 | } 41 | sort.Strings(normalizedDomains) 42 | domainsStr := strings.Join(normalizedDomains, ",") 43 | h := xxh3.HashString(fmt.Sprintf("%s|%s", domainsStr, org)) 44 | return fmt.Sprintf("%x", h) 45 | } 46 | 47 | // TestToDomainsCSVLine validates the domain-focused CSV output. 48 | func TestToDomainsCSVLine(t *testing.T) { 49 | t.Parallel() 50 | certIndex := 12345 51 | testCases := []struct { 52 | name string 53 | certData CertificateData 54 | }{ 55 | { 56 | name: "Simple CN, single SAN", 57 | certData: CertificateData{ 58 | Subject: SubjectData{CN: "example.com", O: "Test Org Inc.", C: "US", ST: "California", L: "Mountain View"}, 59 | Issuer: SubjectData{CN: "Test CA"}, 60 | AllDomains: []string{"example.com", "www.example.com"}, 61 | }, 62 | }, 63 | { 64 | name: "Mixed case, trailing dots, duplicate SAN", 65 | certData: CertificateData{ 66 | Subject: SubjectData{CN: "EXAMPLE.net.", O: "Another, Org", C: "GB", ST: "", L: "London"}, 67 | Issuer: SubjectData{CN: "Issuing CA Ltd.", O: "Issuer Org"}, 68 | AllDomains: []string{"EXAMPLE.net.", "WWW.example.net", "www.example.net"}, 69 | }, 70 | }, 71 | { 72 | name: "Wildcard domain (gets stripped in output list)", 73 | certData: CertificateData{ 74 | Subject: SubjectData{CN: "*.example.org", O: "Wild Org", C: "", ST: "", L: ""}, 75 | Issuer: SubjectData{CN: "Wild CA"}, 76 | AllDomains: []string{"*.example.org", "example.org"}, 77 | }, 78 | }, 79 | { 80 | name: "No CN, only SANs", 81 | certData: CertificateData{ 82 | Subject: SubjectData{CN: "", O: "SAN Org", C: "DE", ST: "Berlin", L: "Berlin"}, 83 | Issuer: SubjectData{CN: "SAN Issuer"}, 84 | AllDomains: []string{"san1.com", "san2.com"}, 85 | }, 86 | }, 87 | { 88 | name: "No domains at all", 89 | certData: CertificateData{ 90 | Subject: SubjectData{CN: "", O: "Empty Org", C: "JP", ST: "Tokyo", L: "Tokyo"}, 91 | Issuer: SubjectData{CN: "Empty Issuer"}, 92 | AllDomains: []string{}, 93 | }, 94 | }, 95 | { 96 | name: "CN needs normalization, SAN is primary", 97 | certData: CertificateData{ 98 | Subject: SubjectData{CN: " INVALID CN ", O: "Norm Org", C: "CA", ST: "Ontario", L: "Toronto"}, 99 | Issuer: SubjectData{CN: "Norm CA"}, 100 | AllDomains: []string{" a.valid.domain ", " INVALID CN "}, 101 | }, 102 | }, 103 | } 104 | 105 | for _, tc := range testCases { 106 | tc := tc 107 | t.Run(tc.name, func(t *testing.T) { 108 | t.Parallel() 109 | expectedNormalizedCN := NormalizeDomain(tc.certData.Subject.CN) 110 | normalizedSortedDomains := tc.certData.SortedNormalizedDomains() 111 | expectedOutputDomains := make([]string, len(normalizedSortedDomains)) 112 | for i, d := range 
normalizedSortedDomains { 113 | if strings.HasPrefix(d, "*.") { 114 | expectedOutputDomains[i] = d[2:] 115 | } else { 116 | expectedOutputDomains[i] = d 117 | } 118 | } 119 | expectedOutputDomainsStr := strings.Join(expectedOutputDomains, ",") 120 | expectedPrimaryDomain := "" 121 | if len(normalizedSortedDomains) > 0 { 122 | expectedPrimaryDomain = normalizedSortedDomains[0] 123 | } 124 | hashExpected := calculateExpectedDomainOrgHash(tc.certData.AllDomains, tc.certData.Subject.O) 125 | expectedOutput := fmt.Sprintf("%d,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",%s\n", 126 | certIndex, expectedNormalizedCN, expectedPrimaryDomain, expectedOutputDomainsStr, 127 | tc.certData.Subject.C, tc.certData.Subject.ST, tc.certData.Subject.L, tc.certData.Subject.O, 128 | tc.certData.Issuer.CN, hashExpected) 129 | actualOutput := tc.certData.ToDomainsCSVLine(certIndex) 130 | if actualOutput != expectedOutput { 131 | t.Errorf("ToDomainsCSVLine() mismatch:\n Input: %+v\n Want: %q\n Got: %q", tc.certData, expectedOutput, actualOutput) 132 | } 133 | }) 134 | } 135 | } 136 | 137 | // BenchmarkSortedNormalizedDomains measures performance of getting unique, sorted, 138 | // normalized domains from a CertificateData struct with a large SAN list. 139 | func BenchmarkSortedNormalizedDomains(b *testing.B) { 140 | size := 100000 141 | allDomains := make([]string, size) 142 | for i := range size { 143 | prefix := "" 144 | suffix := ".com" 145 | if i%10 == 0 { 146 | prefix = "*.Sub." 147 | suffix = ".NET." 148 | } 149 | if i%3 == 0 { 150 | prefix += " " 151 | } 152 | baseDomain := fmt.Sprintf("%sexample-%d-%d%s", prefix, i%1000, i%50, suffix) 153 | if i > 0 && i%7 == 0 { 154 | allDomains[i] = allDomains[i-1] 155 | } else { 156 | allDomains[i] = baseDomain 157 | } 158 | } 159 | certData := &CertificateData{ 160 | AllDomains: allDomains, 161 | Subject: SubjectData{O: "Benchmark Org"}, 162 | } 163 | b.ReportAllocs() 164 | b.ResetTimer() 165 | for i := 0; i < b.N; i++ { 166 | _ = certData.SortedNormalizedDomains() 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /internal/client/http.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | /* 22 | Package client provides a configurable HTTP client for making requests to Certificate Transparency logs and other services. 23 | It includes support for connection pooling, timeouts, and a "turbo" mode for aggressive, high-throughput scenarios. 24 | 25 | The package manages a shared global HTTP client instance that can be configured once and then retrieved by multiple 26 | parts of the application. This promotes reuse of TCP connections and consistent client behavior. 
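A minimal usage sketch; the endpoint URL below is purely illustrative and error handling is abbreviated:

	client.InitHTTPClient(client.DefaultConfig())
	httpClient := client.GetHTTPClient()
	resp, err := httpClient.Get("https://ct.example.org/ct/v1/get-sth")
	if err != nil {
		return err
	}
	defer resp.Body.Close()

For aggressive, massively parallel fetching, ConfigureTurboMode swaps in larger connection pools and longer keep-alive/idle timeouts in a single call.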
27 | */ 28 | 29 | import ( 30 | "net" 31 | "net/http" 32 | "sync" 33 | "time" 34 | ) 35 | 36 | // HTTP client-specific constants. 37 | const ( 38 | // DialTimeout is the maximum amount of time a dial will wait for a connect to complete. 39 | DialTimeout = 5 * time.Second 40 | // KeepAliveTimeout is the interval between keep-alive probes for active network connections. 41 | // If zero, keep-alive probes are sent with a default OS-dependent interval. 42 | KeepAliveTimeout = 60 * time.Second 43 | // RequestTimeout is the timeout for the entire HTTP request, including connection time, all redirects, and reading the response body. 44 | RequestTimeout = 15 * time.Second 45 | // MaxIdleConnsPerHost is the maximum number of idle (keep-alive) connections to keep per-host. 46 | MaxIdleConnsPerHost = 150 // Default value, can be overridden by Config. 47 | ) 48 | 49 | var ( 50 | // defaultDialTimeout specifies the default timeout for establishing a new connection. 51 | defaultDialTimeout = 5 * time.Second 52 | // defaultKeepAliveTimeout specifies the default keep-alive period for an active network connection. 53 | defaultKeepAliveTimeout = 60 * time.Second 54 | // defaultIdleConnTimeout is the maximum amount of time an idle (keep-alive) connection will remain 55 | // idle before closing itself. 56 | defaultIdleConnTimeout = 90 * time.Second 57 | // defaultMaxIdleConns controls the maximum number of idle (keep-alive) connections across all hosts. 58 | defaultMaxIdleConns = 100 59 | // defaultMaxConnsPerHost controls the maximum number of connections per host (includes dial, active, and idle). 60 | defaultMaxConnsPerHost = 100 61 | // defaultRequestTimeout specifies the default timeout for a complete HTTP request. 62 | defaultRequestTimeout = 15 * time.Second 63 | 64 | // sharedClient is the global HTTP client instance used by the application. 65 | // It is lazily initialized on first use or when explicitly configured. 66 | sharedClient *http.Client 67 | // sharedClientLock protects access to sharedClient and clientInitialized. 68 | sharedClientLock sync.RWMutex 69 | // clientInitialized indicates whether the sharedClient has been initialized. 70 | clientInitialized bool 71 | ) 72 | 73 | // Config holds configuration parameters for the HTTP client. 74 | // These settings allow tuning of connection pooling, timeouts, and other transport-level behaviors. 75 | // A zero-value Config will result in default settings being used. 76 | type Config struct { 77 | // DialTimeout is the maximum duration for establishing a new connection. 78 | DialTimeout time.Duration 79 | // KeepAliveTimeout specifies the keep-alive period for an active network connection. 80 | KeepAliveTimeout time.Duration 81 | // IdleConnTimeout is the maximum amount of time an idle (keep-alive) connection 82 | // will remain idle before closing itself. 83 | IdleConnTimeout time.Duration 84 | // MaxIdleConns controls the maximum number of idle (keep-alive) connections across all hosts. 85 | MaxIdleConns int 86 | // MaxConnsPerHost controls the maximum number of connections per host, including connections in the dialing, 87 | // active, and idle states. On limit violation, dials will block. 88 | MaxConnsPerHost int 89 | // RequestTimeout is the timeout for the entire HTTP request, including connection time, 90 | // all redirects, and reading the response body. 91 | RequestTimeout time.Duration 92 | } 93 | 94 | // DefaultConfig returns a new Config struct populated with default HTTP client settings. 
95 | // These defaults are sensible for general-purpose HTTP interactions but may need tuning 96 | // for specific high-performance or constrained environments. 97 | func DefaultConfig() *Config { 98 | return &Config{ 99 | DialTimeout: defaultDialTimeout, 100 | KeepAliveTimeout: defaultKeepAliveTimeout, 101 | IdleConnTimeout: defaultIdleConnTimeout, 102 | MaxIdleConns: defaultMaxIdleConns, 103 | MaxConnsPerHost: defaultMaxConnsPerHost, 104 | RequestTimeout: defaultRequestTimeout, 105 | } 106 | } 107 | 108 | // InitHTTPClient initializes or reconfigures the shared global HTTP client with the provided configuration. 109 | // If a nil config is provided, it uses the default configuration obtained from DefaultConfig(). 110 | // This function is thread-safe. 111 | // 112 | // Note: Calling this function will replace the existing shared client, potentially affecting 113 | // in-flight requests made with the old client if its transport was not reusable or if connections 114 | // were specific to the old transport's settings. 115 | func InitHTTPClient(config *Config) { 116 | sharedClientLock.Lock() 117 | defer sharedClientLock.Unlock() 118 | 119 | if config == nil { 120 | config = DefaultConfig() 121 | } 122 | 123 | // Configure the transport with timeouts and connection pooling options. 124 | // ForceAttemptHTTP2 is enabled to prefer HTTP/2 if available. 125 | transport := &http.Transport{ 126 | Proxy: http.ProxyFromEnvironment, // Respect standard proxy environment variables. 127 | DialContext: (&net.Dialer{ 128 | Timeout: config.DialTimeout, 129 | KeepAlive: config.KeepAliveTimeout, // Enables TCP keep-alives. 130 | }).DialContext, 131 | MaxIdleConns: config.MaxIdleConns, 132 | MaxIdleConnsPerHost: config.MaxConnsPerHost, 133 | IdleConnTimeout: config.IdleConnTimeout, 134 | DisableCompression: false, // Enable compression (e.g., gzip) by default. 135 | ForceAttemptHTTP2: true, // Try to use HTTP/2. 136 | } 137 | 138 | sharedClient = &http.Client{ 139 | Transport: transport, 140 | Timeout: config.RequestTimeout, // Overall request timeout. 141 | } 142 | 143 | clientInitialized = true 144 | } 145 | 146 | // GetHTTPClient returns the shared global HTTP client instance. 147 | // If the client has not been initialized, it will be initialized with default settings. 148 | // This function is thread-safe. 149 | func GetHTTPClient() *http.Client { 150 | sharedClientLock.RLock() // Use RLock for initial check to allow concurrent reads. 151 | if !clientInitialized { 152 | sharedClientLock.RUnlock() 153 | // Client not initialized, need to acquire a write lock. 154 | // This double-check locking pattern minimizes write lock contention. 155 | InitHTTPClient(nil) // Initialize with defaults under a write lock. 156 | sharedClientLock.RLock() // Re-acquire read lock to safely access sharedClient. 157 | } 158 | client := sharedClient 159 | sharedClientLock.RUnlock() 160 | return client 161 | } 162 | 163 | // ConfigureHTTPClient provides a convenience function to update the shared HTTP client's configuration. 164 | // It's equivalent to calling InitHTTPClient. 165 | // This function is thread-safe. 166 | func ConfigureHTTPClient(config *Config) { 167 | InitHTTPClient(config) // InitHTTPClient handles locking. 168 | } 169 | 170 | // ConfigureTurboMode applies a set of aggressive HTTP client settings optimized for 171 | // high-throughput scenarios, such as massively parallel log fetching. 
172 | // This typically involves shorter dial timeouts, longer keep-alive and idle timeouts, 173 | // and higher connection pool limits. 174 | // This function is thread-safe. 175 | func ConfigureTurboMode() { 176 | turboConfig := &Config{ 177 | DialTimeout: 2 * time.Second, // Faster dial attempts. 178 | KeepAliveTimeout: 120 * time.Second, // Keep connections alive longer. 179 | IdleConnTimeout: 120 * time.Second, // Allow idle connections to persist longer. 180 | MaxIdleConns: 500, // Larger overall idle connection pool. 181 | MaxConnsPerHost: 200, // More connections allowed per host. 182 | RequestTimeout: 30 * time.Second, // Slightly longer request timeout for potentially slower turbo operations. 183 | } 184 | ConfigureHTTPClient(turboConfig) 185 | } 186 | -------------------------------------------------------------------------------- /internal/core/common.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | 5 | Key responsibilities of the core package include: 6 | - Managing concurrent operations through a worker pool (Scheduler). 7 | - Orchestrating the download of certificate entries from Certificate Transparency logs (DownloadManager). 8 | - Processing downloaded entries to extract domain names and other relevant metadata (DomainExtractor - if used). 9 | - Defining shared data types like WorkItem and CTLogInfo (though CTLogInfo is primarily from certlib). 10 | - Establishing common constants for retry logic, queue sizes, and default behaviors. 11 | */ 12 | package core 13 | 14 | import ( 15 | "bufio" 16 | "context" 17 | "os" 18 | "sync" 19 | "time" 20 | 21 | "github.com/x-stp/rxtls/internal/certlib" 22 | ) 23 | 24 | // Common constants used across the core package. 25 | // These values configure aspects like worker queue capacities, scheduler behavior, and retry policies. 26 | const ( 27 | // WorkerQueueCapacity defines the maximum number of work items that can be buffered in a single worker's queue. 28 | // A larger capacity can absorb more bursty workloads but consumes more memory. 29 | WorkerQueueCapacity = 500000 30 | 31 | // MaxShardQueueSize is the maximum size of a shard's queue in the scheduler. 32 | // This is used when initializing workers and their individual limiter burst sizes. 33 | // It defines how many items can be enqueued for a specific shard (log URL hash) before backpressure occurs. 34 | MaxShardQueueSize = 1000 35 | 36 | // WorkerMultiplier determines the number of worker goroutines relative to the number of CPU cores. 37 | // For example, a multiplier of 2 on an 8-core machine would create 16 workers. 38 | WorkerMultiplier = 2 39 | 40 | // RetryBaseDelay is the initial delay before the first retry attempt for a failed operation. 41 | // Subsequent retries use exponential backoff based on this delay. 42 | RetryBaseDelay = 125 * time.Millisecond 43 | // RetryMaxDelay is the maximum delay between retry attempts, capping the exponential backoff. 44 | RetryMaxDelay = 30 * time.Second 45 | // RetryBackoffMultiplier is the factor by which the retry delay increases after each failed attempt. 46 | RetryBackoffMultiplier = 1.5 47 | // RetryJitterFactor introduces randomness to retry delays to prevent thundering herd problems. 
48 | // The actual jitter is calculated as a percentage of the current delay (e.g., 0.2 means +/- 20% jitter). 49 | RetryJitterFactor = 0.2 50 | ) 51 | 52 | // WorkItem represents a discrete unit of work to be processed by a worker in the scheduler. 53 | // It encapsulates all necessary information for a task, including the target log, entry range, 54 | // callback function, and retry state. 55 | // WorkItems are typically pooled and reused to reduce allocations. 56 | type WorkItem struct { 57 | // Immutable fields, set at creation and not changed during the WorkItem's lifecycle. 58 | 59 | // LogURL is the URL of the Certificate Transparency log server for this work item. 60 | LogURL string 61 | // LogInfo provides detailed metadata about the CT log, such as its tree size and block size. 62 | // This is a pointer to a shared certlib.CTLogInfo struct. 63 | LogInfo *certlib.CTLogInfo 64 | // Start is the starting index of the certificate entry range for this work item. 65 | Start int64 66 | // End is the ending index (inclusive) of the certificate entry range. 67 | End int64 68 | // Callback is the function that will be executed by a worker to process this WorkItem. 69 | // It takes the WorkItem itself as an argument and returns an error if processing fails. 70 | Callback WorkCallback 71 | // Ctx is the context associated with this specific work item. It can be used for cancellation 72 | // that is specific to this item, separate from the broader scheduler or worker context. 73 | Ctx context.Context 74 | // CreatedAt records the time when the WorkItem was initially created or retrieved from a pool. 75 | // Useful for tracking queue latency or item age. 76 | CreatedAt time.Time 77 | 78 | // Mutable fields, potentially modified during processing or retry attempts. 79 | 80 | // Attempt is the number of times this WorkItem has been attempted. 81 | // Starts at 0 for the first attempt. 82 | Attempt int 83 | // Error stores any error encountered during the execution of the Callback function. 84 | // It is nil if the callback was successful. 85 | Error error 86 | } 87 | 88 | // WorkCallback defines the signature for functions that can process a WorkItem. 89 | // These functions are executed by the scheduler's worker goroutines. 90 | // The WorkItem itself is passed as an argument, allowing the callback to access 91 | // log information, entry ranges, and its own context. 92 | // An error should be returned if the processing fails, which may trigger retry logic. 93 | type WorkCallback func(item *WorkItem) error 94 | 95 | // lockedWriter provides a thread-safe wrapper around a bufio.Writer, typically used for 96 | // writing output to files concurrently from multiple goroutines. 97 | // It embeds a sync.Mutex to protect access to the underlying writer and associated file resources. 98 | // 99 | // Fields for filePath and finalPath are included to support atomic-like file operations 100 | // where data is written to a temporary file and then renamed to its final destination upon 101 | // successful completion, preventing partially written or corrupt files from being visible. 102 | type lockedWriter struct { 103 | // writer is the buffered writer used for efficient I/O. 104 | writer *bufio.Writer 105 | // gzWriter is an optional gzip.Writer, used if output compression is enabled. 106 | // It implements the io.Closer interface for proper resource release. 107 | gzWriter interface{ Close() error } 108 | // file is the underlying os.File being written to.
109 | file *os.File 110 | // mu is the mutex protecting concurrent access to the writer, gzWriter, and file. 111 | mu sync.Mutex 112 | // filePath is the path to the temporary file being written. 113 | filePath string 114 | // finalPath is the intended final path for the file after all writes are complete and successful. 115 | finalPath string 116 | } 117 | -------------------------------------------------------------------------------- /internal/core/constants.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core constants that are not specific to a single manager/component but are shared across the core logic. 3 | This file centralizes various configurable parameters related to memory management, networking behavior, 4 | CT log interaction defaults, disk I/O, and observability. 5 | 6 | These constants are intended to provide sensible defaults and can be tuned for different performance profiles 7 | or operational environments. They are distinct from the very fundamental constants defined in common.go 8 | (like worker multipliers or base retry delays) and focus more on higher-level application behavior settings. 9 | */ 10 | package core 11 | 12 | /* 13 | rxtls — fast tool in Go for working with Certificate Transparency logs 14 | Copyright (C) 2025 Pepijn van der Stap 15 | 16 | This program is free software: you can redistribute it and/or modify 17 | it under the terms of the GNU Affero General Public License as published by 18 | the Free Software Foundation, either version 3 of the License, or 19 | (at your option) any later version. 20 | 21 | This program is distributed in the hope that it will be useful, 22 | but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | GNU Affero General Public License for more details. 25 | 26 | You should have received a copy of the GNU Affero General Public License 27 | along with this program. If not, see . 28 | */ 29 | 30 | import ( 31 | "time" 32 | ) 33 | 34 | // Application-wide constants for tuning performance and behavior. 35 | const ( 36 | // --- Memory --- 37 | 38 | // MaxWorkers defines the absolute upper limit on the number of concurrent worker goroutines 39 | // that the scheduler will create. This acts as a safeguard regardless of CPU core count or multipliers. 40 | MaxWorkers = 2048 41 | 42 | // DefaultShards specifies the default number of shards used by the scheduler for distributing 43 | // work based on log URL hashing. This helps in balancing load across workers. 44 | // This value is not directly used by the current scheduler implementation, which shards by numWorkers. 45 | DefaultShards = 32 // TODO: Re-evaluate or remove if scheduler sharding remains worker-based. 46 | 47 | // CacheLineSize is a common CPU cache line size in bytes. It's used as a guideline for padding 48 | // in data structures to help prevent false sharing when multiple CPU cores access adjacent memory locations. 49 | CacheLineSize = 64 50 | 51 | // DefaultNetworkBufferSize is the default size for buffers used in network read operations. 52 | // Larger buffers can reduce the number of read syscalls but increase memory footprint. 53 | DefaultNetworkBufferSize = 256 * 1024 // 256KB 54 | 55 | // DefaultDiskBufferSize is the default size for `bufio.Writer` instances used for disk I/O. 56 | // Similar to network buffers, this trades memory for potentially fewer write syscalls. 
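// As a rough illustration: sixteen output files being written concurrently at
// this buffer size hold about 16 × 256KB ≈ 4MB of data in memory before a flush.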
57 | DefaultDiskBufferSize = 256 * 1024 // 256KB 58 | 59 | // CertProcessingBatchSize dictates how many certificates are grouped together for logical processing steps, 60 | // such as batching writes to disk or updating progress metrics. 61 | CertProcessingBatchSize = 1024 * 10 62 | 63 | // --- Networking --- 64 | 65 | // MaxNetworkRetries specifies the maximum number of times a failed network operation 66 | // (like fetching STH or log entries) will be retried by components in `certlib`. 67 | MaxNetworkRetries = 6 68 | 69 | // MaxSubmitRetries is the maximum number of times a component (like DownloadManager or DomainExtractor) 70 | // will attempt to submit a work item to a worker's queue if it's initially full (ErrQueueFull). 71 | // This is for retrying the *submission* to the queue, not the work item execution itself. 72 | MaxSubmitRetries = 2 // Reduced from 5 as queue full should be handled by rate limiting ideally. 73 | 74 | // DialTimeout limits the time spent establishing a new TCP connection to a remote server. 75 | DialTimeout = 10 * time.Second 76 | 77 | // RequestTimeout sets the maximum duration for an entire HTTP request, encompassing 78 | // connection establishment, sending the request, and receiving the full response body. 79 | // This is typically applied at the http.Client level. 80 | RequestTimeout = 15 * time.Second 81 | 82 | // KeepAliveTimeout defines the keep-alive period for an active network connection. 83 | // This is used by the net.Dialer to configure TCP keep-alives. 84 | KeepAliveTimeout = 60 * time.Second 85 | 86 | // ReadTimeout is the maximum duration for reading the next chunk of data from a connection 87 | // after a successful connection and request send. Not directly used by client, but a common HTTP server setting. 88 | ReadTimeout = 15 * time.Second // Typically a server-side setting or per-request on client. 89 | 90 | // IdleConnTimeout is the maximum amount of time an idle (keep-alive) connection will remain 91 | // in the HTTP client's connection pool before being closed. 92 | IdleConnTimeout = 120 * time.Second 93 | 94 | // ResponseHeaderTimeout limits the time spent waiting to receive the complete response headers 95 | // from the server after the request has been sent. 96 | ResponseHeaderTimeout = 15 * time.Second 97 | 98 | // MaxIdleConnsPerHost controls the maximum number of idle connections that will be maintained 99 | // in the pool for any single host. This helps prevent resource exhaustion when interacting 100 | // with many different hosts. 101 | MaxIdleConnsPerHost = 55 102 | 103 | // DefaultRequestTimeout is a general default timeout for HTTP requests, potentially used 104 | // by components that don't have a more specific timeout configured. 105 | // It's similar to RequestTimeout but might be used as a fallback. 106 | DefaultRequestTimeout = 30 * time.Second 107 | 108 | // --- CT Log Specific --- 109 | 110 | // DefaultLogEntryBlockSize is the number of entries to request in a single `get-entries` 111 | // call if the CT log does not specify its own preferred block size (max_entries_per_get). 112 | DefaultLogEntryBlockSize = 64 113 | 114 | // DefaultBatchSize defines a common batch size for fetching entries from CT logs. 115 | // This is often a multiple of the log's block size. 116 | DefaultBatchSize = 1024 * 4 117 | 118 | // DefaultMaxParallelBatches sets a soft limit on how many batches of log entries 119 | // might be processed in parallel by the application. This can help manage memory and CPU load. 
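// As a point of reference, at DefaultBatchSize (4,096 entries) per batch, 50
// parallel batches correspond to roughly 200,000 entries in flight at once.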
120 | DefaultMaxParallelBatches = 50 // This constant appears to be for higher-level batching strategy. 121 | 122 | // MaxConcurrentDownloadsPerHost limits how many concurrent `get-entries` requests rxtls 123 | // will make to a single CT log server host. This is crucial for being a good network citizen. 124 | // This would typically be enforced by the HTTP client's MaxConnsPerHost or similar, or custom logic. 125 | MaxConcurrentDownloadsPerHost = 50 126 | 127 | // MaxRetries defines the maximum number of retries for failed network operations. 128 | // This is similar to MaxNetworkRetries but might be used by different components with different retry policies. 129 | MaxRetries = 5 130 | 131 | // --- Disk I/O --- 132 | 133 | // DiskFlushBatchSize indicates how many *processed* certificate entries should trigger 134 | // a flush of the output file buffer to disk. This helps ensure data is persisted regularly. 135 | DiskFlushBatchSize = CertProcessingBatchSize 136 | 137 | // --- Observability --- 138 | 139 | // RequestHistorySize is the number of recent network request details to retain in memory 140 | // for observability or debugging purposes (e.g., for a live dashboard or error analysis). 141 | RequestHistorySize = 1000 // Currently not implemented, but a common pattern. 142 | 143 | // LogHistorySize determines the number of recent log messages to keep in an in-memory buffer 144 | // for potential display or inspection, especially in UIs or diagnostic tools. 145 | LogHistorySize = 5000 // Currently not implemented. 146 | 147 | // StatsReportInterval specifies how frequently summary statistics (e.g., download progress, 148 | // processing rates) should be reported, typically to standard output or a log file. 149 | StatsReportInterval = 10 * time.Second 150 | 151 | // MinimumProgressLoggingInterval defines the minimum time that must elapse between 152 | // progress log updates to avoid flooding logs with too frequent updates. 153 | MinimumProgressLoggingInterval = 5 * time.Second 154 | ) 155 | -------------------------------------------------------------------------------- /internal/core/download_manager.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 
19 | */ 20 | 21 | import ( 22 | "bufio" 23 | "compress/gzip" 24 | "context" 25 | "errors" 26 | "fmt" 27 | "log" 28 | "math/rand" 29 | "os" 30 | "path/filepath" 31 | "runtime" 32 | "strings" 33 | "sync" 34 | "sync/atomic" 35 | "time" 36 | 37 | "github.com/x-stp/rxtls/internal/certlib" 38 | "github.com/x-stp/rxtls/internal/util" 39 | 40 | "github.com/zeebo/xxh3" 41 | ) 42 | 43 | // Constants for download performance 44 | const ( 45 | // OutputFlushInterval is how often to flush buffers to disk 46 | OutputFlushInterval = 5 * time.Second 47 | 48 | // Setup concurrency maximums 49 | MaxSetupConcurrency = 16 50 | 51 | // Writer buffer sizes 52 | DefaultBufferSize = 8 * 1024 * 1024 // 8MB 53 | 54 | // Memory pool size for string building 55 | StringPoolSize = 1024 * 1024 // 1MB 56 | 57 | // Distribution strategy - submit in batches to allow better parallelism 58 | batchSize int64 = 100 // Submit blocks in batches 59 | ) 60 | 61 | // Error types specific to download operations 62 | var ( 63 | ErrDownloadCancelled = errors.New("download operation cancelled") 64 | ErrLogSetupFailed = errors.New("log setup failed") 65 | ErrDownloadFailed = errors.New("download failed") 66 | ) 67 | 68 | // DownloadManager manages the process of downloading raw cert entries from CT logs. 69 | type DownloadManager struct { 70 | scheduler *Scheduler 71 | config *DownloadConfig 72 | stats *DownloadStats 73 | ctx context.Context 74 | cancel context.CancelFunc 75 | outputMap sync.Map // Maps log URL -> *lockedWriter 76 | stringPool sync.Pool // Reusable string builders 77 | setupComplete atomic.Bool 78 | } 79 | 80 | // DownloadConfig holds configuration for downloading. 81 | type DownloadConfig struct { 82 | OutputDir string 83 | BufferSize int 84 | MaxConcurrentLogs int 85 | CompressOutput bool // If true, output files will be .gz 86 | } 87 | 88 | // DownloadStats holds runtime statistics for downloads. 
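// The counter fields are atomic, so a *DownloadStats can be read while downloads
// are still in progress; the Get* accessors below are the convenient way to do so.
// For example, given a manager dm, a rough throughput figure (entries per second)
// can be computed as (illustrative):
//
//	stats := dm.GetStats()
//	rate := float64(stats.GetProcessedEntries()) / time.Since(stats.GetStartTime()).Seconds()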
89 | type DownloadStats struct { 90 | TotalLogs atomic.Int64 91 | ProcessedLogs atomic.Int64 92 | FailedLogs atomic.Int64 93 | TotalEntries atomic.Int64 94 | ProcessedEntries atomic.Int64 // Entries successfully fetched and written 95 | FailedEntries atomic.Int64 // Entries failed (download, parse leaf, write) 96 | OutputBytesWritten atomic.Int64 97 | StartTime time.Time 98 | RetryCount atomic.Int64 // Count of retried blocks 99 | SuccessFirstTry atomic.Int64 // Count of blocks successful on first try 100 | } 101 | 102 | // GetStartTime returns the start time of the download process 103 | func (s *DownloadStats) GetStartTime() time.Time { return s.StartTime } 104 | 105 | // GetTotalLogs returns the total number of logs being processed 106 | func (s *DownloadStats) GetTotalLogs() int64 { return s.TotalLogs.Load() } 107 | 108 | // GetProcessedLogs returns the number of logs successfully processed 109 | func (s *DownloadStats) GetProcessedLogs() int64 { return s.ProcessedLogs.Load() } 110 | 111 | // GetFailedLogs returns the number of logs that failed processing 112 | func (s *DownloadStats) GetFailedLogs() int64 { return s.FailedLogs.Load() } 113 | 114 | // GetTotalEntries returns the total number of entries to be processed 115 | func (s *DownloadStats) GetTotalEntries() int64 { return s.TotalEntries.Load() } 116 | 117 | // GetProcessedEntries returns the number of entries successfully processed 118 | func (s *DownloadStats) GetProcessedEntries() int64 { return s.ProcessedEntries.Load() } 119 | 120 | // GetFailedEntries returns the number of entries that failed processing 121 | func (s *DownloadStats) GetFailedEntries() int64 { return s.FailedEntries.Load() } 122 | 123 | // GetOutputBytesWritten returns the total bytes written to output files 124 | func (s *DownloadStats) GetOutputBytesWritten() int64 { return s.OutputBytesWritten.Load() } 125 | 126 | // GetTotalDomainsFound returns the total domains found (not applicable for download stats) 127 | func (s *DownloadStats) GetTotalDomainsFound() int64 { return 0 } 128 | 129 | // GetRetryRate returns the retry rate as a fraction of processed entries 130 | func (s *DownloadStats) GetRetryRate() float64 { 131 | if s.ProcessedEntries.Load() == 0 { 132 | return 0 133 | } 134 | return float64(s.RetryCount.Load()) / float64(s.ProcessedEntries.Load()) 135 | } 136 | 137 | // NewDownloadManager creates a new download manager instance. 
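// A minimal construction sketch; ctx is assumed to be a cancellable parent
// context and the output directory is only an example:
//
//	dm, err := NewDownloadManager(ctx, &DownloadConfig{
//		OutputDir:      "certs-out",
//		CompressOutput: true,
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer dm.Shutdown()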
138 | func NewDownloadManager(ctx context.Context, config *DownloadConfig) (*DownloadManager, error) { 139 | scheduler, err := NewScheduler(ctx) 140 | if err != nil { 141 | return nil, fmt.Errorf("failed to initialize scheduler: %w", err) 142 | } 143 | 144 | // Set a sensible default buffer size if not specified 145 | if config.BufferSize <= 0 { 146 | config.BufferSize = DefaultBufferSize 147 | } 148 | 149 | dmCtx, cancel := context.WithCancel(ctx) 150 | dm := &DownloadManager{ 151 | scheduler: scheduler, 152 | config: config, 153 | stats: &DownloadStats{StartTime: time.Now()}, 154 | ctx: dmCtx, 155 | cancel: cancel, 156 | stringPool: sync.Pool{ 157 | New: func() interface{} { 158 | return &strings.Builder{} 159 | }, 160 | }, 161 | } 162 | 163 | // Start background flush worker 164 | go dm.periodicFlush() 165 | 166 | return dm, nil 167 | } 168 | 169 | // periodicFlush runs in background to periodically flush output files 170 | func (dm *DownloadManager) periodicFlush() { 171 | ticker := time.NewTicker(OutputFlushInterval) 172 | defer ticker.Stop() 173 | 174 | for { 175 | select { 176 | case <-dm.ctx.Done(): 177 | // Flush one last time before exiting 178 | dm.flushAllWriters() 179 | return 180 | case <-ticker.C: 181 | dm.flushAllWriters() 182 | } 183 | } 184 | } 185 | 186 | // flushAllWriters flushes all writers but doesn't close them 187 | func (dm *DownloadManager) flushAllWriters() { 188 | var flushCount int 189 | dm.outputMap.Range(func(key, value interface{}) bool { 190 | if value == nil { 191 | return true 192 | } 193 | 194 | lw, ok := value.(*lockedWriter) 195 | if !ok || lw == nil { 196 | log.Printf("Warning: Invalid writer type in map during flush for key %v", key) 197 | return true 198 | } 199 | 200 | // Use a short-term lock just for flushing 201 | func() { 202 | lw.mu.Lock() 203 | defer lw.mu.Unlock() 204 | if lw.writer != nil { 205 | if err := lw.writer.Flush(); err != nil { 206 | log.Printf("Warning: Error flushing writer for %s: %v", key.(string), err) 207 | } else { 208 | flushCount++ 209 | } 210 | } 211 | }() 212 | return true 213 | }) 214 | 215 | if flushCount > 0 { 216 | log.Printf("Flushed %d output files to disk", flushCount) 217 | } 218 | } 219 | 220 | // DownloadCertificates orchestrates the download process for the given logs. 
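// A typical call sequence is sketched below; the log selection (all known logs
// via ListCTLogs here) is up to the caller:
//
//	logs, err := ListCTLogs()
//	if err != nil {
//		return err
//	}
//	if err := dm.DownloadCertificates(logs); err != nil {
//		return err
//	}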
221 | func (dm *DownloadManager) DownloadCertificates(logsToProcess interface{}) error { 222 | // Convert the interface to the expected type 223 | logs, ok := logsToProcess.([]certlib.CTLogInfo) 224 | if !ok { 225 | return fmt.Errorf("invalid logs type: expected []certlib.CTLogInfo") 226 | } 227 | 228 | dm.stats.TotalLogs.Store(int64(len(logs))) 229 | log.Printf("Starting certificate download for %d logs...", len(logs)) 230 | 231 | // Create base output directory 232 | if err := os.MkdirAll(dm.config.OutputDir, 0755); err != nil { 233 | return fmt.Errorf("failed to create output directory '%s': %w", dm.config.OutputDir, err) 234 | } 235 | 236 | // Limit concurrent setup 237 | concurrencyLimit := runtime.NumCPU() 238 | if concurrencyLimit > MaxSetupConcurrency { 239 | concurrencyLimit = MaxSetupConcurrency 240 | } 241 | 242 | // Setup logs concurrently with limited concurrency 243 | var wg sync.WaitGroup 244 | setupSem := make(chan struct{}, concurrencyLimit) 245 | setupErrors := make(chan error, len(logs)) // Collect errors 246 | 247 | for i := range logs { 248 | select { 249 | case <-dm.ctx.Done(): 250 | log.Println("Download cancelled during log setup.") 251 | return ErrDownloadCancelled 252 | case setupSem <- struct{}{}: 253 | wg.Add(1) 254 | go func(logInfo certlib.CTLogInfo) { 255 | defer wg.Done() 256 | defer func() { <-setupSem }() 257 | 258 | if err := dm.processSingleLogForDownload(&logInfo); err != nil { 259 | if !errors.Is(err, ErrDownloadCancelled) { // Don't log cancellations 260 | log.Printf("Error processing log %s for download: %v", logInfo.URL, err) 261 | } 262 | dm.stats.FailedLogs.Add(1) 263 | setupErrors <- fmt.Errorf("log %s: %w", logInfo.URL, err) 264 | } else { 265 | dm.stats.ProcessedLogs.Add(1) 266 | } 267 | }(logs[i]) 268 | } 269 | } 270 | 271 | wg.Wait() // Wait for setup goroutines 272 | close(setupErrors) 273 | 274 | // Mark setup complete 275 | dm.setupComplete.Store(true) 276 | 277 | // Check for setup errors 278 | var setupErrorsList []error 279 | for err := range setupErrors { 280 | setupErrorsList = append(setupErrorsList, err) 281 | } 282 | 283 | // If all logs failed, return a combined error 284 | if len(setupErrorsList) == len(logs) { 285 | return fmt.Errorf("%w: all logs failed setup: %v", ErrLogSetupFailed, errors.Join(setupErrorsList...)) 286 | } 287 | 288 | if dm.ctx.Err() != nil { 289 | log.Println("Download cancelled after log setup phase.") 290 | dm.Shutdown() 291 | return ErrDownloadCancelled 292 | } 293 | 294 | totalLogSize := dm.stats.TotalEntries.Load() 295 | log.Printf("All download work submitted (%d entries). Waiting for scheduler...", totalLogSize) 296 | 297 | // Wait for all submitted download tasks 298 | dm.scheduler.Wait() 299 | 300 | // Check for cancellation during processing 301 | if dm.ctx.Err() != nil { 302 | log.Println("Download cancelled during processing phase.") 303 | dm.Shutdown() 304 | return ErrDownloadCancelled 305 | } 306 | 307 | // Check if we had complete success or partial success 308 | processedEntries := dm.stats.ProcessedEntries.Load() 309 | failedEntries := dm.stats.FailedEntries.Load() 310 | 311 | log.Printf("Download processing complete. Finalizing... 
(Success: %d, Failed: %d entries)", 312 | processedEntries, failedEntries) 313 | 314 | // Shutdown (this will flush and close all writers) 315 | dm.Shutdown() 316 | 317 | // Return error if there were significant failures 318 | if failedEntries > 0 && failedEntries >= processedEntries/10 { // More than 10% failure rate 319 | return fmt.Errorf("%w: %d of %d entries failed to download", 320 | ErrDownloadFailed, failedEntries, processedEntries+failedEntries) 321 | } 322 | 323 | retryRate := dm.stats.GetRetryRate() 324 | log.Printf("Certificate download finished successfully. Retry rate: %.2f%%", retryRate*100) 325 | return nil 326 | } 327 | 328 | // processSingleLogForDownload handles STH fetch, output setup, and work submission for one log. 329 | func (dm *DownloadManager) processSingleLogForDownload(ctlog *certlib.CTLogInfo) error { 330 | log.Printf("Setting up download for log: %s", ctlog.URL) 331 | 332 | // Fetch log info with a short timeout 333 | ctxWithTimeout, cancel := context.WithTimeout(dm.ctx, 30*time.Second) 334 | defer cancel() 335 | 336 | // Create a derived context for this specific log 337 | logCtx, logCancel := context.WithCancel(dm.ctx) 338 | defer func() { 339 | // If we exit with error, cancel any pending work for this log 340 | if logCtx.Err() == nil { 341 | logCancel() 342 | } 343 | }() 344 | 345 | // Get log info with timeout 346 | if err := certlib.GetLogInfo(ctlog); err != nil { 347 | return fmt.Errorf("failed to get log info for %s: %w", ctlog.URL, err) 348 | } 349 | 350 | // Check context before proceeding 351 | if ctxWithTimeout.Err() != nil { 352 | return ErrDownloadCancelled 353 | } 354 | 355 | treeSize := int64(ctlog.TreeSize) 356 | if treeSize == 0 { 357 | log.Printf("Skipping log %s: tree size is 0", ctlog.URL) 358 | return nil 359 | } 360 | 361 | blockSize := int64(ctlog.BlockSize) 362 | if blockSize <= 0 { 363 | blockSize = DefaultLogEntryBlockSize 364 | } 365 | 366 | // Setup Output Writer 367 | filename := fmt.Sprintf("%s_certs.csv", util.SanitizeFilename(ctlog.URL)) 368 | if dm.config.CompressOutput { 369 | filename += ".gz" 370 | } 371 | filePath := filepath.Join(dm.config.OutputDir, filename) 372 | 373 | // Create output file with temp name, then rename when complete to avoid partial files 374 | tempFilePath := filePath + ".tmp" 375 | file, err := os.Create(tempFilePath) 376 | if err != nil { 377 | return fmt.Errorf("failed to create output file %s: %w", tempFilePath, err) 378 | } 379 | 380 | var writer *bufio.Writer 381 | var gzWriter *gzip.Writer 382 | 383 | if dm.config.CompressOutput { 384 | gzWriter, _ = gzip.NewWriterLevel(file, gzip.BestSpeed) 385 | writer = bufio.NewWriterSize(gzWriter, dm.config.BufferSize) 386 | } else { 387 | writer = bufio.NewWriterSize(file, dm.config.BufferSize) 388 | } 389 | 390 | // Write header: offset,leaf_input_b64,extra_data_b64 391 | headerLine := "offset,leaf_input_b64,extra_data_b64\n" 392 | _, err = writer.WriteString(headerLine) 393 | if err != nil { 394 | file.Close() 395 | return fmt.Errorf("failed to write header to %s: %w", tempFilePath, err) 396 | } 397 | 398 | // Store the locked writer instance 399 | lw := &lockedWriter{ 400 | writer: writer, 401 | file: file, 402 | filePath: tempFilePath, 403 | finalPath: filePath, 404 | } 405 | // Only set gzWriter if compression is enabled to avoid nil interface issues 406 | if dm.config.CompressOutput && gzWriter != nil { 407 | lw.gzWriter = gzWriter 408 | } 409 | dm.outputMap.Store(ctlog.URL, lw) 410 | 411 | // Submit Work Blocks in chunks for more even distribution 
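// numBlocks is a ceiling division: e.g. treeSize=1000 with blockSize=256 gives
// 4 blocks covering entries 0-255, 256-511, 512-767 and 768-999. Blocks are then
// submitted batchSize (100) at a time, with a short pause between batches, so a
// single large log does not flood the scheduler queues all at once.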
412 | numBlocks := (treeSize + blockSize - 1) / blockSize 413 | log.Printf("Log %s: TreeSize=%d, BlockSize=%d, NumBlocks=%d (Download)", 414 | ctlog.URL, treeSize, blockSize, numBlocks) 415 | 416 | // Track total entries 417 | dm.stats.TotalEntries.Add(treeSize) 418 | 419 | // Distribution strategy - submit in batches to allow better parallelism 420 | var submittedBlocks, droppedBlocks int64 421 | 422 | for i := int64(0); i < numBlocks; i += batchSize { 423 | // Check for context cancellation between batches 424 | if dm.ctx.Err() != nil { 425 | return ErrDownloadCancelled 426 | } 427 | 428 | end := i + batchSize 429 | if end > numBlocks { 430 | end = numBlocks 431 | } 432 | 433 | // Submit blocks in this batch 434 | for j := i; j < end; j++ { 435 | if dm.ctx.Err() != nil { 436 | return ErrDownloadCancelled 437 | } 438 | 439 | start := j * blockSize 440 | endEntry := start + blockSize - 1 441 | if endEntry >= treeSize { 442 | endEntry = treeSize - 1 443 | } 444 | 445 | // Use log-specific context for the work item 446 | err := dm.submitDownloadBlock(logCtx, ctlog, start, endEntry) 447 | if err != nil { 448 | if errors.Is(err, ErrQueueFull) { 449 | // Adjust total entries for dropped blocks 450 | entriesInBlock := endEntry - start + 1 451 | dm.stats.TotalEntries.Add(-entriesInBlock) 452 | droppedBlocks++ 453 | } else if errors.Is(err, ErrDownloadCancelled) { 454 | return err 455 | } else { 456 | log.Printf("Error submitting block %d-%d for %s: %v", 457 | start, endEntry, ctlog.URL, err) 458 | } 459 | } else { 460 | submittedBlocks++ 461 | } 462 | } 463 | 464 | // Small sleep between batches to avoid overwhelming scheduler 465 | if end < numBlocks { 466 | time.Sleep(250 * time.Millisecond) 467 | } 468 | } 469 | 470 | // Report submission stats 471 | if droppedBlocks > 0 { 472 | log.Printf("Log %s: Submitted %d blocks, dropped %d blocks due to backpressure", 473 | ctlog.URL, submittedBlocks, droppedBlocks) 474 | } else { 475 | log.Printf("Successfully submitted all %d download blocks for %s", 476 | submittedBlocks, ctlog.URL) 477 | } 478 | 479 | return nil 480 | } 481 | 482 | // submitDownloadBlock attempts to submit a work block with retries 483 | func (dm *DownloadManager) submitDownloadBlock(ctx context.Context, ctlog *certlib.CTLogInfo, start, end int64) error { 484 | // Determine target worker based on log URL (consistent sharding) 485 | hash := xxh3.HashString(ctlog.URL) 486 | shardIndex := int(hash % uint64(dm.scheduler.numWorkers)) 487 | targetWorker := dm.scheduler.workers[shardIndex] 488 | 489 | // Wait on rate limiter 490 | waitStart := time.Now() 491 | if err := targetWorker.limiter.Wait(ctx); err != nil { 492 | if errors.Is(err, context.Canceled) { 493 | return ErrDownloadCancelled 494 | } 495 | return fmt.Errorf("rate limiter wait failed: %w", err) 496 | } 497 | 498 | waitDuration := time.Since(waitStart) 499 | if waitDuration > 100*time.Millisecond { 500 | log.Printf("Worker %d rate limit caused %v wait for log %s (%d-%d), limit: %.2f req/s", 501 | targetWorker.id, waitDuration, ctlog.URL, start, end, 502 | float64(targetWorker.limiter.Limit())) 503 | } 504 | 505 | // Attempt submission with retry for transient full queue 506 | maxRetries := MaxSubmitRetries 507 | retryDelay := 1000 * time.Millisecond 508 | 509 | for attempt := 0; attempt < maxRetries; attempt++ { 510 | if ctx.Err() != nil { 511 | return ErrDownloadCancelled 512 | } 513 | 514 | err := dm.scheduler.SubmitWork(ctx, ctlog, start, end, dm.downloadCallback) 515 | if err == nil { 516 | return nil // Success 517 | } 
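// At this point err is non-nil; the classification below treats a full queue as
// transient (back off with jitter and retry) and anything else as permanent.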
518 | 519 | // Handle specific error types 520 | if errors.Is(err, ErrQueueFull) || strings.Contains(err.Error(), "queue full") { 521 | // Exponential backoff with jitter 522 | jitter := time.Duration(float64(retryDelay) * (0.5 + rand.Float64())) 523 | select { 524 | case <-time.After(jitter): 525 | retryDelay = retryDelay * 2 526 | if retryDelay > 500*time.Millisecond { 527 | retryDelay = 500 * time.Millisecond 528 | } 529 | continue // Retry submission 530 | case <-ctx.Done(): 531 | return ErrDownloadCancelled 532 | } 533 | } 534 | 535 | // Non-retriable error 536 | log.Printf("Permanent error submitting download work for %s (%d-%d): %v", 537 | ctlog.URL, start, end, err) 538 | return err 539 | } 540 | 541 | // All retries exhausted 542 | log.Printf("Dropped download block %s (%d-%d) after %d retries (queue full).", 543 | ctlog.URL, start, end, maxRetries) 544 | return ErrQueueFull 545 | } 546 | 547 | // downloadCallback fetches entries and writes raw data to the output file. 548 | // It's called by the worker for each block to be downloaded. 549 | func (dm *DownloadManager) downloadCallback(item *WorkItem) error { 550 | logInfo := item.LogInfo 551 | if logInfo == nil { 552 | return fmt.Errorf("internal error: WorkItem missing LogInfo (download)") 553 | } 554 | 555 | // Extract context from the work item 556 | ctx := item.Ctx 557 | if ctx == nil { 558 | ctx = context.Background() 559 | } 560 | 561 | // Track retries 562 | isRetry := item.Attempt > 0 563 | if isRetry { 564 | dm.stats.RetryCount.Add(1) 565 | } 566 | 567 | // Download entries with retry logic already in certlib.DownloadEntries 568 | downloadStart := time.Now() 569 | entriesResponse, err := certlib.DownloadEntries(ctx, logInfo, int(item.Start), int(item.End)) 570 | downloadDuration := time.Since(downloadStart) 571 | 572 | if err != nil { 573 | dm.stats.FailedEntries.Add(item.End - item.Start + 1) 574 | 575 | // Log different error levels based on context 576 | if errors.Is(err, context.Canceled) { 577 | // This is expected during shutdown, don't log as error 578 | return err 579 | } 580 | 581 | return fmt.Errorf("failed to download entries %d-%d for %s (attempt %d): %w", 582 | item.Start, item.End, item.LogURL, item.Attempt+1, err) 583 | } 584 | 585 | // Get the locked writer for this log 586 | writerUntyped, ok := dm.outputMap.Load(item.LogURL) 587 | if !ok { 588 | dm.stats.FailedEntries.Add(int64(len(entriesResponse.Entries))) 589 | return fmt.Errorf("output writer not found for log %s (download)", item.LogURL) 590 | } 591 | 592 | lw, ok := writerUntyped.(*lockedWriter) 593 | if !ok || lw == nil { 594 | return fmt.Errorf("invalid writer type found in map for log %s (download)", item.LogURL) 595 | } 596 | 597 | // Process entries in batches to minimize lock contention 598 | entriesCount := len(entriesResponse.Entries) 599 | 600 | // Get a string builder from the pool 601 | sbInterface := dm.stringPool.Get() 602 | sb := sbInterface.(*strings.Builder) 603 | sb.Reset() 604 | sb.Grow(entriesCount * 512) // Pre-allocate approximate space 605 | 606 | // Build output in memory first 607 | for i, entry := range entriesResponse.Entries { 608 | certIndex := item.Start + int64(i) 609 | fmt.Fprintf(sb, "%d,%s,%s\n", certIndex, entry.LeafInput, entry.ExtraData) 610 | } 611 | 612 | // Get the built string 613 | outputData := sb.String() 614 | 615 | // Reset and return the builder to the pool 616 | sb.Reset() 617 | dm.stringPool.Put(sb) 618 | 619 | // Lock once for the entire write 620 | lw.mu.Lock() 621 | n, err := 
lw.writer.WriteString(outputData) 622 | lw.mu.Unlock() 623 | 624 | if err != nil { 625 | dm.stats.FailedEntries.Add(int64(entriesCount)) 626 | return fmt.Errorf("error writing to output buffer for %s: %w", item.LogURL, err) 627 | } 628 | 629 | // Update stats 630 | dm.stats.ProcessedEntries.Add(int64(entriesCount)) 631 | dm.stats.OutputBytesWritten.Add(int64(n)) 632 | 633 | // Track first-attempt success 634 | if !isRetry { 635 | dm.stats.SuccessFirstTry.Add(1) 636 | } 637 | 638 | // Performance logging for slow blocks 639 | if downloadDuration > 2*time.Second { 640 | entriesPerSec := float64(entriesCount) / downloadDuration.Seconds() 641 | log.Printf("Slow download: %s (%d-%d): %.2f entries/sec, %d bytes written", 642 | item.LogURL, item.Start, item.End, entriesPerSec, n) 643 | } 644 | 645 | return nil 646 | } 647 | 648 | // Shutdown gracefully closes resources. 649 | func (dm *DownloadManager) Shutdown() { 650 | if dm.ctx.Err() != nil { 651 | // Already shut down 652 | return 653 | } 654 | 655 | log.Println("Shutting down Download Manager...") 656 | dm.cancel() // Cancel context 657 | 658 | // Shutdown scheduler (this will wait for worker queues to empty) 659 | if dm.scheduler != nil { 660 | dm.scheduler.Shutdown() 661 | } 662 | 663 | log.Println("Flushing and closing download writers...") 664 | 665 | // Close and rename all writers 666 | var successCount, errorCount int 667 | 668 | dm.outputMap.Range(func(key, value interface{}) bool { 669 | if value == nil { 670 | return true 671 | } 672 | 673 | lw, ok := value.(*lockedWriter) 674 | if !ok || lw == nil { 675 | log.Printf("Warning: Invalid writer type in map during download shutdown for key %v", key) 676 | return true 677 | } 678 | 679 | // Lock, flush, close and rename 680 | func() { 681 | lw.mu.Lock() 682 | defer lw.mu.Unlock() 683 | 684 | closeErr := false 685 | 686 | // Flush buffers 687 | if lw.writer != nil { 688 | if err := lw.writer.Flush(); err != nil { 689 | log.Printf("Error flushing download writer for %s: %v", key.(string), err) 690 | closeErr = true 691 | } 692 | } 693 | 694 | // Close gzip writer if present 695 | if lw.gzWriter != nil { 696 | if err := lw.gzWriter.Close(); err != nil { 697 | log.Printf("Error closing gzip download writer for %s: %v", key.(string), err) 698 | closeErr = true 699 | } 700 | } 701 | 702 | // Close file 703 | if lw.file != nil { 704 | if err := lw.file.Close(); err != nil { 705 | log.Printf("Error closing file for download %s: %v", key.(string), err) 706 | closeErr = true 707 | } 708 | } 709 | 710 | // Rename temp file to final name if we're fully set up 711 | if dm.setupComplete.Load() && !closeErr && lw.filePath != "" && lw.finalPath != "" { 712 | if err := os.Rename(lw.filePath, lw.finalPath); err != nil { 713 | log.Printf("Error renaming temp file %s to %s: %v", 714 | lw.filePath, lw.finalPath, err) 715 | errorCount++ 716 | } else { 717 | successCount++ 718 | } 719 | } else if closeErr { 720 | errorCount++ 721 | } 722 | }() 723 | return true 724 | }) 725 | 726 | log.Printf("Download Manager shutdown complete. Finalized %d files with %d errors.", 727 | successCount, errorCount) 728 | } 729 | 730 | // GetStats returns the current statistics. 
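// A polling sketch (the interval mirrors StatsReportInterval and is illustrative):
//
//	stats := dm.GetStats()
//	for range time.Tick(10 * time.Second) {
//		log.Printf("entries ok=%d failed=%d bytes=%d",
//			stats.GetProcessedEntries(), stats.GetFailedEntries(), stats.GetOutputBytesWritten())
//	}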
731 | func (dm *DownloadManager) GetStats() *DownloadStats { 732 | return dm.stats 733 | } 734 | -------------------------------------------------------------------------------- /internal/core/error.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | */ 5 | package core 6 | 7 | // customError is an error type that includes a retryable flag. 8 | // This allows components to determine if an operation that resulted in this error 9 | // should be retried. 10 | // It implements the standard `error` interface. 11 | type customError struct { 12 | message string // The error message. 13 | retryable bool // True if the error indicates a condition that might be resolved by retrying. 14 | } 15 | 16 | // NewError creates a new customError with the given message and retryable status. 17 | // 18 | // Parameters: 19 | // msg: The textual description of the error. 20 | // retryable: A boolean indicating if the error condition is potentially transient 21 | // and the operation could succeed on a subsequent attempt. 22 | // 23 | // Returns: 24 | // An error of type *customError. 25 | func NewError(msg string, retryable bool) error { 26 | return &customError{ 27 | message: msg, 28 | retryable: retryable, 29 | } 30 | } 31 | 32 | // Error implements the standard Go `error` interface. 33 | // It returns the textual message associated with the customError. 34 | func (e *customError) Error() string { 35 | return e.message 36 | } 37 | 38 | // IsRetryable returns true if the error is designated as retryable, false otherwise. 39 | // This method allows consuming code to check the retryable nature of the error 40 | // without needing to type-assert to the concrete `customError` type if they 41 | // are working with a standard `error` interface variable. 42 | func (e *customError) IsRetryable() bool { 43 | return e.retryable 44 | } 45 | 46 | // IsRetryable is a helper function to check if a given error is of type *customError 47 | // and if its retryable flag is set. 48 | // If the error is nil, it returns false. 49 | // If the error is not a *customError, it defaults to false (non-retryable). 50 | // 51 | // Parameters: 52 | // err: The error to check. 53 | // 54 | // Returns: 55 | // True if the error is a retryable *customError, false otherwise. 56 | func IsRetryable(err error) bool { 57 | if err == nil { 58 | return false 59 | } 60 | 61 | // Type assert to *customError to access the IsRetryable method. 62 | if e, ok := err.(*customError); ok { 63 | return e.IsRetryable() 64 | } 65 | 66 | // If not a *customError, assume not retryable by default for unknown error types. 67 | return false 68 | } 69 | 70 | // Common error constants used within the core package. 71 | // These provide standardized error values for frequent conditions like full queues 72 | // or worker shutdowns, facilitating consistent error handling and checking. 73 | var ( 74 | // ErrQueueFull indicates that a worker's queue is at capacity and cannot accept new work items. 75 | // This error is typically considered retryable, as the queue might free up later. 76 | ErrQueueFull = NewError("queue full", true) 77 | // ErrWorkerShutdown indicates that a worker or the scheduler is in the process of shutting down 78 | // and can no longer process new work items. 
This is generally not a retryable error 79 | // in the context of the current operation, as the component is terminating. 80 | ErrWorkerShutdown = NewError("worker shutdown", false) 81 | ) 82 | -------------------------------------------------------------------------------- /internal/core/list.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | */ 5 | package core 6 | 7 | /* 8 | rxtls — fast tool in Go for working with Certificate Transparency logs 9 | Copyright (C) 2025 Pepijn van der Stap 10 | 11 | This program is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU Affero General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | This program is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU Affero General Public License for more details. 20 | 21 | You should have received a copy of the GNU Affero General Public License 22 | along with this program. If not, see . 23 | */ 24 | 25 | import ( 26 | "fmt" 27 | 28 | "github.com/x-stp/rxtls/internal/certlib" 29 | ) 30 | 31 | // ListCTLogs retrieves the list of available Certificate Transparency (CT) logs. 32 | // It serves as a simple wrapper around `certlib.GetCTLogs`, which encapsulates the logic 33 | // for fetching the log list either from a remote source (e.g., Google's JSON list) or 34 | // from a local file cache, depending on the `certlib.UseLocalLogs` global setting. 35 | // 36 | // This function is primarily used by command-line interface (CLI) commands that need to 37 | // display available logs or allow users to select logs for processing. 38 | // 39 | // Performance Note: This function itself does not perform detailed STH (Signed Tree Head) 40 | // fetching for each log to determine its size or state, as that would be too slow for 41 | // a simple listing operation. The `certlib.GetCTLogs` function focuses on retrieving the 42 | // basic log metadata (URL, description, operator). 43 | // 44 | // Returns: 45 | // - A slice of `certlib.CTLogInfo` structs, each representing a known CT log. 46 | // - An error if retrieving or parsing the log list fails. 47 | func ListCTLogs() ([]certlib.CTLogInfo, error) { 48 | ctlogs, err := certlib.GetCTLogs() 49 | if err != nil { 50 | // Wrap the error from certlib to provide more context. 51 | return nil, fmt.Errorf("error retrieving CT logs list: %w", err) 52 | } 53 | return ctlogs, nil 54 | } 55 | -------------------------------------------------------------------------------- /internal/core/ratelimiter.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package core provides the central logic for rxtls, including the scheduler, download manager, 3 | and domain extractor. It defines common data structures and constants used across these components. 4 | */ 5 | package core 6 | 7 | import ( 8 | "math" 9 | "sync/atomic" 10 | "time" 11 | ) 12 | 13 | // Rate limiting constants defining the behavior of the adaptive rate limiter. 14 | const ( 15 | // MinRate is the minimum allowed rate in requests per second (RPS). 
16 | // The rate limiter will not decrease the rate below this value. 17 | MinRate = 2.0 18 | // MaxRate is the maximum allowed rate in requests per second (RPS). 19 | // The rate limiter will not increase the rate above this value. 20 | MaxRate = 1000.0 21 | // RateIncreaseStep is the additive amount by which the rate is increased upon a successful operation. 22 | RateIncreaseStep = 20.0 23 | // RateDecreaseStep is the subtractive amount by which the rate is decreased upon a failed operation 24 | // or when backpressure is detected. 25 | RateDecreaseStep = 50.0 26 | ) 27 | 28 | // RateLimiter implements a simple adaptive rate limiting mechanism. 29 | // It adjusts the rate based on success/failure of operations and can respond to backpressure signals. 30 | // The current rate is stored as a float64 manipulated via atomic operations on its uint64 bit representation 31 | // to ensure thread-safe updates without locks for `getRate` and `setRate` hot paths. 32 | // 33 | // This rate limiter is a basic token bucket variant where tokens are implicitly refilled based on elapsed time 34 | // and the current rate. 35 | // 36 | // Concurrency: The `currentRate` is accessed atomically. `successCount`, `failureCount`, 37 | // and `backpressure` are also atomic, making most operations non-blocking. 38 | // `lastAdjustment` is not atomic but primarily used for calculating elapsed time in `Allow`, 39 | // where its exact precision is less critical than overall rate control. 40 | type RateLimiter struct { 41 | // currentRate stores the bit representation of the current float64 rate limit. 42 | // This allows for atomic load/store of the rate. 43 | currentRate uint64 44 | // successCount tracks the number of successful operations recorded. 45 | successCount atomic.Uint64 46 | // failureCount tracks the number of failed operations recorded. 47 | failureCount atomic.Uint64 48 | // lastAdjustment records the time of the last `Allow` call that consumed a token. 49 | // It's used to calculate token replenishment. 50 | lastAdjustment time.Time 51 | // backpressure, if true, forces the Allow method to return false, effectively halting 52 | // operations. This can be triggered externally (e.g., by a full queue). 53 | backpressure atomic.Bool 54 | } 55 | 56 | // NewRateLimiter creates a new RateLimiter instance with the specified initial rate. 57 | // 58 | // Parameters: 59 | // 60 | // initialRate: The starting rate limit in requests per second (RPS). 61 | // 62 | // Returns: 63 | // 64 | // A pointer to the newly created RateLimiter. 65 | func NewRateLimiter(initialRate float64) *RateLimiter { 66 | rl := &RateLimiter{ 67 | lastAdjustment: time.Now(), // Initialize lastAdjustment to current time. 68 | } 69 | rl.setRate(initialRate) // Set the initial rate atomically. 70 | return rl 71 | } 72 | 73 | // Allow determines if an operation should be permitted based on the current rate limit. 74 | // It implements a simple token bucket logic: tokens are replenished over time based on `currentRate`. 75 | // If backpressure is active, Allow will always return false. 76 | // If the rate is zero or negative, Allow will also return false. 77 | // 78 | // Returns: 79 | // 80 | // True if the operation is allowed, false otherwise. 81 | // 82 | // Hot Path: This method is expected to be called frequently and should be highly performant. 83 | // It primarily involves atomic reads and time calculations. 
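// Worked example (illustrative, following directly from the token arithmetic below):
// with a current rate of 2 RPS, tokens accrue at 2 per second, so a call made
// 400ms after the last allowed call sees 0.8 tokens and is rejected, while a call
// made 600ms after it sees 1.2 tokens and is allowed.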
84 | func (rl *RateLimiter) Allow() bool { 85 | if rl.backpressure.Load() { 86 | return false // Backpressure is active, disallow operation. 87 | } 88 | 89 | rate := rl.getRate() 90 | if rate <= 0 { 91 | return false // Rate is zero or negative, no operations allowed. 92 | } 93 | 94 | // Simple token bucket: Calculate tokens accrued since last allowed operation. 95 | now := time.Now() 96 | elapsed := now.Sub(rl.lastAdjustment).Seconds() // Time since last token consumption. 97 | tokens := elapsed * rate // Tokens generated during elapsed time. 98 | 99 | if tokens >= 1.0 { 100 | rl.lastAdjustment = now // Consume one token by updating lastAdjustment. 101 | return true // Enough tokens, allow operation. 102 | } 103 | 104 | return false // Not enough tokens. 105 | } 106 | 107 | // RecordSuccess is called to indicate that an operation controlled by this rate limiter was successful. 108 | // It increments the success counter and may trigger an increase in the rate limit. 109 | func (rl *RateLimiter) RecordSuccess() { 110 | rl.successCount.Add(1) 111 | rl.adjustRate(true) // Attempt to increase rate. 112 | } 113 | 114 | // RecordFailure is called to indicate that an operation controlled by this rate limiter failed. 115 | // It increments the failure counter and may trigger a decrease in the rate limit. 116 | func (rl *RateLimiter) RecordFailure() { 117 | rl.failureCount.Add(1) 118 | rl.adjustRate(false) // Attempt to decrease rate. 119 | } 120 | 121 | // UpdateBackpressure sets the backpressure state of the rate limiter. 122 | // If `hasBackpressure` is true, the `Allow` method will subsequently return false until 123 | // backpressure is cleared by calling UpdateBackpressure(false). 124 | // This provides a mechanism for external components (e.g., a queue monitor) to signal the 125 | // rate limiter to pause operations. 126 | func (rl *RateLimiter) UpdateBackpressure(hasBackpressure bool) { 127 | rl.backpressure.Store(hasBackpressure) 128 | } 129 | 130 | // GetCurrentRate returns the current effective rate limit in requests per second. 131 | func (rl *RateLimiter) GetCurrentRate() float64 { 132 | return rl.getRate() 133 | } 134 | 135 | // adjustRate dynamically modifies the rate limit based on the success or failure of an operation. 136 | // If `success` is true, it attempts to increase the rate by `RateIncreaseStep`, 137 | // capped at `MaxRate`. 138 | // If `success` is false, it attempts to decrease the rate by `RateDecreaseStep`, 139 | // floored at `MinRate`. 140 | // 141 | // This method is called internally by RecordSuccess and RecordFailure. 142 | func (rl *RateLimiter) adjustRate(success bool) { 143 | current := rl.getRate() 144 | var newRate float64 145 | 146 | if success { 147 | newRate = current + RateIncreaseStep 148 | if newRate > MaxRate { 149 | newRate = MaxRate // Cap at maximum allowed rate. 150 | } 151 | } else { 152 | newRate = current - RateDecreaseStep 153 | if newRate < MinRate { 154 | newRate = MinRate // Floor at minimum allowed rate. 155 | } 156 | } 157 | 158 | rl.setRate(newRate) // Atomically update the rate. 159 | } 160 | 161 | // GetStats returns a map containing current statistics of the rate limiter. 162 | // This is useful for monitoring and debugging the rate limiter's behavior. 163 | // The returned map includes the current rate, total success/failure counts, 164 | // backpressure state, and the timestamp of the last rate adjustment (token consumption). 
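// Note: lastAdjustment is read here without synchronization (it is not atomic),
// so the reported timestamp is a best-effort snapshot when Allow is being called
// concurrently from other goroutines.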
165 | func (rl *RateLimiter) GetStats() map[string]interface{} { 166 | return map[string]interface{}{ 167 | "current_rate": rl.getRate(), 168 | "success_count": rl.successCount.Load(), 169 | "failure_count": rl.failureCount.Load(), 170 | "backpressure": rl.backpressure.Load(), 171 | "last_adjustment": rl.lastAdjustment, 172 | } 173 | } 174 | 175 | // Reset reinitializes the rate limiter to a given initial rate and clears its statistics. 176 | // Success/failure counts are reset, backpressure is turned off, and lastAdjustment is set to now. 177 | // 178 | // Parameters: 179 | // 180 | // initialRate: The new initial rate limit in requests per second (RPS). 181 | func (rl *RateLimiter) Reset(initialRate float64) { 182 | rl.setRate(initialRate) 183 | rl.successCount.Store(0) 184 | rl.failureCount.Store(0) 185 | rl.backpressure.Store(false) 186 | rl.lastAdjustment = time.Now() 187 | } 188 | 189 | // getRate atomically retrieves the current rate limit as a float64. 190 | // It reads the uint64 bits and converts them to a float64. 191 | func (rl *RateLimiter) getRate() float64 { 192 | bits := atomic.LoadUint64(&rl.currentRate) 193 | return math.Float64frombits(bits) 194 | } 195 | 196 | // setRate atomically sets the current rate limit. 197 | // It converts the float64 rate to its uint64 bit representation for atomic storage. 198 | func (rl *RateLimiter) setRate(rate float64) { 199 | bits := math.Float64bits(rate) 200 | atomic.StoreUint64(&rl.currentRate, bits) 201 | } 202 | -------------------------------------------------------------------------------- /internal/core/scheduler_stub.go: -------------------------------------------------------------------------------- 1 | //go:build !linux 2 | // +build !linux 3 | 4 | /* 5 | rxtls — fast tool in Go for working with Certificate Transparency logs 6 | Copyright (C) 2025 Pepijn van der Stap 7 | 8 | This program is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU Affero General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | This program is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU Affero General Public License for more details. 17 | 18 | You should have received a copy of the GNU Affero General Public License 19 | along with this program. If not, see . 20 | */ 21 | 22 | // This file provides a stub implementation of the scheduler for non-Linux platforms 23 | // where CPU affinity setting is not available or not implemented via x/sys/unix. 24 | 25 | package core 26 | 27 | import ( 28 | "context" 29 | "fmt" 30 | "log" 31 | "runtime" 32 | "sync" 33 | "sync/atomic" 34 | "time" 35 | 36 | "github.com/x-stp/rxtls/internal/certlib" 37 | 38 | "github.com/zeebo/xxh3" // Consistent hashing 39 | "golang.org/x/time/rate" 40 | ) 41 | 42 | // Scheduler definition MUST be identical across builds. 43 | // Manages workers and dispatch, but without affinity. 44 | type Scheduler struct { 45 | numWorkers int 46 | workers []*Worker 47 | ctx context.Context 48 | cancel context.CancelFunc 49 | shutdown atomic.Bool 50 | workItemPool sync.Pool 51 | activeWork sync.WaitGroup // Tracks active work 52 | } 53 | 54 | // Worker definition MUST be identical, cpuAffinity field is present but unused. 
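// Keeping the field set and layout identical across build tags means code shared
// between the Linux and non-Linux builds compiles against the same type.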
55 | type Worker struct { 56 | // Immutable fields 57 | id int 58 | ctx context.Context 59 | cancel context.CancelFunc 60 | scheduler *Scheduler 61 | queue chan *WorkItem 62 | limiter *rate.Limiter 63 | cpuAffinity int 64 | 65 | // Metrics 66 | processed atomic.Int64 67 | errors atomic.Int64 68 | panics atomic.Int64 69 | busy atomic.Bool 70 | lastActive atomic.Int64 71 | } 72 | 73 | // NewScheduler creates and starts the scheduler (stub version without affinity). 74 | // Operation: Blocking (at startup), allocates worker/channel resources. 75 | func NewScheduler(parentCtx context.Context) (*Scheduler, error) { 76 | numWorkers := runtime.NumCPU() * WorkerMultiplier 77 | if numWorkers <= 0 { 78 | numWorkers = 1 79 | } 80 | 81 | sctx, cancel := context.WithCancel(parentCtx) 82 | 83 | s := &Scheduler{ 84 | numWorkers: numWorkers, 85 | workers: make([]*Worker, numWorkers), 86 | ctx: sctx, 87 | cancel: cancel, 88 | workItemPool: sync.Pool{ 89 | New: func() interface{} { 90 | return &WorkItem{ 91 | CreatedAt: time.Now(), 92 | } 93 | }, 94 | }, 95 | } 96 | 97 | initialRate := rate.Limit(1000) 98 | burstSize := MaxShardQueueSize 99 | 100 | for i := 0; i < numWorkers; i++ { 101 | w := &Worker{ 102 | id: i, 103 | cpuAffinity: -1, // Mark as unused 104 | queue: make(chan *WorkItem, MaxShardQueueSize), 105 | scheduler: s, 106 | ctx: sctx, 107 | limiter: rate.NewLimiter(initialRate, burstSize), // Init limiter 108 | } 109 | s.workers[i] = w 110 | go w.run() // Start the worker goroutine 111 | } 112 | 113 | fmt.Printf("Scheduler initialized with %d workers (CPU affinity disabled).\n", numWorkers) 114 | return s, nil 115 | } 116 | 117 | // run is the main loop for a worker goroutine (stub version without affinity setup). 118 | // Hot Path: Yes. Must be zero-GC, non-blocking (except on queue read). 119 | func (w *Worker) run() { 120 | // No LockOSThread or affinity setting needed/possible on non-Linux. 121 | for { 122 | select { 123 | case <-w.ctx.Done(): 124 | return 125 | case item := <-w.queue: 126 | if item == nil { 127 | continue 128 | } 129 | 130 | // Mark work as done when the callback finishes or panics 131 | func() { 132 | defer w.scheduler.activeWork.Done() // Signal completion via WaitGroup 133 | defer func() { 134 | if r := recover(); r != nil { 135 | log.Printf("Panic recovered in worker %d processing item for %s (%d-%d): %v", w.id, item.LogURL, item.Start, item.End, r) 136 | // TODO: Increment failure counter 137 | } 138 | }() 139 | 140 | err := item.Callback(item) 141 | if err != nil { 142 | // Basic error logging. 143 | // TODO: Implement retry mechanism using item.Attempt. 144 | fmt.Printf("Error processing item for %s (%d-%d): %v\n", item.LogURL, item.Start, item.End, err) 145 | } 146 | }() 147 | 148 | // Return item to pool, resetting fields. 149 | item.Callback = nil 150 | item.LogURL = "" 151 | item.LogInfo = nil 152 | item.Ctx = nil 153 | item.Error = nil 154 | w.scheduler.workItemPool.Put(item) 155 | } 156 | } 157 | } 158 | 159 | // setAffinity is a no-op stub on non-Linux platforms. 160 | func setAffinity(workerID, cpuID int) { 161 | // Affinity not supported/implemented on this OS. 162 | } 163 | 164 | // SubmitWork definition MUST be identical across builds. 165 | // Hot Path: Yes. Non-blocking, low allocation. 
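// Routing note: work for a given log URL is always hashed (xxh3) to the same
// worker index (hash % numWorkers), so entries for one log are handled in
// submission order by a single worker's queue.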
166 | func (s *Scheduler) SubmitWork(ctx context.Context, logInfo *certlib.CTLogInfo, start, end int64, callback WorkCallback) error { 167 | if s.shutdown.Load() { 168 | return fmt.Errorf("scheduler is shutting down") 169 | } 170 | 171 | logURL := logInfo.URL 172 | hash := xxh3.HashString(logURL) 173 | shardIndex := int(hash % uint64(s.numWorkers)) 174 | targetWorker := s.workers[shardIndex] 175 | 176 | // NOTE: Rate limiting handled by caller 177 | 178 | item := s.workItemPool.Get().(*WorkItem) 179 | item.LogURL = logURL 180 | item.LogInfo = logInfo 181 | item.Start = start 182 | item.End = end 183 | item.Attempt = 0 184 | item.Callback = callback 185 | item.Ctx = ctx 186 | item.CreatedAt = time.Now() 187 | s.activeWork.Add(1) 188 | 189 | select { 190 | case targetWorker.queue <- item: 191 | // Optional: Increase rate limit on success 192 | return nil 193 | default: 194 | // Backpressure: Queue full. 195 | s.activeWork.Done() 196 | s.workItemPool.Put(item) 197 | // Optional: Decrease rate limit 198 | return fmt.Errorf("worker %d for log %s: %w", targetWorker.id, logURL, ErrQueueFull) 199 | } 200 | } 201 | 202 | // Wait definition MUST be identical across builds. 203 | func (s *Scheduler) Wait() { 204 | s.activeWork.Wait() 205 | } 206 | 207 | // Shutdown definition MUST be identical across builds. 208 | func (s *Scheduler) Shutdown() { 209 | s.shutdown.Store(true) 210 | s.cancel() 211 | s.Wait() 212 | } 213 | -------------------------------------------------------------------------------- /internal/core/work.go: -------------------------------------------------------------------------------- 1 | // Package core provides the central logic for rxtls, including the scheduler, download manager, 2 | // and domain extractor. It defines common data structures and constants used across these components. 3 | package core 4 | 5 | import ( 6 | "context" 7 | "time" 8 | ) 9 | 10 | // Work defines an interface for a unit of work that can be processed. 11 | // This interface allows for different types of tasks to be handled by a generic 12 | // processing system (like a scheduler or worker pool) as long as they conform to this contract. 13 | // 14 | // Implementations of Work should encapsulate all necessary data and logic for their execution. 15 | type Work interface { 16 | // Process executes the primary logic of the work unit. 17 | // It takes a context that can be used for cancellation or deadlines. 18 | // An error is returned if processing fails. 19 | Process(ctx context.Context) error 20 | // GetID returns a unique identifier for this work unit. 21 | // This ID can be used for logging, tracking, or sharding purposes. 22 | GetID() string 23 | // GetCreatedAt returns the timestamp when this work unit was created. 24 | // This can be useful for metrics, priority queuing, or staleness checks. 25 | GetCreatedAt() time.Time 26 | } 27 | 28 | // Task is a concrete implementation of the Work interface. 29 | // It provides a flexible way to define a work unit by associating arbitrary data 30 | // with a specific processing function (ProcessFn). 31 | // 32 | // Fields: 33 | // 34 | // ID: A string identifier for the task. 35 | // CreatedAt: The time the task was created. 36 | // Data: An interface{} to hold any data required by the ProcessFn. 37 | // ProcessFn: The function that encapsulates the actual processing logic for this task. 
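// Example (illustrative; payload and ctx are placeholders):
//
//	t := NewTask("example-id", payload, func(ctx context.Context, data interface{}) error {
//		// handle data here
//		return nil
//	})
//	err := t.Process(ctx)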
38 | type Task struct { 39 | ID string 40 | CreatedAt time.Time 41 | Data interface{} 42 | ProcessFn func(ctx context.Context, data interface{}) error 43 | } 44 | 45 | // Process executes the task by calling its ProcessFn with the associated context and data. 46 | // It conforms to the Work interface. 47 | func (t *Task) Process(ctx context.Context) error { 48 | return t.ProcessFn(ctx, t.Data) 49 | } 50 | 51 | // GetID returns the unique identifier of the task. 52 | // It conforms to the Work interface. 53 | func (t *Task) GetID() string { 54 | return t.ID 55 | } 56 | 57 | // GetCreatedAt returns the creation timestamp of the task. 58 | // It conforms to the Work interface. 59 | func (t *Task) GetCreatedAt() time.Time { 60 | return t.CreatedAt 61 | } 62 | 63 | // NewTask creates and returns a new Task instance. 64 | // 65 | // Parameters: 66 | // 67 | // id: The unique string identifier for the new task. 68 | // data: The data payload to be associated with the task. 69 | // processFn: The function that will be called to process this task's data. 70 | // This function must match the signature `func(ctx context.Context, data interface{}) error`. 71 | // 72 | // Returns: 73 | // 74 | // A pointer to the newly created Task. 75 | func NewTask(id string, data interface{}, processFn func(ctx context.Context, data interface{}) error) *Task { 76 | return &Task{ 77 | ID: id, 78 | CreatedAt: time.Now(), // Set creation time to now. 79 | Data: data, 80 | ProcessFn: processFn, 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /internal/io/buffer.go: -------------------------------------------------------------------------------- 1 | package io 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 
19 | */ 20 | 21 | import ( 22 | "bufio" 23 | "compress/gzip" 24 | "context" 25 | "errors" 26 | "fmt" 27 | "os" 28 | "path/filepath" 29 | "runtime" 30 | "sync" 31 | "sync/atomic" 32 | "time" 33 | ) 34 | 35 | const ( 36 | // DefaultBufferSize is the default buffer size for disk I/O 37 | DefaultBufferSize = 256 * 1024 // 256KB 38 | 39 | // PageSize is the OS page size for aligned writes 40 | PageSize = 4096 // 4KB, typical OS page size 41 | 42 | // FlushInterval is how often to flush buffers automatically 43 | FlushInterval = 2 * time.Second 44 | 45 | // BackpressureThreshold is the percentage of buffer capacity that triggers backpressure 46 | BackpressureThreshold = 0.8 // 80% 47 | ) 48 | 49 | var ( 50 | // ErrBufferFull is returned when the buffer is full and backpressure is applied 51 | ErrBufferFull = errors.New("write buffer full, applying backpressure") 52 | 53 | // ErrBufferClosed is returned when attempting to write to a closed buffer 54 | ErrBufferClosed = errors.New("write buffer closed") 55 | 56 | // ErrFlushTimeout is returned when a flush operation times out 57 | ErrFlushTimeout = errors.New("flush operation timed out") 58 | ) 59 | 60 | // BufferMetrics holds metrics for a buffer 61 | type BufferMetrics struct { 62 | BytesWritten atomic.Int64 63 | BytesFlushed atomic.Int64 64 | FlushCount atomic.Int64 65 | WriteCount atomic.Int64 66 | BackpressureHits atomic.Int64 67 | ErrorCount atomic.Int64 68 | LastFlushTime atomic.Int64 // Unix timestamp in nanoseconds 69 | LastWriteTime atomic.Int64 // Unix timestamp in nanoseconds 70 | LastErrorTime atomic.Int64 // Unix timestamp in nanoseconds 71 | } 72 | 73 | // AsyncBuffer is a high-performance buffer for disk I/O with async flushing 74 | type AsyncBuffer struct { 75 | // Immutable after creation 76 | file *os.File 77 | gzWriter *gzip.Writer 78 | bufWriter *bufio.Writer 79 | flushInterval time.Duration 80 | bufferSize int 81 | alignWrites bool 82 | compressed bool 83 | flushThreshold float64 84 | fileDescriptor int 85 | identifier string // For logging/metrics 86 | 87 | // Mutable state protected by mutex 88 | mu sync.Mutex 89 | closed bool 90 | lastFlushTime time.Time 91 | flushInProgress bool 92 | writeQueue [][]byte // Pending writes that couldn't fit in buffer 93 | 94 | // Context for cancellation 95 | ctx context.Context 96 | cancel context.CancelFunc 97 | 98 | // Wait group for flush operations 99 | flushWg sync.WaitGroup 100 | 101 | // Metrics (atomic) 102 | metrics BufferMetrics 103 | 104 | // Signaling channels 105 | flushComplete chan struct{} // Signals when a flush is complete 106 | backpressure chan struct{} // Signals when backpressure is applied/released 107 | } 108 | 109 | // AsyncBufferOptions configures an AsyncBuffer 110 | type AsyncBufferOptions struct { 111 | BufferSize int 112 | FlushInterval time.Duration 113 | AlignWrites bool 114 | Compressed bool 115 | FlushThreshold float64 116 | Identifier string 117 | } 118 | 119 | // DefaultAsyncBufferOptions returns the default options for AsyncBuffer 120 | func DefaultAsyncBufferOptions() *AsyncBufferOptions { 121 | return &AsyncBufferOptions{ 122 | BufferSize: DefaultBufferSize, 123 | FlushInterval: FlushInterval, 124 | AlignWrites: true, 125 | Compressed: false, 126 | FlushThreshold: BackpressureThreshold, 127 | Identifier: "", 128 | } 129 | } 130 | 131 | // NewAsyncBuffer creates a new AsyncBuffer 132 | func NewAsyncBuffer(ctx context.Context, path string, options *AsyncBufferOptions) (*AsyncBuffer, error) { 133 | if options == nil { 134 | options = 
DefaultAsyncBufferOptions() 135 | } 136 | 137 | // Ensure directory exists 138 | dir := filepath.Dir(path) 139 | if err := os.MkdirAll(dir, 0755); err != nil { 140 | return nil, fmt.Errorf("failed to create directory %s: %w", dir, err) 141 | } 142 | 143 | // Open file with direct I/O if supported and requested 144 | flag := os.O_CREATE | os.O_WRONLY | os.O_TRUNC 145 | if options.AlignWrites && runtime.GOOS == "linux" { 146 | // oDirect is only available on Linux 147 | // Use a constant value instead of syscall.O_DIRECT to avoid build errors on other platforms 148 | const oDirect = 0x4000 // Linux specific 149 | flag |= oDirect 150 | } 151 | 152 | file, err := os.OpenFile(path, flag, 0644) 153 | if err != nil { 154 | return nil, fmt.Errorf("failed to open file %s: %w", path, err) 155 | } 156 | 157 | // Get file descriptor for direct operations 158 | fd := int(file.Fd()) 159 | 160 | // Create buffer context 161 | bufCtx, bufCancel := context.WithCancel(ctx) 162 | 163 | // Create the buffer 164 | ab := &AsyncBuffer{ 165 | file: file, 166 | bufferSize: options.BufferSize, 167 | alignWrites: options.AlignWrites, 168 | compressed: options.Compressed, 169 | flushInterval: options.FlushInterval, 170 | flushThreshold: options.FlushThreshold, 171 | fileDescriptor: fd, 172 | identifier: options.Identifier, 173 | lastFlushTime: time.Now(), 174 | ctx: bufCtx, 175 | cancel: bufCancel, 176 | flushComplete: make(chan struct{}, 1), 177 | backpressure: make(chan struct{}, 1), 178 | } 179 | 180 | // Set up the writer chain 181 | if options.Compressed { 182 | gzw, err := gzip.NewWriterLevel(file, gzip.BestSpeed) 183 | if err != nil { 184 | file.Close() 185 | bufCancel() 186 | return nil, fmt.Errorf("failed to create gzip writer: %w", err) 187 | } 188 | ab.gzWriter = gzw 189 | ab.bufWriter = bufio.NewWriterSize(gzw, options.BufferSize) 190 | } else { 191 | ab.bufWriter = bufio.NewWriterSize(file, options.BufferSize) 192 | } 193 | 194 | // Start background flusher 195 | ab.startBackgroundFlusher() 196 | 197 | return ab, nil 198 | } 199 | 200 | // startBackgroundFlusher starts a goroutine that periodically flushes the buffer 201 | func (ab *AsyncBuffer) startBackgroundFlusher() { 202 | ticker := time.NewTicker(ab.flushInterval) 203 | 204 | go func() { 205 | defer ticker.Stop() 206 | for { 207 | select { 208 | case <-ticker.C: 209 | if err := ab.Flush(); err != nil && !errors.Is(err, ErrFlushTimeout) { 210 | ab.metrics.ErrorCount.Add(1) 211 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 212 | // TODO: Log error 213 | } 214 | case <-ab.ctx.Done(): 215 | return 216 | } 217 | } 218 | }() 219 | } 220 | 221 | // Write writes data to the buffer 222 | func (ab *AsyncBuffer) Write(data []byte) (int, error) { 223 | ab.mu.Lock() 224 | defer ab.mu.Unlock() 225 | 226 | if ab.closed { 227 | return 0, ErrBufferClosed 228 | } 229 | 230 | // Check if we need to apply backpressure 231 | if float64(ab.bufWriter.Buffered())/float64(ab.bufferSize) >= ab.flushThreshold { 232 | // Signal backpressure 233 | select { 234 | case ab.backpressure <- struct{}{}: 235 | default: 236 | // Channel already has a value 237 | } 238 | 239 | ab.metrics.BackpressureHits.Add(1) 240 | 241 | // If we have too many pending writes, return error 242 | if len(ab.writeQueue) > 100 { 243 | return 0, ErrBufferFull 244 | } 245 | 246 | // Queue the write for later 247 | dataCopy := make([]byte, len(data)) 248 | copy(dataCopy, data) 249 | ab.writeQueue = append(ab.writeQueue, dataCopy) 250 | 251 | // Trigger a flush 252 | go ab.Flush() 253 | 254 | return 
len(data), nil 255 | } 256 | 257 | // Write to buffer 258 | n, err := ab.bufWriter.Write(data) 259 | if err != nil { 260 | ab.metrics.ErrorCount.Add(1) 261 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 262 | return n, fmt.Errorf("failed to write to buffer: %w", err) 263 | } 264 | 265 | ab.metrics.BytesWritten.Add(int64(n)) 266 | ab.metrics.WriteCount.Add(1) 267 | ab.metrics.LastWriteTime.Store(time.Now().UnixNano()) 268 | 269 | // Process queued writes if buffer has space 270 | if len(ab.writeQueue) > 0 && float64(ab.bufWriter.Buffered())/float64(ab.bufferSize) < ab.flushThreshold { 271 | // Process some queued writes 272 | processed := 0 273 | for i, queuedData := range ab.writeQueue { 274 | if float64(ab.bufWriter.Buffered()+len(queuedData))/float64(ab.bufferSize) >= ab.flushThreshold { 275 | break 276 | } 277 | 278 | n, err := ab.bufWriter.Write(queuedData) 279 | if err != nil { 280 | ab.metrics.ErrorCount.Add(1) 281 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 282 | break 283 | } 284 | 285 | ab.metrics.BytesWritten.Add(int64(n)) 286 | ab.metrics.WriteCount.Add(1) 287 | processed = i + 1 288 | } 289 | 290 | // Remove processed items from queue 291 | if processed > 0 { 292 | ab.writeQueue = ab.writeQueue[processed:] 293 | } 294 | 295 | // If queue is empty, release backpressure 296 | if len(ab.writeQueue) == 0 { 297 | // Clear backpressure signal 298 | select { 299 | case <-ab.backpressure: 300 | default: 301 | } 302 | } 303 | } 304 | 305 | return n, nil 306 | } 307 | 308 | // Flush flushes the buffer to disk 309 | func (ab *AsyncBuffer) Flush() error { 310 | ab.mu.Lock() 311 | 312 | if ab.closed { 313 | ab.mu.Unlock() 314 | return ErrBufferClosed 315 | } 316 | 317 | if ab.flushInProgress { 318 | // Another flush is already in progress 319 | ab.mu.Unlock() 320 | 321 | // Wait for it to complete with timeout 322 | select { 323 | case <-ab.flushComplete: 324 | return nil 325 | case <-time.After(5 * time.Second): 326 | return ErrFlushTimeout 327 | case <-ab.ctx.Done(): 328 | return ab.ctx.Err() 329 | } 330 | } 331 | 332 | // Nothing to flush 333 | if ab.bufWriter.Buffered() == 0 { 334 | ab.mu.Unlock() 335 | return nil 336 | } 337 | 338 | // Mark flush in progress 339 | ab.flushInProgress = true 340 | ab.flushWg.Add(1) 341 | ab.mu.Unlock() 342 | 343 | // Perform the flush in a separate goroutine to avoid blocking 344 | go func() { 345 | defer ab.flushWg.Done() 346 | defer func() { 347 | ab.mu.Lock() 348 | ab.flushInProgress = false 349 | ab.lastFlushTime = time.Now() 350 | ab.mu.Unlock() 351 | 352 | // Signal flush complete 353 | select { 354 | case ab.flushComplete <- struct{}{}: 355 | default: 356 | } 357 | }() 358 | 359 | // Flush the buffer 360 | if err := ab.bufWriter.Flush(); err != nil { 361 | ab.metrics.ErrorCount.Add(1) 362 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 363 | return 364 | } 365 | 366 | // If compressed, flush the gzip writer 367 | if ab.compressed && ab.gzWriter != nil { 368 | if err := ab.gzWriter.Flush(); err != nil { 369 | ab.metrics.ErrorCount.Add(1) 370 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 371 | return 372 | } 373 | } 374 | 375 | // Sync to disk 376 | if err := ab.file.Sync(); err != nil { 377 | ab.metrics.ErrorCount.Add(1) 378 | ab.metrics.LastErrorTime.Store(time.Now().UnixNano()) 379 | return 380 | } 381 | 382 | // Update metrics 383 | ab.metrics.FlushCount.Add(1) 384 | ab.metrics.BytesFlushed.Add(int64(ab.bufWriter.Buffered())) 385 | ab.metrics.LastFlushTime.Store(time.Now().UnixNano()) 386 | }() 387 | 388 | 
return nil 389 | } 390 | 391 | // Close flushes and closes the buffer 392 | func (ab *AsyncBuffer) Close() error { 393 | ab.mu.Lock() 394 | 395 | if ab.closed { 396 | ab.mu.Unlock() 397 | return nil 398 | } 399 | 400 | ab.closed = true 401 | ab.mu.Unlock() 402 | 403 | // Cancel context to stop background flusher 404 | ab.cancel() 405 | 406 | // Wait for any in-progress flushes to complete 407 | ab.flushWg.Wait() 408 | 409 | // Final flush 410 | if err := ab.bufWriter.Flush(); err != nil { 411 | return fmt.Errorf("failed to flush buffer on close: %w", err) 412 | } 413 | 414 | // Close gzip writer if used 415 | if ab.compressed && ab.gzWriter != nil { 416 | if err := ab.gzWriter.Close(); err != nil { 417 | return fmt.Errorf("failed to close gzip writer: %w", err) 418 | } 419 | } 420 | 421 | // Close file 422 | if err := ab.file.Close(); err != nil { 423 | return fmt.Errorf("failed to close file: %w", err) 424 | } 425 | 426 | return nil 427 | } 428 | 429 | // WaitForBackpressure waits for backpressure to be applied 430 | func (ab *AsyncBuffer) WaitForBackpressure(ctx context.Context) error { 431 | select { 432 | case <-ab.backpressure: 433 | return nil 434 | case <-ctx.Done(): 435 | return ctx.Err() 436 | } 437 | } 438 | 439 | // GetMetrics returns the current metrics for the buffer 440 | func (ab *AsyncBuffer) GetMetrics() *BufferMetrics { 441 | return &ab.metrics 442 | } 443 | 444 | // BufferPool manages a pool of AsyncBuffers 445 | type BufferPool struct { 446 | mu sync.RWMutex 447 | buffers map[string]*AsyncBuffer 448 | ctx context.Context 449 | cancel context.CancelFunc 450 | options *AsyncBufferOptions 451 | } 452 | 453 | // NewBufferPool creates a new BufferPool 454 | func NewBufferPool(ctx context.Context, options *AsyncBufferOptions) *BufferPool { 455 | poolCtx, poolCancel := context.WithCancel(ctx) 456 | 457 | return &BufferPool{ 458 | buffers: make(map[string]*AsyncBuffer), 459 | ctx: poolCtx, 460 | cancel: poolCancel, 461 | options: options, 462 | } 463 | } 464 | 465 | // GetBuffer returns a buffer for the given path, creating it if necessary 466 | func (bp *BufferPool) GetBuffer(path string) (*AsyncBuffer, error) { 467 | // First check if buffer exists with read lock 468 | bp.mu.RLock() 469 | buffer, exists := bp.buffers[path] 470 | bp.mu.RUnlock() 471 | 472 | if exists { 473 | return buffer, nil 474 | } 475 | 476 | // Create new buffer with write lock 477 | bp.mu.Lock() 478 | defer bp.mu.Unlock() 479 | 480 | // Check again in case another goroutine created it 481 | buffer, exists = bp.buffers[path] 482 | if exists { 483 | return buffer, nil 484 | } 485 | 486 | // Create new buffer 487 | options := *bp.options // Copy options 488 | options.Identifier = path 489 | 490 | buffer, err := NewAsyncBuffer(bp.ctx, path, &options) 491 | if err != nil { 492 | return nil, err 493 | } 494 | 495 | bp.buffers[path] = buffer 496 | return buffer, nil 497 | } 498 | 499 | // Close closes all buffers in the pool 500 | func (bp *BufferPool) Close() error { 501 | bp.cancel() // Cancel context to stop all background operations 502 | 503 | bp.mu.Lock() 504 | defer bp.mu.Unlock() 505 | 506 | var lastErr error 507 | for path, buffer := range bp.buffers { 508 | if err := buffer.Close(); err != nil { 509 | lastErr = fmt.Errorf("failed to close buffer %s: %w", path, err) 510 | } 511 | } 512 | 513 | return lastErr 514 | } 515 | 516 | // Flush flushes all buffers in the pool 517 | func (bp *BufferPool) Flush() error { 518 | bp.mu.RLock() 519 | defer bp.mu.RUnlock() 520 | 521 | var lastErr error 522 | for 
path, buffer := range bp.buffers { 523 | if err := buffer.Flush(); err != nil { 524 | lastErr = fmt.Errorf("failed to flush buffer %s: %w", path, err) 525 | } 526 | } 527 | 528 | return lastErr 529 | } 530 | -------------------------------------------------------------------------------- /internal/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | /* 4 | rxtls — fast tool in Go for working with Certificate Transparency logs 5 | Copyright (C) 2025 Pepijn van der Stap 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Affero General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Affero General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | import ( 22 | "context" 23 | "log" 24 | "net/http" 25 | "strconv" 26 | "sync" 27 | "time" 28 | 29 | "github.com/prometheus/client_golang/prometheus" 30 | "github.com/prometheus/client_golang/prometheus/promauto" 31 | "github.com/prometheus/client_golang/prometheus/promhttp" 32 | ) 33 | 34 | var ( 35 | registry = prometheus.NewRegistry() 36 | defaultRegisterer = promauto.With(registry) 37 | metricsInitialized sync.Once 38 | metricsEnabled bool 39 | metricsServer *http.Server 40 | ) 41 | 42 | // Metrics contains all the Prometheus metrics for the application 43 | type Metrics struct { 44 | // Certificate processing metrics 45 | CertProcessingDuration *prometheus.HistogramVec 46 | CertProcessedTotal *prometheus.CounterVec 47 | CertFailedTotal *prometheus.CounterVec 48 | 49 | // Network metrics 50 | NetworkRequestDuration *prometheus.HistogramVec 51 | NetworkRequestsTotal *prometheus.CounterVec 52 | NetworkErrorsTotal *prometheus.CounterVec 53 | NetworkRetriesTotal *prometheus.CounterVec 54 | TLSHandshakeDuration *prometheus.HistogramVec 55 | 56 | // Queue metrics 57 | QueueSize *prometheus.GaugeVec 58 | QueueLatency *prometheus.HistogramVec 59 | QueuePressure *prometheus.GaugeVec 60 | QueueCapacity *prometheus.GaugeVec 61 | QueueBackpressureHit *prometheus.CounterVec 62 | 63 | // Worker metrics 64 | WorkerBusy *prometheus.GaugeVec 65 | WorkerProcessed *prometheus.CounterVec 66 | WorkerErrors *prometheus.CounterVec 67 | WorkerPanics *prometheus.CounterVec 68 | WorkerIdleDuration *prometheus.HistogramVec 69 | WorkerRateLimit *prometheus.GaugeVec 70 | 71 | // Disk I/O metrics 72 | DiskWriteDuration *prometheus.HistogramVec 73 | DiskWriteBytes *prometheus.HistogramVec 74 | DiskWriteOps *prometheus.CounterVec 75 | DiskErrors *prometheus.CounterVec 76 | DiskBufferSize *prometheus.GaugeVec 77 | 78 | // Scheduler metrics 79 | SchedulerShardsActive *prometheus.GaugeVec 80 | SchedulerWorkSubmitted *prometheus.CounterVec 81 | SchedulerWorkCompleted *prometheus.CounterVec 82 | SchedulerWorkFailed *prometheus.CounterVec 83 | SchedulerRateLimitDelay *prometheus.HistogramVec 84 | SchedulerRetriesRate *prometheus.GaugeVec 85 | } 86 | 87 | // Global instance of metrics 88 | var globalMetrics *Metrics 89 | var metricsOnce sync.Once 90 | 91 | // GetMetrics returns the global metrics instance 92 | func 
GetMetrics() *Metrics { 93 | metricsOnce.Do(func() { 94 | globalMetrics = newMetrics() 95 | }) 96 | return globalMetrics 97 | } 98 | 99 | // EnableMetrics enables metrics collection 100 | func EnableMetrics() { 101 | metricsEnabled = true 102 | } 103 | 104 | // IsMetricsEnabled returns whether metrics collection is enabled 105 | func IsMetricsEnabled() bool { 106 | return metricsEnabled 107 | } 108 | 109 | // newMetrics creates and registers all metrics 110 | func newMetrics() *Metrics { 111 | buckets := []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60} 112 | byteBuckets := []float64{1024, 10 * 1024, 50 * 1024, 100 * 1024, 500 * 1024, 1000 * 1024, 5000 * 1024, 10000 * 1024} 113 | 114 | m := &Metrics{ 115 | // Certificate processing metrics 116 | CertProcessingDuration: defaultRegisterer.NewHistogramVec( 117 | prometheus.HistogramOpts{ 118 | Name: "rxtls_cert_processing_duration_seconds", 119 | Help: "Time spent processing certificates", 120 | Buckets: buckets, 121 | }, 122 | []string{"log_url", "operation"}, 123 | ), 124 | CertProcessedTotal: defaultRegisterer.NewCounterVec( 125 | prometheus.CounterOpts{ 126 | Name: "rxtls_cert_processed_total", 127 | Help: "Total number of certificates processed", 128 | }, 129 | []string{"log_url", "operation", "status"}, 130 | ), 131 | CertFailedTotal: defaultRegisterer.NewCounterVec( 132 | prometheus.CounterOpts{ 133 | Name: "rxtls_cert_failed_total", 134 | Help: "Total number of certificate processing failures", 135 | }, 136 | []string{"log_url", "operation", "error_type"}, 137 | ), 138 | 139 | // Network metrics 140 | NetworkRequestDuration: defaultRegisterer.NewHistogramVec( 141 | prometheus.HistogramOpts{ 142 | Name: "rxtls_network_request_duration_seconds", 143 | Help: "Time spent on network requests", 144 | Buckets: buckets, 145 | }, 146 | []string{"log_url", "endpoint"}, 147 | ), 148 | NetworkRequestsTotal: defaultRegisterer.NewCounterVec( 149 | prometheus.CounterOpts{ 150 | Name: "rxtls_network_requests_total", 151 | Help: "Total number of network requests", 152 | }, 153 | []string{"log_url", "endpoint", "status"}, 154 | ), 155 | NetworkErrorsTotal: defaultRegisterer.NewCounterVec( 156 | prometheus.CounterOpts{ 157 | Name: "rxtls_network_errors_total", 158 | Help: "Total number of network errors", 159 | }, 160 | []string{"log_url", "endpoint", "error_type"}, 161 | ), 162 | NetworkRetriesTotal: defaultRegisterer.NewCounterVec( 163 | prometheus.CounterOpts{ 164 | Name: "rxtls_network_retries_total", 165 | Help: "Total number of network retries", 166 | }, 167 | []string{"log_url", "endpoint"}, 168 | ), 169 | TLSHandshakeDuration: defaultRegisterer.NewHistogramVec( 170 | prometheus.HistogramOpts{ 171 | Name: "rxtls_tls_handshake_duration_seconds", 172 | Help: "Time spent on TLS handshakes", 173 | Buckets: buckets, 174 | }, 175 | []string{"log_url"}, 176 | ), 177 | 178 | // Queue metrics 179 | QueueSize: defaultRegisterer.NewGaugeVec( 180 | prometheus.GaugeOpts{ 181 | Name: "rxtls_queue_size", 182 | Help: "Current size of work queues", 183 | }, 184 | []string{"worker_id", "log_url"}, 185 | ), 186 | QueueLatency: defaultRegisterer.NewHistogramVec( 187 | prometheus.HistogramOpts{ 188 | Name: "rxtls_queue_latency_seconds", 189 | Help: "Time items spend in queue before processing", 190 | Buckets: buckets, 191 | }, 192 | []string{"worker_id", "log_url"}, 193 | ), 194 | QueuePressure: defaultRegisterer.NewGaugeVec( 195 | prometheus.GaugeOpts{ 196 | Name: "rxtls_queue_pressure", 197 | Help: "Queue pressure as a ratio of current size 
to capacity (0-1)", 198 | }, 199 | []string{"worker_id", "log_url"}, 200 | ), 201 | QueueCapacity: defaultRegisterer.NewGaugeVec( 202 | prometheus.GaugeOpts{ 203 | Name: "rxtls_queue_capacity", 204 | Help: "Maximum capacity of work queues", 205 | }, 206 | []string{"worker_id"}, 207 | ), 208 | QueueBackpressureHit: defaultRegisterer.NewCounterVec( 209 | prometheus.CounterOpts{ 210 | Name: "rxtls_queue_backpressure_hits_total", 211 | Help: "Number of times backpressure was applied due to full queue", 212 | }, 213 | []string{"worker_id", "log_url"}, 214 | ), 215 | 216 | // Worker metrics 217 | WorkerBusy: defaultRegisterer.NewGaugeVec( 218 | prometheus.GaugeOpts{ 219 | Name: "rxtls_worker_busy", 220 | Help: "Whether a worker is currently busy (1) or idle (0)", 221 | }, 222 | []string{"worker_id"}, 223 | ), 224 | WorkerProcessed: defaultRegisterer.NewCounterVec( 225 | prometheus.CounterOpts{ 226 | Name: "rxtls_worker_processed_total", 227 | Help: "Total number of items processed by a worker", 228 | }, 229 | []string{"worker_id", "log_url"}, 230 | ), 231 | WorkerErrors: defaultRegisterer.NewCounterVec( 232 | prometheus.CounterOpts{ 233 | Name: "rxtls_worker_errors_total", 234 | Help: "Total number of errors encountered by a worker", 235 | }, 236 | []string{"worker_id", "log_url", "error_type"}, 237 | ), 238 | WorkerPanics: defaultRegisterer.NewCounterVec( 239 | prometheus.CounterOpts{ 240 | Name: "rxtls_worker_panics_total", 241 | Help: "Total number of panics recovered by a worker", 242 | }, 243 | []string{"worker_id"}, 244 | ), 245 | WorkerIdleDuration: defaultRegisterer.NewHistogramVec( 246 | prometheus.HistogramOpts{ 247 | Name: "rxtls_worker_idle_duration_seconds", 248 | Help: "Time workers spend idle waiting for work", 249 | Buckets: buckets, 250 | }, 251 | []string{"worker_id"}, 252 | ), 253 | WorkerRateLimit: defaultRegisterer.NewGaugeVec( 254 | prometheus.GaugeOpts{ 255 | Name: "rxtls_worker_rate_limit", 256 | Help: "Current rate limit for each worker", 257 | }, 258 | []string{"worker_id"}, 259 | ), 260 | 261 | // Disk I/O metrics 262 | DiskWriteDuration: defaultRegisterer.NewHistogramVec( 263 | prometheus.HistogramOpts{ 264 | Name: "rxtls_disk_write_duration_seconds", 265 | Help: "Time spent writing to disk", 266 | Buckets: buckets, 267 | }, 268 | []string{"log_url", "operation"}, 269 | ), 270 | DiskWriteBytes: defaultRegisterer.NewHistogramVec( 271 | prometheus.HistogramOpts{ 272 | Name: "rxtls_disk_write_bytes_total", 273 | Help: "Total number of bytes written to disk", 274 | Buckets: byteBuckets, 275 | }, 276 | []string{"log_url", "operation"}, 277 | ), 278 | DiskWriteOps: defaultRegisterer.NewCounterVec( 279 | prometheus.CounterOpts{ 280 | Name: "rxtls_disk_write_ops_total", 281 | Help: "Total number of write operations to disk", 282 | }, 283 | []string{"log_url", "operation"}, 284 | ), 285 | DiskErrors: defaultRegisterer.NewCounterVec( 286 | prometheus.CounterOpts{ 287 | Name: "rxtls_disk_errors_total", 288 | Help: "Total number of disk errors", 289 | }, 290 | []string{"log_url", "operation", "error_type"}, 291 | ), 292 | DiskBufferSize: defaultRegisterer.NewGaugeVec( 293 | prometheus.GaugeOpts{ 294 | Name: "rxtls_disk_buffer_size_bytes", 295 | Help: "Size of disk write buffers in bytes", 296 | }, 297 | []string{"log_url", "operation"}, 298 | ), 299 | 300 | // Scheduler metrics 301 | SchedulerShardsActive: defaultRegisterer.NewGaugeVec( 302 | prometheus.GaugeOpts{ 303 | Name: "rxtls_scheduler_shards_active", 304 | Help: "Number of active shards in the scheduler", 305 | }, 306 | 
[]string{"operation"}, 307 | ), 308 | SchedulerWorkSubmitted: defaultRegisterer.NewCounterVec( 309 | prometheus.CounterOpts{ 310 | Name: "rxtls_scheduler_work_submitted_total", 311 | Help: "Total number of work items submitted to the scheduler", 312 | }, 313 | []string{"log_url", "operation"}, 314 | ), 315 | SchedulerWorkCompleted: defaultRegisterer.NewCounterVec( 316 | prometheus.CounterOpts{ 317 | Name: "rxtls_scheduler_work_completed_total", 318 | Help: "Total number of work items completed by the scheduler", 319 | }, 320 | []string{"log_url", "operation"}, 321 | ), 322 | SchedulerWorkFailed: defaultRegisterer.NewCounterVec( 323 | prometheus.CounterOpts{ 324 | Name: "rxtls_scheduler_work_failed_total", 325 | Help: "Total number of work items that failed processing", 326 | }, 327 | []string{"log_url", "operation", "error_type"}, 328 | ), 329 | SchedulerRateLimitDelay: defaultRegisterer.NewHistogramVec( 330 | prometheus.HistogramOpts{ 331 | Name: "rxtls_scheduler_rate_limit_delay_seconds", 332 | Help: "Time spent waiting due to rate limiting", 333 | Buckets: buckets, 334 | }, 335 | []string{"log_url", "operation"}, 336 | ), 337 | SchedulerRetriesRate: defaultRegisterer.NewGaugeVec( 338 | prometheus.GaugeOpts{ 339 | Name: "rxtls_scheduler_retries_rate", 340 | Help: "Rate of retries per second", 341 | }, 342 | []string{"log_url", "operation"}, 343 | ), 344 | } 345 | 346 | return m 347 | } 348 | 349 | // StartMetricsServer starts an HTTP server to expose Prometheus metrics 350 | func StartMetricsServer(addr string) error { 351 | if !metricsEnabled { 352 | return nil 353 | } 354 | 355 | // Only start once 356 | var startErr error 357 | metricsInitialized.Do(func() { 358 | mux := http.NewServeMux() 359 | mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) 360 | 361 | metricsServer = &http.Server{ 362 | Addr: addr, 363 | Handler: mux, 364 | } 365 | 366 | go func() { 367 | log.Printf("Starting metrics server on %s", addr) 368 | if err := metricsServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { 369 | log.Printf("Metrics server error: %v", err) 370 | } 371 | }() 372 | }) 373 | 374 | return startErr 375 | } 376 | 377 | // ShutdownMetricsServer gracefully shuts down the metrics server 378 | func ShutdownMetricsServer(ctx context.Context) error { 379 | if metricsServer != nil { 380 | log.Println("Shutting down metrics server...") 381 | return metricsServer.Shutdown(ctx) 382 | } 383 | return nil 384 | } 385 | 386 | // RecordWithLabels is a helper to record metrics with labels 387 | func (m *Metrics) RecordWithLabels(fn func(), labels prometheus.Labels) { 388 | if !metricsEnabled { 389 | fn() 390 | return 391 | } 392 | 393 | start := time.Now() 394 | fn() 395 | _ = time.Since(start) // Record duration if needed 396 | // This is just a placeholder - actual implementation would depend on the metric type 397 | } 398 | 399 | // MeasureDuration is a helper to measure the duration of a function 400 | func MeasureDuration(histogram *prometheus.HistogramVec, labels prometheus.Labels) func() { 401 | if !metricsEnabled { 402 | return func() {} 403 | } 404 | 405 | start := time.Now() 406 | return func() { 407 | duration := time.Since(start) 408 | histogram.With(labels).Observe(duration.Seconds()) 409 | } 410 | } 411 | 412 | // UpdateQueueMetrics updates queue metrics for a worker 413 | func (m *Metrics) UpdateQueueMetrics(workerID int, logURL string, queueSize, queueCapacity int) { 414 | if !metricsEnabled { 415 | return 416 | } 417 | 418 | 
m.QueueSize.WithLabelValues(strconv.Itoa(workerID), logURL).Set(float64(queueSize)) 419 | m.QueueCapacity.WithLabelValues(strconv.Itoa(workerID)).Set(float64(queueCapacity)) 420 | 421 | if queueCapacity > 0 { 422 | pressure := float64(queueSize) / float64(queueCapacity) 423 | m.QueuePressure.WithLabelValues(strconv.Itoa(workerID), logURL).Set(pressure) 424 | } 425 | } 426 | 427 | // UpdateWorkerRateLimit updates the rate limit metric for a worker 428 | func (m *Metrics) UpdateWorkerRateLimit(workerID int, rateLimit float64) { 429 | if !metricsEnabled { 430 | return 431 | } 432 | 433 | m.WorkerRateLimit.WithLabelValues(strconv.Itoa(workerID)).Set(rateLimit) 434 | } 435 | 436 | // UpdateRetriesRate updates the retries rate metric 437 | func (m *Metrics) UpdateRetriesRate(logURL, operation string, retriesPerSecond float64) { 438 | if !metricsEnabled { 439 | return 440 | } 441 | 442 | m.SchedulerRetriesRate.WithLabelValues(logURL, operation).Set(retriesPerSecond) 443 | } 444 | -------------------------------------------------------------------------------- /internal/util/filename.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package util provides miscellaneous utility functions used across the rxtls application. 3 | These functions are typically small, self-contained, and offer common helper functionalities 4 | that don't belong to a more specific package like `core` or `client`. 5 | */ 6 | package util 7 | 8 | /* 9 | rxtls — fast tool in Go for working with Certificate Transparency logs 10 | Copyright (C) 2025 Pepijn van der Stap 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU Affero General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU Affero General Public License for more details. 21 | 22 | You should have received a copy of the GNU Affero General Public License 23 | along with this program. If not, see . 24 | */ 25 | 26 | import "strings" 27 | 28 | // SanitizeFilename takes an input string (typically a URL or a descriptive name) 29 | // and transforms it into a string that is generally safe to use as a filename 30 | // on common operating systems. 31 | // 32 | // The sanitization process involves: 33 | // 1. Replacing characters that are problematic in filenames (e.g., '/', '\', ':', '*', '?', '"', '<', '>', '|') 34 | // with underscores ('_'). 35 | // 2. Limiting the total length of the filename to a predefined maximum (currently 100 characters) 36 | // to prevent issues with OS filename length limits. 37 | // 38 | // This function is primarily used when generating output filenames based on CT log URLs 39 | // to ensure that the resulting names are valid and do not cause filesystem errors. 40 | // 41 | // Performance: For its intended use (generating a few filenames at the start of processing a log), 42 | // the performance of this function is not critical. It uses standard string manipulation functions. 43 | // 44 | // Parameters: 45 | // input: The string to be sanitized into a filename-safe format. 46 | // 47 | // Returns: 48 | // A sanitized string suitable for use as a filename. 
49 | func SanitizeFilename(input string) string { 50 | // Replace common problematic characters with an underscore. 51 | // This set can be expanded if other problematic characters are identified. 52 | replaced := strings.Map(func(r rune) rune { 53 | switch r { 54 | case '/', '\\', ':', '*', '?', '"', '<', '>', '|': // Common invalid filename chars on Windows/Unix. 55 | return '_' 56 | } 57 | return r // Keep other characters as they are. 58 | }, input) 59 | 60 | // Limit filename length to avoid issues with operating system limits. 61 | // A maxLength of 100 is a conservative choice, well within typical FS limits (e.g., 255 bytes). 62 | const maxLength = 100 63 | if len(replaced) > maxLength { 64 | // Truncate the string if it exceeds the maximum length. 65 | // Note: This is a simple truncation. For multi-byte character sets (UTF-8), 66 | // this could potentially cut a character in half if not careful. However, for URLs 67 | // and typical log names, this is often acceptable. More robust truncation would 68 | // require rune-aware iteration. 69 | return replaced[:maxLength] 70 | } 71 | return replaced 72 | } 73 | --------------------------------------------------------------------------------
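As a closing illustration (not part of the repository), the sketch below shows one way the pieces dumped above — the adaptive RateLimiter, the AsyncBuffer/BufferPool writer, and SanitizeFilename — might be wired together from code living inside this module; the internal/ packages are only importable from within the module itself, and the log URL, output directory, and loop bounds are made-up values.

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/x-stp/rxtls/internal/core"
	rxio "github.com/x-stp/rxtls/internal/io"
	"github.com/x-stp/rxtls/internal/util"
)

func main() {
	ctx := context.Background()

	// Adaptive limiter starting at 10 requests per second.
	limiter := core.NewRateLimiter(10)

	// One asynchronously flushed buffer per output file, keyed by path.
	opts := rxio.DefaultAsyncBufferOptions()
	opts.AlignWrites = false // plain buffered writes are enough for this sketch
	pool := rxio.NewBufferPool(ctx, opts)
	defer func() { _ = pool.Close() }()

	logURL := "https://ct.example.org/2025/" // made-up log URL
	path := "output/" + util.SanitizeFilename(logURL) + ".csv"

	buf, err := pool.GetBuffer(path)
	if err != nil {
		log.Fatalf("open buffer: %v", err)
	}

	for i := 0; i < 5; i++ {
		// Spin until the token bucket releases a slot.
		for !limiter.Allow() {
			time.Sleep(10 * time.Millisecond)
		}
		if _, err := buf.Write([]byte(fmt.Sprintf("entry %d\n", i))); err != nil {
			limiter.RecordFailure()
			continue
		}
		limiter.RecordSuccess()
	}
}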