├── agents └── .gitkeep ├── gomodules ├── RELEASE_VERSION.txt ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── custom.md │ ├── feature_request.md │ └── bug_report.md ├── release.yml ├── workflows │ ├── dependency-review.yml │ ├── go.yml │ ├── go-vulncheck.yml │ ├── 120_fix_latest_multiarch.yml │ └── codeql.yml └── dependabot.yml ├── tests ├── perf │ ├── run_api_all │ ├── run_api_only_tests │ ├── run_api_db_tests │ ├── run_events_create │ ├── api_get_all_source_status.js │ ├── api_get_healthcheck.js │ └── event_create_events.js ├── fuzz │ ├── run_api_all │ └── api_fuzz_payloads.txt └── README.md ├── doc ├── image.png ├── GeneralArchitecture.jpg ├── TheCrowlerBasicArchitecture.graffle ├── TheCrowlerBasicArchitecture │ ├── Rules&Plugins.png │ └── GeneralArchitecture.png ├── README.md ├── architecture.md ├── installation.md ├── agents.md ├── test_policy.md ├── env_vars.md ├── sources.md └── ruleset_architecture.md ├── pkg ├── ruleset │ ├── testdata │ │ └── fuzz │ │ │ └── FuzzParseRuleset │ │ │ └── 582528ddfad69eb5 │ ├── common_test.go │ ├── ruleset_fuzz_test.go │ ├── test-ruleset.yaml │ ├── actionrule.go │ ├── crawlingrule.go │ └── scrapingrule.go ├── database │ ├── postgresql-setup.sh │ ├── README.md │ ├── common.go │ ├── queries_types.go │ ├── database.go │ ├── queries.go │ └── database_test.go ├── common │ ├── network_test.go │ ├── json_parser.go │ ├── slices_test.go │ ├── urls_test.go │ ├── generic_consts.go │ ├── encoding_test.go │ ├── encoding.go │ ├── interfaces_test.go │ ├── locks.go │ ├── url.go │ ├── env_templates.go │ ├── network.go │ ├── slices.go │ └── types.go ├── config │ └── test-config.yml ├── search │ ├── search.go │ └── exec.go ├── crawler │ ├── test_data │ │ ├── invalid_ruleset.yaml │ │ ├── source-config.json │ │ ├── example-ruleset.yaml │ │ └── test-ruleset.yaml │ ├── consts.go │ ├── fuzzing_test.go │ └── fuzzing_rules.go ├── httpinfo │ └── jarm_collector_test.go ├── fingerprints │ ├── types.go │ ├── sha256.go │ ├── murmurhash.go │ ├── ctls.go 
│ ├── blake2.go │ ├── hassh.go │ ├── hassh_server.go │ ├── ja3.go │ ├── tlsh.go │ ├── simhash.go │ ├── minhash.go │ ├── factory.go │ └── ja4.go ├── exprterpreter │ └── types.go ├── plugin │ └── types.go ├── netinfo │ ├── netinfo_test.go │ └── helper.go └── agent │ ├── setup_test.go │ ├── agentmeta.go │ ├── action_run_db_query.go │ └── execute_isolated_unix_other.go ├── images ├── crowler-vdi-bg.png └── TheCROWler_v1JPG.jpg ├── scripts ├── check_ut_coverage.sh ├── apply-limits.sh └── containerd.sh ├── commitlint.config.js ├── plugins └── RemoveArticleContent.js ├── CODE_OF_CONDUCT.md ├── selenium-patches ├── 4.21.0 │ ├── Dockerfile_Base_ARM64_4.21.0.patch │ ├── Dockerfile │ └── Dockerfile_Chromium_ARM64_4.21.0.patch ├── browserAutomation.conf ├── 4.23.1 │ └── Dockerfile_Base_ARM64_4.23.1.patch ├── 4.20.0 │ ├── Dockerfile_Base_ARM64_4.20.0.patch │ └── Dockerfile ├── 4.18.1 │ ├── Dockerfile_Base_ARM64_4.18.1.patch │ └── Dockerfile ├── 4.19.1 │ ├── Dockerfile_Base_ARM64_4.18.1.patch │ ├── Dockerfile_Base_ARM64_4.19.1.patch │ └── Dockerfile ├── 4.27.0 │ └── Makefile-fixed.patch └── 4.24.0 │ └── Dockerfile ├── CONTRIBUTORS.md ├── Dockerfile.db ├── config.default.remote ├── .eslintrc.js ├── .gitignore ├── .golangci.yml ├── codacy.yml ├── schemas ├── crowler-event-schema.json └── crowler-source-categories-schema.json ├── services └── events │ └── types.go ├── rules └── AcceptCookies-ruleset.json ├── .pre-commit-config.yaml ├── SECURITY.md ├── docker-rebuild.sh ├── Dockerfile.searchapi └── Dockerfile.events /agents/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gomodules: -------------------------------------------------------------------------------- 1 | ./ 2 | -------------------------------------------------------------------------------- /RELEASE_VERSION.txt: -------------------------------------------------------------------------------- 1 | 
v1.0.7 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [pzaino] 2 | -------------------------------------------------------------------------------- /tests/perf/run_api_all: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | k6 run ./tests/perf/api_*.js 4 | -------------------------------------------------------------------------------- /doc/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pzaino/thecrowler/HEAD/doc/image.png -------------------------------------------------------------------------------- /pkg/ruleset/testdata/fuzz/FuzzParseRuleset/582528ddfad69eb5: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | []byte("0") 3 | -------------------------------------------------------------------------------- /tests/perf/run_api_only_tests: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | k6 run ./tests/perf/api_get_healthcheck.js 4 | -------------------------------------------------------------------------------- /images/crowler-vdi-bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pzaino/thecrowler/HEAD/images/crowler-vdi-bg.png -------------------------------------------------------------------------------- /tests/perf/run_api_db_tests: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | k6 run ./tests/perf/api_get_all_source_status.js 4 | -------------------------------------------------------------------------------- /doc/GeneralArchitecture.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pzaino/thecrowler/HEAD/doc/GeneralArchitecture.jpg -------------------------------------------------------------------------------- /images/TheCROWler_v1JPG.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pzaino/thecrowler/HEAD/images/TheCROWler_v1JPG.jpg -------------------------------------------------------------------------------- /doc/TheCrowlerBasicArchitecture.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pzaino/thecrowler/HEAD/doc/TheCrowlerBasicArchitecture.graffle -------------------------------------------------------------------------------- /scripts/check_ut_coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | go test -coverprofile=coverage.out ./... 4 | 5 | go tool cover -func=coverage.out 6 | -------------------------------------------------------------------------------- /doc/TheCrowlerBasicArchitecture/Rules&Plugins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pzaino/thecrowler/HEAD/doc/TheCrowlerBasicArchitecture/Rules&Plugins.png -------------------------------------------------------------------------------- /doc/TheCrowlerBasicArchitecture/GeneralArchitecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pzaino/thecrowler/HEAD/doc/TheCrowlerBasicArchitecture/GeneralArchitecture.png -------------------------------------------------------------------------------- /tests/fuzz/run_api_all: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ffuf -w ./api_fuzz_payloads.txt -X GET -H "Content-Type: application/json" -u "http://localhost:8080/v1/webobject?q=FUZZ" -mr "HTTP/1.1 200 OK" 4 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /commitlint.config.js: -------------------------------------------------------------------------------- 1 | // Define module if it's not defined 2 | if (typeof module != 'undefined') { 3 | let module = {}; 4 | // Define module exports 5 | module.exports = {extends: ['@commitlint/config-conventional']}; 6 | } 7 | -------------------------------------------------------------------------------- /plugins/RemoveArticleContent.js: -------------------------------------------------------------------------------- 1 | // name: RemoveArticleContent 2 | // description: This is an example of a plugin that removes empty tags from the HTML. 3 | // type: vdi_plugin 4 | 5 | document.querySelector('div.article-content').remove(); 6 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | In the interest of fostering an open and welcoming environment, we as 4 | contributors and maintainers pledge to making participation in our project 5 | and our community a harassment-free experience for everyone. 6 | -------------------------------------------------------------------------------- /tests/perf/run_events_create: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check in which subdirectory are we, if we are already in "./tests/perf" we don't need to change directory 4 | test_path="." 
5 | if [[ $(basename "$PWD") != "perf" ]]; then 6 | # If we are not in the perf directory, change to it 7 | test_path="./tests/perf" 8 | fi 9 | 10 | k6 run ${test_path}/event_create_events.js 11 | -------------------------------------------------------------------------------- /selenium-patches/4.21.0/Dockerfile_Base_ARM64_4.21.0.patch: -------------------------------------------------------------------------------- 1 | --- ./docker-selenium/Base/Dockerfile 2024-09-06 22:21:39 2 | +++ ./docker-selenium/Base/Dockerfile_Base_ARM64_4.21.0 2024-09-06 22:26:52 3 | @@ -1,4 +1,4 @@ 4 | -FROM ubuntu:jammy-20240427 5 | +FROM arm64v8/ubuntu:jammy-20240427 6 | LABEL authors="Selenium " 7 | 8 | # Arguments to define the version of dependencies to download 9 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # List of contributors to the project (Alphabetical order) 2 | 3 | - James Bettke (QA and Testing) 4 | - [Jeff Foley](https://github.com/caffix) (QA and Testing, JARM fingerprint idea) 5 | - [Adem Rosic](https://github.com/The-Inceptions) (QA and Testing) 6 | - [Fabio Suarez](https://github.com/fabio-o0) (QA and Testing, Search Engine module) 7 | - [Paolo Fabio Zaino](https://github.com/pzaino) (Original Author and Developer) 8 | -------------------------------------------------------------------------------- /pkg/database/postgresql-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Perform all actions as $DOCKER_POSTGRES_USER 5 | export PGPASSWORD="$POSTGRES_PASSWORD" 6 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" \ 7 | -v POSTGRES_DB="$POSTGRES_DB" \ 8 | -v CROWLER_DB_USER="$CROWLER_DB_USER" \ 9 | -v CROWLER_DB_PASSWORD="$CROWLER_DB_PASSWORD" \ 10 | -f /docker-entrypoint-initdb.d/postgresql-setup-v1.5.pgsql 11 | 
-------------------------------------------------------------------------------- /pkg/database/README.md: -------------------------------------------------------------------------------- 1 | # CROWler supported Database Managers 2 | 3 | Although if I am working to add support for multiple database managers, the only 4 | one supported at the moment is PostgreSQL. The database manager is used to store 5 | the data collected by the CROWler. The database manager is also used to store 6 | the sources etc. of the CROWler. 7 | 8 | Work in progress to add support for other database managers: 9 | 10 | - SQLite 11 | - MySQL / MariaDB 12 | -------------------------------------------------------------------------------- /pkg/common/network_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestDetectLocalNetwork_RealEnvironment(t *testing.T) { 10 | cidr, err := DetectLocalNetwork() 11 | 12 | if err != nil { 13 | t.Errorf("Error detecting local network: %v", err) 14 | } else { 15 | t.Logf("Detected local network CIDR: %s", cidr) 16 | assert.NotEmpty(t, cidr, "Local network CIDR should not be empty") 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Dockerfile.db: -------------------------------------------------------------------------------- 1 | FROM postgres@sha256:4d89c904835259bc58876520e56267ca07a4ebd6a027f7814bbbf91b50d685be 2 | 3 | # Set build-time default timezone (can be overridden at runtime) 4 | ARG TIMEZONE=UTC 5 | ENV TZ=$TIMEZONE 6 | 7 | # Copy database init scripts 8 | COPY ./pkg/database/postgresql-setup.sh /docker-entrypoint-initdb.d/init.sh 9 | COPY ./pkg/database/postgresql-setup-v1.5.pgsql /docker-entrypoint-initdb.d/postgresql-setup-v1.5.pgsql 10 | 11 | # Make sure init script is executable 12 | RUN chmod +x /docker-entrypoint-initdb.d/init.sh 13 | 
-------------------------------------------------------------------------------- /config.default.remote: -------------------------------------------------------------------------------- 1 | version: "1.0.0" 2 | author: "Paolo Fabio Zaino" 3 | description: "Reusable remote configuration for The CROWler Engine instances" 4 | created_at: "2025-09-07T12:00:00" 5 | 6 | remote: 7 | host: "${CROWLER_DISTRIBUTION_HOST}" 8 | path: "${CROWLER_DISTRIBUTION_PATH}" 9 | port: ${CROWLER_DISTRIBUTION_PORT} 10 | type: "http" 11 | region: "${CROWLER_DISTRIBUTION_REGION}" 12 | token: "${CROWLER_DISTRIBUTION_TOKEN}" 13 | secret: "${CROWLER_DISTRIBUTION_SECRET}" 14 | timeout: ${CROWLER_DISTRIBUTION_TIMEOUT} 15 | sslmode: "${CROWLER_DISTRIBUTION_SSLMODE}" 16 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | changelog: 2 | exclude: 3 | labels: 4 | - skip-changelog 5 | - dependencies 6 | authors: 7 | - dependabot 8 | - github-actions 9 | categories: 10 | - title: 🚀 Features 11 | labels: [feature, enhancement] 12 | - title: 🐛 Bug Fixes 13 | labels: [bug, fix] 14 | - title: 🧰 Maintenance 15 | labels: [chore, refactor, ci, docs] 16 | - title: 🔒 Security 17 | labels: [security] 18 | - title: ⚠️ Breaking Changes 19 | labels: [breaking-change] 20 | - title: Other Changes 21 | labels: ["*"] # catch-all for unlabeled PRs 22 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "env": { 3 | "browser": true, 4 | "es2021": true 5 | }, 6 | "extends": "eslint:recommended", 7 | "overrides": [ 8 | { 9 | "env": { 10 | "node": true 11 | }, 12 | "files": [ 13 | ".eslintrc.{js,cjs}" 14 | ], 15 | "parserOptions": { 16 | "sourceType": "script" 17 | } 18 | } 19 | ], 20 | "parserOptions": { 21 | "ecmaVersion": "latest", 22 | 
"sourceType": "module" 23 | }, 24 | "rules": { 25 | }, 26 | }; 27 | -------------------------------------------------------------------------------- /pkg/common/json_parser.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | // JsonParser recursively traverses the JSON document (represented as map[string]interface{}) 4 | // following the sequence of keys provided. 5 | func JsonParser(doc map[string]interface{}, keys ...string) interface{} { 6 | if len(keys) == 0 { 7 | return doc 8 | } 9 | 10 | key := keys[0] 11 | value, exists := doc[key] 12 | if !exists { 13 | return nil // key not found 14 | } 15 | 16 | if len(keys) == 1 { 17 | return value 18 | } 19 | 20 | nestedDoc, ok := value.(map[string]interface{}) 21 | if !ok { 22 | return nil // value is not a map so we cannot traverse further 23 | } 24 | 25 | return JsonParser(nestedDoc, keys[1:]...) 26 | } 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. 12 | Ex. I'm always frustrated when [...] 13 | 14 | **Describe the solution you'd like** 15 | A clear and concise description of what you want to happen. 16 | 17 | **Describe alternatives you've considered** 18 | A clear and concise description of any alternative solutions or features 19 | you've considered. 20 | 21 | **Additional context** 22 | Add any other context or screenshots about the feature request here. 
23 | -------------------------------------------------------------------------------- /pkg/config/test-config.yml: -------------------------------------------------------------------------------- 1 | database: 2 | type: postgres 3 | host: localhost 4 | port: 5432 5 | user: ${DB_USER} 6 | password: ${DB_PASSWORD} 7 | dbname: SitesIndex 8 | crawler: 9 | workers: 5 10 | depth: 1 11 | delay: "2" 12 | timeout: 10 13 | maintenance: 60 14 | api: 15 | port: 8080 16 | host: 0.0.0.0 17 | timeout: 10 18 | selenium: 19 | - type: chrome 20 | path: "" 21 | port: 4444 22 | headless: true 23 | host: localhost 24 | network_info: 25 | dns: 26 | enabled: true 27 | whois: 28 | enabled: true 29 | httpinfo: 30 | enabled: true 31 | sslinfo: 32 | enabled: true 33 | geo_localization: 34 | enabled: false 35 | path: "" 36 | 37 | debug_level: 1 38 | -------------------------------------------------------------------------------- /pkg/common/slices_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestPrepareSlice(t *testing.T) { 9 | slice := []string{" Hello ", " World ", " Gopher "} 10 | expected := []string{"hello", "world", "gopher"} 11 | 12 | result := PrepareSlice(&slice, 3) 13 | 14 | if !reflect.DeepEqual(result, expected) { 15 | t.Errorf("PrepareSlice() = %v, want %v", result, expected) 16 | } 17 | } 18 | 19 | func TestSliceContains(t *testing.T) { 20 | slice := []string{"apple", "banana", "cherry"} 21 | item := "banana" 22 | expected := true 23 | result := SliceContains(slice, item) 24 | if result != expected { 25 | t.Errorf("SliceContains() = %v, want %v", result, expected) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tests/perf/api_get_all_source_status.js: -------------------------------------------------------------------------------- 1 | import http from 'k6/http'; 2 | import { sleep, check } from 'k6'; 
3 | 4 | const test_target = 1000; 5 | 6 | export let options = { 7 | stages: [ 8 | { duration: '10s', target: test_target / 10 }, // Ramp-up to 100 VUs over 10 seconds 9 | { duration: '30s', target: test_target }, // Then sustain test_target VUs for 30 seconds 10 | { duration: '10s', target: 0 }, // Ramp-down to 0 VUs 11 | ], 12 | rps: test_target, // Force test_target requests per second 13 | }; 14 | 15 | export default function () { 16 | let res = http.get('http://localhost:8080/v1/get_all_source_status', { timeout: '10s' }); 17 | check(res, { 18 | 'is status 200': (r) => r.status === 200, 19 | }); 20 | } 21 | -------------------------------------------------------------------------------- /pkg/database/common.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package database is responsible for handling the database setup, configuration and abstraction. 16 | package database 17 | -------------------------------------------------------------------------------- /pkg/search/search.go: -------------------------------------------------------------------------------- 1 | // Package search implements the search functionality for TheCrowler. 
2 | package search 3 | 4 | import ( 5 | "database/sql" 6 | 7 | cfg "github.com/pzaino/thecrowler/pkg/config" 8 | cdb "github.com/pzaino/thecrowler/pkg/database" 9 | ) 10 | 11 | // Searcher is the search engine kernel. 12 | // It generates SQL from dorking queries and returns raw DB rows. 13 | type Searcher struct { 14 | DB *cdb.Handler 15 | Config cfg.Config 16 | } 17 | 18 | // QueryResult contains raw DB results. 19 | type QueryResult struct { 20 | Rows *sql.Rows 21 | Limit int 22 | Offset int 23 | SQL string 24 | Params []any 25 | } 26 | 27 | // NewSearcher creates a new search engine kernel. 28 | func NewSearcher(db *cdb.Handler, cfg cfg.Config) *Searcher { 29 | return &Searcher{ 30 | DB: db, 31 | Config: cfg, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /pkg/common/urls_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import "testing" 4 | 5 | func TestNormalizeURL(t *testing.T) { 6 | tests := []struct { 7 | name string 8 | input string 9 | expected string 10 | }{ 11 | { 12 | name: "Lowercase", 13 | input: "http://example.com/", 14 | expected: "http://example.com", 15 | }, 16 | { 17 | name: "TrimSpaces", 18 | input: " http://example.com/ ", 19 | expected: "http://example.com", 20 | }, 21 | { 22 | name: "TrimTrailingSlash", 23 | input: "http://example.com/", 24 | expected: "http://example.com", 25 | }, 26 | } 27 | 28 | for _, test := range tests { 29 | t.Run(test.name, func(t *testing.T) { 30 | res := NormalizeURL(test.input) 31 | if res != test.expected { 32 | t.Errorf("expected %q, got %q", test.expected, res) 33 | } 34 | }) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /pkg/crawler/test_data/invalid_ruleset.yaml: -------------------------------------------------------------------------------- 1 | - ruleset_name: example.com 2 | rule_groups: 3 | - group_name: "Group1" 4 | valid_from: 
"2021-01-01" 5 | valid_to: "invalid-date-format" 6 | is_enabled: true 7 | rules: 8 | - path: "/articles" 9 | elements: 10 | title: "h1.article-title" 11 | content: "div.article-content" 12 | date: "span.date" 13 | js_files: true 14 | technology_patterns: 15 | - "jquery" 16 | - "bootstrap" 17 | 18 | - ruleset_name: another-example.com 19 | rule_groups: 20 | - group_name: "GroupA" 21 | valid_from: "2021-01-01" 22 | valid_to: "2023-12-31" 23 | is_enabled: "should be boolean" 24 | rules: 25 | - path: "/products" 26 | elements: 27 | name: 1 28 | price: 100 29 | -------------------------------------------------------------------------------- /pkg/search/exec.go: -------------------------------------------------------------------------------- 1 | // Package search implements the search functionality for TheCrowler. 2 | package search 3 | 4 | import ( 5 | "strconv" 6 | //cdb "github.com/pzaino/thecrowler/pkg/database" 7 | ) 8 | 9 | // ExecParsed executes a parsed query and returns the results. 10 | func (s *Searcher) ExecParsed(p *ParsedQuery) (*QueryResult, error) { 11 | sqlQuery := p.sqlQuery 12 | params := p.sqlParams 13 | 14 | limitIndex := len(params) - 1 15 | offsetIndex := len(params) 16 | 17 | sqlQuery += " LIMIT $" + strconv.Itoa(limitIndex) + 18 | " OFFSET $" + strconv.Itoa(offsetIndex) + ";" 19 | 20 | rows, err := (*s.DB).ExecuteQuery(sqlQuery, params...) 21 | if err != nil { 22 | return nil, err 23 | } 24 | 25 | return &QueryResult{ 26 | Rows: rows, 27 | Limit: p.limit, 28 | Offset: p.offset, 29 | SQL: sqlQuery, 30 | Params: params, 31 | }, nil 32 | } 33 | -------------------------------------------------------------------------------- /pkg/httpinfo/jarm_collector_test.go: -------------------------------------------------------------------------------- 1 | package httpinfo 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | // TestJARMCollector_Collect tests the Collect method of the JARMCollector. 
9 | func TestJARMCollector_Collect(t *testing.T) { 10 | if os.Getenv("GITHUB_ACTIONS") == "true" { 11 | t.Skip("Skipping this test in GitHub Actions.") 12 | } 13 | 14 | jc := JARMCollector{ 15 | Proxy: nil, // Set the proxy configuration if needed 16 | } 17 | 18 | host := "example.com" 19 | port := "443" 20 | 21 | jarm, err := jc.Collect(host, port) 22 | 23 | if err != nil { 24 | t.Errorf("Unexpected error: %v", err) 25 | } 26 | 27 | // Print JARM 28 | t.Logf("JARM: %s", jarm) 29 | 30 | // Add assertions to validate the JARM fingerprint 31 | // For example: 32 | // if jarm != "expected_jarm" { 33 | // t.Errorf("Expected JARM: %s, got: %s", "expected_jarm", jarm) 34 | // } 35 | } 36 | -------------------------------------------------------------------------------- /selenium-patches/browserAutomation.conf: -------------------------------------------------------------------------------- 1 | [program:browserAutomation] 2 | priority=13 3 | command=/opt/bin/rbee 4 | autostart=true 5 | autorestart=true 6 | user=seluser 7 | 8 | ;Logs (all Rbee should be visible in the docker logs) 9 | redirect_stderr=true 10 | stdout_logfile=/dev/stdout 11 | stdout_logfile_maxbytes=0 12 | 13 | [program:dbus] 14 | priority=0 15 | command=/usr/bin/dbus-daemon --session --nofork --address=unix:path=/tmp/dbus-socket 16 | user=seluser 17 | autostart=true 18 | autorestart=true 19 | 20 | ;Logs (all DBus activity redirected to the log files below) 21 | redirect_stderr=false 22 | stdout_logfile=/var/log/supervisor/dbus-stdout.log 23 | stderr_logfile=/var/log/supervisor/dbus-stderr.log 24 | stdout_logfile_maxbytes=50MB 25 | stderr_logfile_maxbytes=50MB 26 | stdout_logfile_backups=5 27 | stderr_logfile_backups=5 28 | stdout_capture_maxbytes=50MB 29 | stderr_capture_maxbytes=50MB 30 | -------------------------------------------------------------------------------- /pkg/fingerprints/types.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo 
Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | // Fingerprint is the interface that wraps the basic Compute method. 19 | type Fingerprint interface { 20 | Compute(data string) string 21 | } 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | 1. Do '...' 17 | 2. Do '....' 18 | 3. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 
25 | 26 | **Config.yaml interested portion:** 27 | 28 | ```yaml 29 | # Add the portion of the config.yaml that you think is relevant to the issue 30 | ``` 31 | 32 | **Rules interested portion:** 33 | 34 | ```yaml 35 | # Add the portion of the rules that you think is relevant to the issue 36 | ``` 37 | 38 | **Additional context** 39 | Add any other context about the problem here. 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | 23 | # API configuration files 24 | app_config.go 25 | config.yaml 26 | config.sh 27 | .env 28 | .DS_Store 29 | 30 | # build artefacts 31 | .vscode/ 32 | docker-selenium/ 33 | Rbee/ 34 | bin/ 35 | images/*.png 36 | 37 | # Testing environments 38 | sonar-project.properties 39 | .scannerwork/** 40 | AllCertificatesRecordReport.csv 41 | test*.json 42 | /cmd/bench 43 | /test_url.csv 44 | coverage.out 45 | -------------------------------------------------------------------------------- /tests/perf/api_get_healthcheck.js: -------------------------------------------------------------------------------- 1 | import http from 'k6/http'; 2 | import { sleep, check } from 'k6'; 3 | 4 | const test_target = 1000; 5 | 6 | export let options = { 7 | stages: [ 8 | { duration: '10s', target: test_target / 10 }, // Ramp-up to 100 VUs over 10 seconds 9 | { duration: 
'30s', target: test_target }, // Then sustain test_target VUs for 30 seconds 10 | { duration: '10s', target: 0 }, // Ramp-down to 0 VUs 11 | ], 12 | rps: test_target, // Force test_target requests per second 13 | }; 14 | 15 | /* 16 | * This test script sends a GET request to the health check endpoint 17 | * and checks if the response status code is 200. 18 | * This perf test allows to measure performance of the API without 19 | * having the DB queries in the way. 20 | */ 21 | export default function () { 22 | let res = http.get('http://localhost:8080/v1/health', { timeout: '10s' }); 23 | check(res, { 24 | 'is status 200': (r) => r.status === 200, 25 | }); 26 | } 27 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | # .golangci.yml 2 | run: 3 | timeout: 5m 4 | tests: false 5 | 6 | linters-settings: 7 | gocyclo: 8 | min-complexity: 45 9 | 10 | staticcheck: {} 11 | 12 | goconst: 13 | min-len: 3 14 | min-occurrences: 2 15 | 16 | dupl: 17 | threshold: 50 18 | 19 | linters: 20 | enable: 21 | - govet # Use govet instead of maligned (it has 'fieldalignment' check) 22 | - gocyclo 23 | - gosec 24 | - revive 25 | - goconst 26 | # - dupl 27 | - unused # Replaces structcheck, varcheck, and deadcode 28 | - ineffassign 29 | - typecheck 30 | - nakedret 31 | - misspell 32 | - dogsled 33 | 34 | disable: 35 | - lll # Line length linter, often too restrictive 36 | - funlen # Function length linter, can be noisy for large projects 37 | 38 | issues: 39 | exclude-use-default: false 40 | max-issues-per-linter: 0 41 | max-same-issues: 0 42 | 43 | output: 44 | sort-results: true 45 | -------------------------------------------------------------------------------- /pkg/common/generic_consts.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | const ( 4 | // EnableStr is a constant for the string 
"enable". 5 | EnableStr = "enable" 6 | // DisableStr is a constant for the string "disable". 7 | DisableStr = "disable" 8 | // LoalhostStr is a constant for the string "localhost". 9 | LoalhostStr = "localhost" 10 | // LocalStr is a constant for the string "local". 11 | LocalStr = "local" 12 | // NowhereStr is a constant for the string "nowhere". 13 | NowhereStr = "nowhere" 14 | // AlwaysStr is a constant for the string "always". 15 | AlwaysStr = "always" 16 | // ClickStr is a constant for the string "click". 17 | ClickStr = "click" 18 | // LClickStr is a constant for the string "left_click". 19 | LClickStr = "left_click" 20 | // RClickStr is a constant for the string "right_click". 21 | RClickStr = "right_click" 22 | // HTTPStr is a constant for the string "http". 23 | HTTPStr = "http" 24 | // HTTPSStr is a constant for the string "https". 25 | HTTPSStr = "https" 26 | ) 27 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Integration, Performance and Fuzzing tests 2 | 3 | To run these tests you'll need to install some tools: 4 | 5 | ## Performance tests 6 | 7 | I use k6 for the API performance tests. 8 | 9 | On mac: 10 | 11 | ```bash 12 | brew install k6 13 | ``` 14 | 15 | On Linux: 16 | 17 | - Install go lang first 18 | 19 | ```bash 20 | go install go.k6.io/k6@latest 21 | ``` 22 | 23 | To run the tests: 24 | 25 | - Make sure the API is app and running and reachable via localhost:8080 26 | 27 | ```bash 28 | cd ./tests/perf 29 | ./run_api_all 30 | ``` 31 | 32 | ## API Fuzzing tests 33 | 34 | I use ffuf to fuzz the API. 
35 | 36 | on mac: 37 | 38 | ```bash 39 | brew install ffuf 40 | ``` 41 | 42 | On Linux: 43 | 44 | - First make sure you have go installed 45 | 46 | - Then 47 | 48 | ```bash 49 | go install github.com/ffuf/ffuf/v2@latest 50 | ``` 51 | 52 | To run the tests: 53 | 54 | - Make sure the API is up and running and reachable to localhost:8080 55 | 56 | - Then 57 | 58 | ```bash 59 | cd ./tests/fuzz 60 | ./run_api_all 61 | ``` 62 | -------------------------------------------------------------------------------- /codacy.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | engines: 4 | eslint: 5 | enabled: true 6 | gosec: 7 | enabled: true 8 | shellcheck: 9 | enabled: true 10 | bandit: 11 | enabled: true 12 | 13 | ignore_paths: 14 | - "doc/*" 15 | - "bin/*" 16 | - ".github/*" 17 | - "data/*" 18 | - "tests/*" 19 | - "**/*_test.go" 20 | 21 | 22 | tools: 23 | eslint: 24 | config: 25 | extends: eslint:recommended 26 | env: 27 | node: true 28 | es6: true 29 | parserOptions: 30 | ecmaVersion: 2020 31 | 32 | gosec: 33 | enabled: true 34 | rules: 35 | - G101 36 | - G102 37 | - G103 38 | - G104 39 | - G106 40 | - G107 41 | - G201 42 | - G202 43 | - G203 44 | - G204 45 | - G301 46 | - G302 47 | - G303 48 | - G304 49 | - G305 50 | - G401 51 | - G402 52 | - G403 53 | - G404 54 | - G501 55 | - G502 56 | - G503 57 | - G601 58 | 59 | shellcheck: 60 | severity: error 61 | exclude: 62 | - SC1091 63 | 64 | bandit: 65 | confidence: high 66 | severity: medium 67 | -------------------------------------------------------------------------------- /.github/workflows/dependency-review.yml: -------------------------------------------------------------------------------- 1 | # Dependency Review Action 2 | # 3 | # This Action will scan dependency manifest files that change as part of a Pull Request, 4 | # surfacing known-vulnerable versions of the packages declared or updated in the PR. 
5 | # Once installed, if the workflow run is marked as required, 6 | # PRs introducing known-vulnerable packages will be blocked from merging. 7 | # 8 | # Source repository: https://github.com/actions/dependency-review-action 9 | name: 'Dependency Review' 10 | on: [pull_request] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | dependency-review: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Harden Runner 20 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 21 | with: 22 | egress-policy: audit 23 | 24 | - name: 'Checkout Repository' 25 | uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 26 | - name: 'Dependency Review' 27 | uses: actions/dependency-review-action@0659a74c94536054bfa5aeb92241f70d680cc78e # v4 28 | -------------------------------------------------------------------------------- /pkg/common/encoding_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | const ( 8 | helloWorld = "Hello, World!" 
9 | ) 10 | 11 | func TestBase64Encode(t *testing.T) { 12 | data := helloWorld 13 | expected := "SGVsbG8sIFdvcmxkIQ==" 14 | result := Base64Encode(data) 15 | if result != expected { 16 | t.Errorf("Base64Encode(%s) = %s, expected %s", data, result, expected) 17 | } 18 | } 19 | 20 | func TestCalculateEntropy(t *testing.T) { 21 | data := helloWorld 22 | expected := 3.1808329877552226 23 | result := CalculateEntropy(data) 24 | expectedCheck := float32(expected) 25 | resultCheck := float32(result) 26 | if resultCheck != expectedCheck { 27 | t.Errorf("CalculateEntropy(%s) = %f, expected %f", data, resultCheck, expectedCheck) 28 | } 29 | } 30 | 31 | func TestBase64Decode(t *testing.T) { 32 | data := "SGVsbG8sIFdvcmxkIQ==" 33 | expected := helloWorld 34 | result, err := Base64Decode(data) 35 | if err != nil { 36 | t.Errorf("Base64Decode(%s) returned an error: %v", data, err) 37 | } 38 | if result != expected { 39 | t.Errorf("Base64Decode(%s) = %s, expected %s", data, result, expected) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pkg/fingerprints/sha256.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | "crypto/sha256" 20 | "encoding/hex" 21 | ) 22 | 23 | // SHA256 implements the Fingerprint interface for SHA-256 fingerprints. 24 | type SHA256 struct{} 25 | 26 | // Compute computes the SHA-256 fingerprint of a given data. 27 | func (s SHA256) Compute(data string) string { 28 | hash := sha256.Sum256([]byte(data)) 29 | return hex.EncodeToString(hash[:]) 30 | } 31 | -------------------------------------------------------------------------------- /pkg/fingerprints/murmurhash.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | "fmt" 20 | 21 | "github.com/spaolacci/murmur3" 22 | ) 23 | 24 | // MurmurHash implements the Fingerprint interface for MurmurHash fingerprints. 25 | type MurmurHash struct{} 26 | 27 | // Compute computes the MurmurHash fingerprint of a given data. 
28 | func (m MurmurHash) Compute(data string) string { 29 | return fmt.Sprintf("%x", murmur3.Sum32([]byte(data))) 30 | } 31 | -------------------------------------------------------------------------------- /pkg/fingerprints/ctls.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | "crypto/sha256" 20 | "encoding/hex" 21 | ) 22 | 23 | // CustomTLS implements the Fingerprint interface for custom TLS fingerprints. 24 | type CustomTLS struct{} 25 | 26 | // Compute computes the custom TLS fingerprint of a given data. 27 | func (c CustomTLS) Compute(data string) string { 28 | hash := sha256.Sum256([]byte(data)) 29 | return hex.EncodeToString(hash[:]) 30 | } 31 | -------------------------------------------------------------------------------- /pkg/fingerprints/blake2.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | "encoding/hex" 20 | 21 | "golang.org/x/crypto/blake2b" 22 | ) 23 | 24 | // BLAKE2 implements the Fingerprint interface for BLAKE2 fingerprints. 25 | type BLAKE2 struct{} 26 | 27 | // Compute computes the BLAKE2 fingerprint of a given data. 28 | func (b BLAKE2) Compute(data string) string { 29 | hash := blake2b.Sum256([]byte(data)) 30 | return hex.EncodeToString(hash[:]) 31 | } 32 | -------------------------------------------------------------------------------- /tests/perf/event_create_events.js: -------------------------------------------------------------------------------- 1 | import http from 'k6/http'; 2 | import { check } from 'k6'; 3 | 4 | const test_target = 25000; 5 | 6 | export let options = { 7 | stages: [ 8 | { duration: '10s', target: test_target / 10 }, 9 | { duration: '30s', target: test_target }, 10 | { duration: '10s', target: 0 }, 11 | ], 12 | rps: test_target, 13 | }; 14 | 15 | export default function () { 16 | const uniqueId = `${__VU}-${__ITER}-${Math.random().toString(36).substring(2, 10)}`; 17 | const now = new Date().toISOString(); 18 | 19 | const payload = JSON.stringify({ 20 | source_id: 0, 21 | event_type: "test_event", 22 | event_severity: "low", 23 | timestamp: now, 24 | details: { 25 | mode: "Test", 26 | ts: now, 27 | unique_id: uniqueId 28 | } 29 | }); 30 | 31 | const headers = { 32 | 'Content-Type': 'application/json', 33 | 'User-Agent': 
'k6-Events-Test' 34 | }; 35 | 36 | const res = http.post('http://localhost:8082/v1/event/create', payload, { headers, timeout: '10s' }); 37 | 38 | check(res, { 39 | 'is status 201': (r) => r.status === 201, 40 | 'has body': (r) => r.body && r.body.length > 0 41 | }); 42 | } 43 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: / 5 | schedule: 6 | interval: daily 7 | target: 8 | branch: "develop" 9 | 10 | - package-ecosystem: gomod 11 | directory: / 12 | schedule: 13 | interval: daily 14 | target: 15 | branch: "develop" 16 | 17 | - package-ecosystem: docker 18 | directory: /selenium-patches 19 | schedule: 20 | interval: daily 21 | 22 | - package-ecosystem: docker 23 | directory: /selenium-patches/4.18.1 24 | schedule: 25 | interval: daily 26 | 27 | - package-ecosystem: docker 28 | directory: /selenium-patches/4.19.1 29 | schedule: 30 | interval: daily 31 | 32 | - package-ecosystem: docker 33 | directory: /selenium-patches/4.20.0 34 | schedule: 35 | interval: daily 36 | 37 | - package-ecosystem: docker 38 | directory: /selenium-patches/4.21.0 39 | schedule: 40 | interval: daily 41 | 42 | - package-ecosystem: docker 43 | directory: /selenium-patches/4.23.1 44 | schedule: 45 | interval: daily 46 | 47 | - package-ecosystem: docker 48 | directory: /selenium-patches/4.24.0 49 | schedule: 50 | interval: daily 51 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # The CROWler 2 | 3 | The CROWler is an open-source, feature-rich web crawler designed with a unique 4 | philosophy at its core: to be as gentle and low-noise as possible. 
In other 5 | words, The CROWler tries to stand out by ensuring minimal impact on the 6 | websites it crawls while maximizing convenience for its users. 7 | 8 | Additionally, the system is equipped with an API, providing a streamlined 9 | interface for data queries. This feature ensures easy integration and 10 | access to indexed data for various applications. 11 | 12 | The CROWler is designed to be micro-services based, so it can be easily 13 | deployed in a containerized environment. 14 | 15 | ## Content 16 | 17 | - [Features](./features.md) 18 | - [General Architecture](./architecture.md) 19 | - [Database Architecture](./database_architecture.md) 20 | - [Rulesets Architecture](./ruleset_architecture.md) 21 | - [Installation](./installation.md) 22 | - [Usage](./usage.md) 23 | - [Configuration](./config.md) 24 | - [Environment Variables](./env_vars.md) 25 | - [What are teh CROWler's "sources"?](./sources.md) 26 | - [Rulesets](./rulesets.md) 27 | - [API](./api.md) 28 | - [Contributing](../CONTRIBUTING.md) 29 | - [Test Policy](./test_policy.md) 30 | - [License](../LICENSE.md) 31 | -------------------------------------------------------------------------------- /pkg/common/encoding.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "crypto/sha256" 5 | "encoding/base64" 6 | "encoding/hex" 7 | "math" 8 | ) 9 | 10 | // Base64Encode encodes a string to base64, this may be required by some 11 | // configurations. 12 | func Base64Encode(data string) string { 13 | return base64.StdEncoding.EncodeToString([]byte(data)) 14 | } 15 | 16 | // Base64Decode decodes a base64 string to a normal string. 
17 | func Base64Decode(data string) (string, error) { 18 | decoded, err := base64.StdEncoding.DecodeString(data) 19 | return string(decoded), err 20 | } 21 | 22 | // CalculateEntropy of a string 23 | func CalculateEntropy(data string) float64 { 24 | frequency := make(map[rune]int) 25 | for _, char := range data { 26 | frequency[char]++ 27 | } 28 | 29 | var entropy float64 30 | length := float64(len(data)) 31 | for _, count := range frequency { 32 | probability := float64(count) / length 33 | entropy -= probability * math.Log2(probability) 34 | } 35 | 36 | return entropy 37 | } 38 | 39 | // GenerateSHA256 generates a SHA256 hash of the input string. 40 | func GenerateSHA256(data string) string { 41 | // Generate SHA-256 hash 42 | hash := sha256.Sum256([]byte(data)) 43 | 44 | // Convert hash to a hexadecimal string 45 | hashString := hex.EncodeToString(hash[:]) 46 | 47 | return hashString 48 | } 49 | -------------------------------------------------------------------------------- /doc/architecture.md: -------------------------------------------------------------------------------- 1 | # TheCROWler Architecture 2 | 3 | The crowler architecture is a typical microservice based architecture. 4 | The system is divided into multiple services, each responsible for a 5 | specific task. The services communicate with each other using REST APIs. 6 | The system is designed to be easily scalable and deployable in a containerized 7 | environment. 8 | 9 | ## Components 10 | 11 | - **Crowler API** - The main API service that provides an interface for data 12 | queries. 13 | - **Crowler Engine** - The engine service is responsible for crawling the sources 14 | and index them. 15 | - **Crowler DB** - The database service that stores the indexed data. 16 | - **Chrome/Selenium** Virtual Desktops are used to simulate real user interactions. 17 | 18 | If you need to scale up the system, you can simply spin up more instances of the 19 | Crowler Engine service and Chrome/Selenium services. 
20 | 21 | The CROWler engine is responsible for fetch the "Sources" URLs provided by the API 22 | or the user, and then using the rulesets, interact with the page, collect data, 23 | store it in the database and detect entities. 24 | 25 | For more info on the ruleset architecture see [Ruleset Architecture](./ruleset_architecture.md). 26 | 27 | ## Architecture diagram 28 | 29 | ![TheCROWler Microservice Architecture](./GeneralArchitecture.jpg) 30 | -------------------------------------------------------------------------------- /pkg/common/interfaces_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestConvertInterfaceMapToStringMap(t *testing.T) { 9 | // Test case 1: map[interface{}]interface{} 10 | input1 := map[interface{}]interface{}{ 11 | "key1": "value1", 12 | "key2": 2, 13 | "key3": []interface{}{"a", "b", "c"}, 14 | } 15 | expected1 := map[string]interface{}{ 16 | "key1": "value1", 17 | "key2": 2, 18 | "key3": []interface{}{"a", "b", "c"}, 19 | } 20 | result1 := ConvertInterfaceMapToStringMap(input1) 21 | if !reflect.DeepEqual(result1, expected1) { 22 | t.Errorf("ConvertInterfaceMapToStringMap() = %v; want %v", result1, expected1) 23 | } 24 | 25 | // Test case 2: []interface{} 26 | input2 := []interface{}{"a", 1, map[interface{}]interface{}{"key": "value"}} 27 | expected2 := []interface{}{"a", 1, map[string]interface{}{"key": "value"}} 28 | result2 := ConvertInterfaceMapToStringMap(input2) 29 | if !reflect.DeepEqual(result2, expected2) { 30 | t.Errorf("ConvertInterfaceMapToStringMap() = %v; want %v", result2, expected2) 31 | } 32 | 33 | // Test case 3: other types 34 | input3 := "test" 35 | expected3 := "test" 36 | result3 := ConvertInterfaceMapToStringMap(input3) 37 | if !reflect.DeepEqual(result3, expected3) { 38 | t.Errorf("ConvertInterfaceMapToStringMap() = %v; want %v", result3, expected3) 39 | } 40 | } 41 | 
-------------------------------------------------------------------------------- /pkg/fingerprints/hassh.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | //nolint:gosec // Disabling G501: Md5 is required for backward compatibility, we do not use it for security purposes 20 | "crypto/md5" 21 | "encoding/hex" 22 | ) 23 | 24 | // HASSH implements the Fingerprint interface for HASSH fingerprints. 25 | type HASSH struct{} 26 | 27 | // Compute computes the HASSH fingerprint of a given data. 28 | func (h HASSH) Compute(data string) string { 29 | //nolint:gosec // Disabling G401: Md5 is required for backward compatibility, we do not use it for security purposes 30 | hash := md5.Sum([]byte(data)) 31 | return hex.EncodeToString(hash[:]) 32 | } 33 | -------------------------------------------------------------------------------- /pkg/common/locks.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package common package is used to store common functions and variables 16 | package common 17 | 18 | import ( 19 | "sync" 20 | "sync/atomic" 21 | ) 22 | 23 | // SafeMutex is a thread-safe mutex that ensures that the lock is only released if it was previously locked. 24 | type SafeMutex struct { 25 | mu sync.Mutex 26 | locked uint32 27 | } 28 | 29 | // Lock acquires the lock and sets the locked state to true. 30 | func (m *SafeMutex) Lock() { 31 | m.mu.Lock() 32 | atomic.StoreUint32(&m.locked, 1) 33 | } 34 | 35 | // Unlock releases the lock only if it was previously locked. 36 | func (m *SafeMutex) Unlock() { 37 | if !atomic.CompareAndSwapUint32(&m.locked, 1, 0) { 38 | return // or panic/log 39 | } 40 | m.mu.Unlock() 41 | } 42 | -------------------------------------------------------------------------------- /pkg/fingerprints/hassh_server.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | //nolint:gosec // Disabling G501: Md5 is required for backward compatibility, we do not use it for security purposes 20 | "crypto/md5" 21 | "encoding/hex" 22 | ) 23 | 24 | // HASSHServer implements the Fingerprint interface for HASSHServer fingerprints. 25 | type HASSHServer struct{} 26 | 27 | // Compute computes the HASSHServer fingerprint of a given data. 28 | func (h HASSHServer) Compute(data string) string { 29 | //nolint:gosec // Disabling G401: Md5 is required for backward compatibility, we do not use it for security purposes 30 | hash := md5.Sum([]byte(data)) 31 | return hex.EncodeToString(hash[:]) 32 | } 33 | -------------------------------------------------------------------------------- /schemas/crowler-event-schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "https://github.com/pzaino/thecrowler/main/schemas/crowler-event-schema.json", 4 | "title": "Event", 5 | "type": "object", 6 | "properties": { 7 | "source_id": { 8 | "type": "integer", 9 | "format": "uint64", 10 | "description": "The unique identifier of the source. 0 for no source" 11 | }, 12 | "event_type": { 13 | "type": "string", 14 | "description": "The type of the event." 15 | }, 16 | "event_severity": { 17 | "type": "string", 18 | "description": "The severity of the event. Generally 'high', 'medium', 'low' but a user can also use custom values" 19 | }, 20 | "event_timestamp": { 21 | "type": "string", 22 | "format": "date-time", 23 | "description": "The user's timestamp of the event. 
The time on the system who has generated the event" 24 | }, 25 | "details": { 26 | "type": "object", 27 | "additionalProperties": true, 28 | "description": "The details of the event. Custom JSON for the user to send to the plugins and/or Agents designed to handle the event" 29 | } 30 | }, 31 | "required": [ 32 | "source_id", 33 | "event_type", 34 | "details" 35 | ], 36 | "additionalProperties": false 37 | } 38 | -------------------------------------------------------------------------------- /services/events/types.go: -------------------------------------------------------------------------------- 1 | // Package main (events) implements the CROWler Events Handler engine. 2 | package main 3 | 4 | import ( 5 | "time" 6 | 7 | cdb "github.com/pzaino/thecrowler/pkg/database" 8 | ) 9 | 10 | // HealthCheck is a struct that holds the health status of the application. 11 | type HealthCheck struct { 12 | Status string `json:"status"` 13 | } 14 | 15 | // ReadyCheck is a struct that holds the readiness status of the application. 16 | type ReadyCheck struct { 17 | Status string `json:"status"` 18 | } 19 | 20 | // PluginResponse is a struct that holds the response from a plugin. 21 | type PluginResponse struct { 22 | Success bool `json:"success"` 23 | Message string `json:"message"` 24 | APIResponse interface{} `json:"apiResponse,omitempty"` // Use `interface{}` to allow flexibility in the API response structure 25 | } 26 | 27 | // HeartbeatState tracks an active heartbeat round. 28 | type HeartbeatState struct { 29 | ParentID string 30 | SentAt time.Time 31 | Timeout time.Duration 32 | Responses map[string]cdb.Event // keyed by agent or source 33 | DoneChan chan struct{} 34 | } 35 | 36 | // HeartbeatReport is the structure of the heartbeat report. 
37 | type HeartbeatReport struct { 38 | ParentID string `json:"parent_id"` 39 | Total int `json:"total"` 40 | Responders []string `json:"responders"` 41 | Raw []cdb.Event `json:"raw_responses"` 42 | } 43 | 44 | var ( 45 | lastDBMaintenance time.Time 46 | ) 47 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a golang project 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go 3 | 4 | name: Go 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | permissions: read-all 13 | 14 | jobs: 15 | 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 20 | 21 | - name: Set up Go 22 | uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 23 | with: 24 | go-version-file: go.mod 25 | 26 | - name: Build 27 | run: go build -v ./... 28 | 29 | - name: Test 30 | run: go test -v ./... 
31 | 32 | - name: Go report card 33 | uses: creekorful/goreportcard-action@1f35ced8cdac2cba28c9a2f2288a16aacfd507f9 # v1.0 34 | 35 | - name: Harden-Runner 36 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 37 | with: 38 | disable-sudo: true 39 | egress-policy: block 40 | allowed-endpoints: > 41 | github.com:443 42 | api.github.com:443 43 | github-releases.githubusercontent.com:443 44 | goreportcard.com:443 45 | objects.githubusercontent.com:443 46 | proxy.golang.org:443 47 | storage.googleapis.com:443 48 | golang.org:443 49 | go.dev:443 50 | -------------------------------------------------------------------------------- /selenium-patches/4.23.1/Dockerfile_Base_ARM64_4.23.1.patch: -------------------------------------------------------------------------------- 1 | --- ./selenium-patches/4.23.1/Dockerfile_Base_orig 2024-09-06 15:33:51 2 | +++ ./selenium-patches/4.23.1/Dockerfile_Base_fixed 2024-09-06 17:47:32 3 | @@ -1,4 +1,4 @@ 4 | -FROM ubuntu:noble-20240605 5 | +FROM arm64v8/ubuntu:jammy-20240808 6 | LABEL authors="Selenium " 7 | 8 | # Arguments to define the version of dependencies to download 9 | @@ -40,13 +40,14 @@ 10 | # Includes minimal runtime used for executing non GUI Java programs 11 | #======================== 12 | RUN if [ "$(dpkg --print-architecture)" = "amd64" ]; then \ 13 | - echo "deb http://archive.ubuntu.com/ubuntu noble main universe\n" > /etc/apt/sources.list \ 14 | - && echo "deb http://archive.ubuntu.com/ubuntu noble-updates main universe\n" >> /etc/apt/sources.list \ 15 | - && echo "deb http://security.ubuntu.com/ubuntu noble-security main universe\n" >> /etc/apt/sources.list ; \ 16 | + echo "deb http://archive.ubuntu.com/ubuntu jammy main universe\n" > /etc/apt/sources.list \ 17 | + && echo "deb http://archive.ubuntu.com/ubuntu jammy-updates main universe\n" >> /etc/apt/sources.list \ 18 | + && echo "deb http://security.ubuntu.com/ubuntu jammy-security main universe\n" >> /etc/apt/sources.list ; \ 19 
| fi \ 20 | && apt-get -qqy update \ 21 | && apt-get upgrade -yq \ 22 | && apt-get -qqy --no-install-recommends install \ 23 | + base-files \ 24 | acl \ 25 | bzip2 \ 26 | ca-certificates \ 27 | -------------------------------------------------------------------------------- /selenium-patches/4.20.0/Dockerfile_Base_ARM64_4.20.0.patch: -------------------------------------------------------------------------------- 1 | --- ./docker-selenium/Base/Dockerfile 2024-09-06 00:44:57 2 | +++ ./docker-selenium/Base/Dockerfile_Base_ARM64_4.20.0 2024-09-06 00:47:59 3 | @@ -1,4 +1,4 @@ 4 | -FROM ubuntu:jammy-20240405 5 | +FROM arm64v8/ubuntu:jammy-20240405 6 | LABEL authors="Selenium " 7 | 8 | # Arguments to define the version of dependencies to download 9 | @@ -17,7 +17,7 @@ 10 | ARG GID=1201 11 | ARG TZ="UTC" 12 | ARG JRE_VERSION=17 13 | -ARG TARGETARCH=amd64 14 | +ARG TARGETARCH=arm64 15 | ARG TARGETVARIANT 16 | 17 | USER root 18 | @@ -36,9 +36,9 @@ 19 | # Miscellaneous packages 20 | # Includes minimal runtime used for executing non GUI Java programs 21 | #======================== 22 | -RUN echo "deb http://archive.ubuntu.com/ubuntu jammy main universe\n" > /etc/apt/sources.list \ 23 | - && echo "deb http://archive.ubuntu.com/ubuntu jammy-updates main universe\n" >> /etc/apt/sources.list \ 24 | - && echo "deb http://security.ubuntu.com/ubuntu jammy-security main universe\n" >> /etc/apt/sources.list \ 25 | +RUN echo "deb http://ports.ubuntu.com/ubuntu-ports jammy main universe\n" > /etc/apt/sources.list \ 26 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-updates main universe\n" >> /etc/apt/sources.list \ 27 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-security main universe\n" >> /etc/apt/sources.list \ 28 | && apt-get -qqy update \ 29 | && apt-get upgrade -yq \ 30 | && apt-get -qqy --no-install-recommends install \ 31 | -------------------------------------------------------------------------------- 
/selenium-patches/4.18.1/Dockerfile_Base_ARM64_4.18.1.patch: -------------------------------------------------------------------------------- 1 | --- Dockerfile 2024-04-29 20:55:19 2 | +++ Dockerfile_patched 2024-04-29 20:54:54 3 | @@ -19,9 +19,9 @@ 4 | #================================================ 5 | # Customize sources for apt-get 6 | #================================================ 7 | -RUN echo "deb http://archive.ubuntu.com/ubuntu jammy main universe\n" > /etc/apt/sources.list \ 8 | - && echo "deb http://archive.ubuntu.com/ubuntu jammy-updates main universe\n" >> /etc/apt/sources.list \ 9 | - && echo "deb http://security.ubuntu.com/ubuntu jammy-security main universe\n" >> /etc/apt/sources.list 10 | +RUN echo "deb http://ports.ubuntu.com/ubuntu-ports jammy main universe\n" > /etc/apt/sources.list \ 11 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-updates main universe\n" >> /etc/apt/sources.list \ 12 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-security main universe\n" >> /etc/apt/sources.list 13 | 14 | # No interactive frontend during docker build 15 | ENV DEBIAN_FRONTEND=noninteractive \ 16 | @@ -48,7 +48,7 @@ 17 | gnupg2 \ 18 | libnss3-tools \ 19 | && rm -rf /var/lib/apt/lists/* /var/cache/apt/* \ 20 | - && sed -i 's/securerandom\.source=file:\/dev\/random/securerandom\.source=file:\/dev\/urandom/' ./usr/lib/jvm/java-11-openjdk-amd64/conf/security/java.security 21 | + && sed -i 's/securerandom\.source=file:\/dev\/random/securerandom\.source=file:\/dev\/urandom/' ./usr/lib/jvm/java-11-openjdk-arm64/conf/security/java.security 22 | 23 | #=================== 24 | # Timezone settings 25 | -------------------------------------------------------------------------------- /selenium-patches/4.19.1/Dockerfile_Base_ARM64_4.18.1.patch: -------------------------------------------------------------------------------- 1 | --- Dockerfile 2024-04-29 20:55:19 2 | +++ Dockerfile_patched 2024-04-29 20:54:54 3 | @@ -19,9 +19,9 @@ 4 | 
#================================================ 5 | # Customize sources for apt-get 6 | #================================================ 7 | -RUN echo "deb http://archive.ubuntu.com/ubuntu jammy main universe\n" > /etc/apt/sources.list \ 8 | - && echo "deb http://archive.ubuntu.com/ubuntu jammy-updates main universe\n" >> /etc/apt/sources.list \ 9 | - && echo "deb http://security.ubuntu.com/ubuntu jammy-security main universe\n" >> /etc/apt/sources.list 10 | +RUN echo "deb http://ports.ubuntu.com/ubuntu-ports jammy main universe\n" > /etc/apt/sources.list \ 11 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-updates main universe\n" >> /etc/apt/sources.list \ 12 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-security main universe\n" >> /etc/apt/sources.list 13 | 14 | # No interactive frontend during docker build 15 | ENV DEBIAN_FRONTEND=noninteractive \ 16 | @@ -48,7 +48,7 @@ 17 | gnupg2 \ 18 | libnss3-tools \ 19 | && rm -rf /var/lib/apt/lists/* /var/cache/apt/* \ 20 | - && sed -i 's/securerandom\.source=file:\/dev\/random/securerandom\.source=file:\/dev\/urandom/' ./usr/lib/jvm/java-11-openjdk-amd64/conf/security/java.security 21 | + && sed -i 's/securerandom\.source=file:\/dev\/random/securerandom\.source=file:\/dev\/urandom/' ./usr/lib/jvm/java-11-openjdk-arm64/conf/security/java.security 22 | 23 | #=================== 24 | # Timezone settings 25 | -------------------------------------------------------------------------------- /selenium-patches/4.19.1/Dockerfile_Base_ARM64_4.19.1.patch: -------------------------------------------------------------------------------- 1 | --- Dockerfile 2024-04-29 20:55:19 2 | +++ Dockerfile_patched 2024-04-29 20:54:54 3 | @@ -19,9 +19,9 @@ 4 | #================================================ 5 | # Customize sources for apt-get 6 | #================================================ 7 | -RUN echo "deb http://archive.ubuntu.com/ubuntu jammy main universe\n" > /etc/apt/sources.list \ 8 | - && echo "deb 
http://archive.ubuntu.com/ubuntu jammy-updates main universe\n" >> /etc/apt/sources.list \ 9 | - && echo "deb http://security.ubuntu.com/ubuntu jammy-security main universe\n" >> /etc/apt/sources.list 10 | +RUN echo "deb http://ports.ubuntu.com/ubuntu-ports jammy main universe\n" > /etc/apt/sources.list \ 11 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-updates main universe\n" >> /etc/apt/sources.list \ 12 | + && echo "deb http://ports.ubuntu.com/ubuntu-ports jammy-security main universe\n" >> /etc/apt/sources.list 13 | 14 | # No interactive frontend during docker build 15 | ENV DEBIAN_FRONTEND=noninteractive \ 16 | @@ -48,7 +48,7 @@ 17 | gnupg2 \ 18 | libnss3-tools \ 19 | && rm -rf /var/lib/apt/lists/* /var/cache/apt/* \ 20 | - && sed -i 's/securerandom\.source=file:\/dev\/random/securerandom\.source=file:\/dev\/urandom/' ./usr/lib/jvm/java-11-openjdk-amd64/conf/security/java.security 21 | + && sed -i 's/securerandom\.source=file:\/dev\/random/securerandom\.source=file:\/dev\/urandom/' ./usr/lib/jvm/java-11-openjdk-arm64/conf/security/java.security 22 | 23 | #=================== 24 | # Timezone settings 25 | -------------------------------------------------------------------------------- /.github/workflows/go-vulncheck.yml: -------------------------------------------------------------------------------- 1 | name: Go-VulnCheck 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | osv-scanner: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Harden Runner 20 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 21 | with: 22 | egress-policy: audit 23 | 24 | - name: Checkout repository 25 | uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 26 | 27 | - name: Set up Go 28 | uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 29 | with: 30 | go-version-file: go.mod 31 | 32 
| - name: Clean Go environment and install dependencies 33 | run: | 34 | go clean -cache -modcache -i -r 35 | go mod download 36 | go mod tidy 37 | 38 | - name: Build the project 39 | run: | 40 | go build ./... 41 | go test ./... 42 | 43 | - name: Install govulncheck 44 | run: go install golang.org/x/vuln/cmd/govulncheck@latest 45 | 46 | - name: Download OSV-Scanner 47 | run: | 48 | wget https://github.com/google/osv-scanner/releases/download/v1.8.1/osv-scanner_linux_amd64 -O osv-scanner 49 | chmod +x osv-scanner 50 | 51 | #- name: Run OSV-Scanner 52 | # run: ./osv-scanner --recursive . 53 | 54 | - name: Run govulncheck 55 | run: govulncheck ./... 56 | -------------------------------------------------------------------------------- /pkg/exprterpreter/types.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package exprterpreter contains the expression interpreter logic. 16 | package exprterpreter 17 | 18 | // Micro-Interpreters for complex parameters 19 | 20 | const maxInterpreterRecursionDepth = 100 21 | 22 | // EncodedCmd is a struct containing the parsed command token and arguments. 
23 | type EncodedCmd struct { 24 | Token int 25 | Args []EncodedCmd 26 | ArgValue string // stores the argument value 27 | } 28 | 29 | const ( 30 | // TokenRandom is the token for the random(x, y) command 31 | TokenRandom = 1 32 | // TokenTime is the token for the time() command 33 | TokenTime = 2 34 | // TokenURL is the token for the URL command (which always represent the current URL) 35 | TokenURL = 3 36 | ) 37 | 38 | // commandTokenMap maps command strings to their respective Token IDs. 39 | var commandTokenMap = map[string]int{ 40 | "random": TokenRandom, 41 | "time": TokenTime, 42 | "url": TokenURL, 43 | } 44 | -------------------------------------------------------------------------------- /pkg/crawler/consts.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Package crawler implements the crawler library for the Crowler 16 | package crawler 17 | 18 | const ( 19 | strPluginCall = "plugin_call" 20 | strRegEx = "regex" 21 | strXPath = "xpath" 22 | strCSS = "css" 23 | strJSON = "json" 24 | strYAML = "yaml" 25 | strText1 = "text" 26 | strText2 = "text_content" 27 | strText3 = "inner_text" 28 | strText4 = "html" 29 | strJSPath = "js_path" 30 | strName = "name" 31 | strLinkText1 = "link_text" 32 | strLinkText2 = "linktext" 33 | strPartialLinkText1 = "partial_link_text" 34 | strPartialLinkText2 = "partiallinktext" 35 | strTagName1 = "tag_name" 36 | strTagName2 = "tagname" 37 | strTagName3 = "tag" 38 | strTagName4 = "element" 39 | strClassName1 = "class_name" 40 | strClassName2 = "classname" 41 | strClassName3 = "class" 42 | ) 43 | -------------------------------------------------------------------------------- /pkg/crawler/test_data/source-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "format_version": "1.0", 3 | "author": "Test Author", 4 | "created_at": "2024-04-10T15:00:00Z", 5 | "description": "This is a test document following the specified schema.", 6 | "source_name": "Example Source", 7 | "crawling_config": { 8 | "site": "https://www.example.com" 9 | }, 10 | "execution_plan": [ 11 | { 12 | "label": "Initial Crawl", 13 | "conditions": { 14 | "url_patterns": [ 15 | "https://www.example.com/{category}" 16 | ] 17 | }, 18 | "rulesets": [ 19 | "defaultRuleset" 20 | ], 21 | "additional_conditions": { 22 | "max_depth": 5, 23 | "limit_per_site": 1000 24 | } 25 | }, 26 | { 27 | "label": "Follow-up Crawl", 28 | "conditions": { 29 | "url_patterns": [ 30 | "https://www.example.com/{category}/{subcategory}" 31 | ] 32 | }, 33 | "rule_groups": [ 34 | "secondaryRulesetGroup" 35 | ], 36 | "additional_conditions": { 37 | "max_depth": 3, 38 | "limit_per_site": 500 39 | } 40 | }, 41 | { 42 | "label": "Final Crawl", 43 | "conditions": { 44 | "url_patterns": [ 45 | 
"https://www.example.com/{category}/{subcategory}/{id}" 46 | ] 47 | }, 48 | "rules": [ 49 | "finalizeRule" 50 | ], 51 | "additional_conditions": { 52 | "max_depth": 1, 53 | "limit_per_site": 100 54 | } 55 | } 56 | ] 57 | } 58 | -------------------------------------------------------------------------------- /pkg/plugin/types.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package plugin provides the plugin functionality for the CROWler. 16 | package plugin 17 | 18 | // JSPlugin struct to hold the JS plugin 19 | type JSPlugin struct { 20 | Name string `json:"name" yaml:"name"` // Name of the plugin 21 | Description string `json:"description" yaml:"description"` // Description of the plugin 22 | Version string `json:"version" yaml:"version"` // Version of the plugin 23 | PType string `json:"type" yaml:"type"` // Type of the plugin 24 | Async bool `json:"async" yaml:"async"` // Is the plugin asynchronous 25 | Script string `json:"script" yaml:"script"` // Script for the plugin 26 | EventType string `json:"event_type" yaml:"event_type"` // Event type for the plugin. Plugins can register to handle an event. 
27 | } 28 | 29 | // JSPluginRegister struct to hold the JS plugins 30 | type JSPluginRegister struct { 31 | Registry map[string]JSPlugin // Registry of JS plugins 32 | Order []string // Order of the plugins in registration order 33 | } 34 | -------------------------------------------------------------------------------- /pkg/fingerprints/ja3.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | //nolint:gosec // Disabling G501: Md5 is required for backward compatibility, we do not use it for security purposes 20 | "crypto/md5" 21 | "encoding/hex" 22 | ) 23 | 24 | // JA3 implements the Fingerprint interface for JA3 fingerprints. 25 | type JA3 struct{} 26 | 27 | // Compute computes the JA3 fingerprint of a given data. 28 | func (j JA3) Compute(data string) string { 29 | //nolint:gosec // Disabling G401: Md5 is required for backward compatibility, we do not use it for security purposes 30 | hash := md5.Sum([]byte(data)) 31 | return hex.EncodeToString(hash[:]) 32 | } 33 | 34 | // JA3S implements the Fingerprint interface for JA3S fingerprints. 35 | type JA3S struct{} 36 | 37 | // Compute computes the JA3S fingerprint of a given data. 
38 | func (j JA3S) Compute(data string) string { 39 | //nolint:gosec // Disabling G401: Md5 is required for backward compatibility, we do not use it for security purposes 40 | hash := md5.Sum([]byte(data)) 41 | return hex.EncodeToString(hash[:]) 42 | } 43 | -------------------------------------------------------------------------------- /pkg/fingerprints/tlsh.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | "crypto/sha256" 20 | "encoding/hex" 21 | ) 22 | 23 | // TLSH implements the Fingerprint interface for TLSH fingerprints. 24 | type TLSH struct { 25 | buckets [256]int 26 | total int 27 | checksum [1]byte 28 | } 29 | 30 | // NewTLSH creates a new TLSH fingerprint. 31 | func NewTLSH() *TLSH { 32 | return &TLSH{} 33 | } 34 | 35 | // Update updates the TLSH fingerprint with new data. 36 | func (t *TLSH) Update(data []byte) { 37 | for _, b := range data { 38 | t.checksum[0] ^= b 39 | t.buckets[b]++ 40 | t.total++ 41 | } 42 | } 43 | 44 | // Finalize finalizes the TLSH fingerprint. 
45 | func (t *TLSH) Finalize() string { 46 | digest := sha256.New() 47 | for _, b := range t.buckets { 48 | digest.Write([]byte{byte(b)}) 49 | } 50 | hash := digest.Sum(nil) 51 | return hex.EncodeToString(hash) 52 | } 53 | 54 | // Compute computes the TLSH fingerprint of a given data. 55 | func (t TLSH) Compute(data string) string { 56 | tlsh := NewTLSH() 57 | tlsh.Update([]byte(data)) 58 | return tlsh.Finalize() 59 | } 60 | -------------------------------------------------------------------------------- /scripts/apply-limits.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | compose_file="docker-compose.yml" 4 | 5 | echo "[*] Scanning $compose_file for service resource limits..." 6 | 7 | # Read the file line by line 8 | current_service="" 9 | container_name="" 10 | cpus="" 11 | mem_limit="" 12 | 13 | apply_limits() { 14 | if [ -n "$current_service" ] && [ -n "$container_name" ] && [ -n "$cpus" ] && [ -n "$mem_limit" ]; then 15 | # Check if container is running 16 | if docker ps --format '{{.Names}}' | grep -q "^$container_name$"; then 17 | echo "→ Updating container: $container_name (service: $current_service) with CPU: $cpus, Memory: $mem_limit" 18 | docker update --cpus="$cpus" --memory="$mem_limit" --memory-swap="$mem_limit" "$container_name" 19 | else 20 | echo "⚠️ Container '$container_name' (service: $current_service) not running. Skipping." 21 | fi 22 | fi 23 | 24 | # Reset vars 25 | container_name="" 26 | cpus="" 27 | mem_limit="" 28 | } 29 | 30 | while IFS= read -r line; do 31 | # Detect a new service 32 | if [[ "$line" =~ ^[[:space:]]{2}([a-zA-Z0-9_-]+):[[:space:]]*$ ]]; then 33 | # Apply limits for the previous service if any 34 | apply_limits 35 | current_service="${BASH_REMATCH[1]}" 36 | fi 37 | 38 | # Extract container_name 39 | if [[ "$line" =~ container_name:[[:space:]]*\"?([^\"]+)\"? 
]]; then 40 | container_name="${BASH_REMATCH[1]}" 41 | fi 42 | 43 | # Extract cpus 44 | if [[ "$line" =~ cpus:[[:space:]]*\"?([0-9.]+)\"? ]]; then 45 | cpus="${BASH_REMATCH[1]}" 46 | fi 47 | 48 | # Extract mem_limit 49 | if [[ "$line" =~ mem_limit:[[:space:]]*\"?([0-9]+[gmGM])\"? ]]; then 50 | mem_limit="${BASH_REMATCH[1]}" 51 | fi 52 | done < "$compose_file" 53 | 54 | # Apply for the last parsed service 55 | apply_limits 56 | 57 | echo "✅ Done." 58 | -------------------------------------------------------------------------------- /doc/installation.md: -------------------------------------------------------------------------------- 1 | # Installing the CROWler 2 | 3 | ## Prerequisites 4 | 5 | The crowler is designed to run from a Raspberry Pi 3B+ type of hardware all the 6 | way up to a full fledged server and/or cloud computing. 7 | 8 | It has been tested on ARM64 and x86_64 architectures. It should work on ARM32 9 | and x86_32 as well, but it has not been tested. 10 | 11 | It has been tested on various Linux distributions, including OpenSUSE, Ubuntu, 12 | Debian, and Raspbian. It should work on other distributions as well, but it has 13 | not been tested. 14 | 15 | It has also been tested on MacOS. It should work on Windows as well, but it has 16 | not been tested. 17 | 18 | The following software is required: 19 | 20 | - Docker 21 | - Docker Compose 22 | 23 | ## Installation 24 | 25 | The installation is done by cloning the repository and running the 26 | `docker-build.sh` or `docker-rebuild.sh` scripts. 27 | 28 | The `docker-build.sh` script will build the Docker images and start the 29 | containers. The `docker-rebuild.sh` script will clean up everything and then 30 | rebuild the Docker images and start the containers. 31 | 32 | Before you build your images: 33 | 34 | - you MUST create a config.yaml file and place it in the root of the project. 
35 | More details on how to write your config.yaml file can be found in the 36 | [Configuration](./config_yaml.md) section. 37 | - you MUST define some environment variables in your shell. More details on how 38 | to define your environment variables can be found in the 39 | [Environment Variables](./env_vars.md) section. 40 | 41 | That's it! When you've configured your ENV variables and written your config.yaml 42 | , just run: 43 | 44 | ```bash 45 | ./docker-build.sh 46 | ``` 47 | 48 | You should now have a running CROWler instance waiting to receive Sources to 49 | crawl. 50 | 51 | Enjoy! :) 52 | -------------------------------------------------------------------------------- /pkg/ruleset/common_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package ruleset implements the ruleset library for the Crowler and 16 | // the scrapper. 
17 | package ruleset 18 | 19 | import ( 20 | "github.com/google/go-cmp/cmp" 21 | 22 | "testing" 23 | ) 24 | 25 | func TestParseRules(t *testing.T) { 26 | // Create a temporary YAML file for testing 27 | tempFile := "./test-ruleset.yaml" 28 | 29 | // Call the ParseRules function with the temporary file 30 | sites, err := BulkLoadRules(nil, tempFile) 31 | if err != nil { 32 | t.Fatalf("ParseRules returned an error: %v", err) 33 | } 34 | 35 | // Verify the parsed rules 36 | expectedSites := rulesets 37 | if diff := cmp.Diff(expectedSites, sites); diff != "" { 38 | t.Errorf("Parsed rules mismatch (-expected +actual):\n%s", diff) 39 | } 40 | 41 | /* 42 | if !reflect.DeepEqual(sites, expectedSites) { 43 | t.Errorf("Parsed rules do not match expected rules") 44 | } 45 | */ 46 | } 47 | 48 | func TestInitializeLibrary(t *testing.T) { 49 | engine, err := InitializeLibrary("./test-ruleset.yaml") 50 | if err != nil { 51 | t.Fatalf("InitializeLibrary returned an error: %v", err) 52 | } 53 | if engine == nil { 54 | t.Errorf("Expected non-nil engine, got nil") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /pkg/fingerprints/simhash.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | //nolint:gosec // Disabling G501: Md5 is required for backward compatibility, we do not use it for security purposes 20 | "crypto/md5" 21 | "encoding/binary" 22 | "fmt" 23 | "strings" 24 | ) 25 | 26 | // SimHash implements the Fingerprint interface for SimHash fingerprints. 27 | type SimHash struct{} 28 | 29 | // Compute computes the SimHash fingerprint of a given data. 30 | func (s SimHash) Compute(data string) string { 31 | bits := make([]int, 64) 32 | words := strings.Fields(data) 33 | 34 | for _, word := range words { 35 | //nolint:gosec // Disabling G501: Md5 is required for backward compatibility, we do not use it for security purposes 36 | hash := md5.Sum([]byte(word)) 37 | for i := 0; i < 64; i++ { 38 | bit := (binary.BigEndian.Uint64(hash[:]) >> i) & 1 39 | if bit == 1 { 40 | bits[i]++ 41 | } else { 42 | bits[i]-- 43 | } 44 | } 45 | } 46 | 47 | var fingerprint uint64 48 | for i := 0; i < 64; i++ { 49 | if bits[i] > 0 { 50 | fingerprint |= 1 << i 51 | } 52 | } 53 | 54 | return fmt.Sprintf("%x", fingerprint) 55 | } 56 | -------------------------------------------------------------------------------- /pkg/netinfo/netinfo_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package netinfo provides functionality to extract network information 16 | package netinfo 17 | 18 | import ( 19 | "encoding/json" 20 | "fmt" 21 | "os" 22 | "testing" 23 | 24 | cfg "github.com/pzaino/thecrowler/pkg/config" 25 | ) 26 | 27 | func TestGetNetInfo(t *testing.T) { 28 | if os.Getenv("GITHUB_ACTIONS") == "true" || os.Getenv("CI") == "true" { 29 | t.Skip("Skipping this test in CI or GitHub Actions.") 30 | } 31 | 32 | // Replace "example.com" with the URL you want to test 33 | url := "https://www.example.com/" 34 | 35 | // Create a new NetInfo instance 36 | ni := &NetInfo{} 37 | c := cfg.NewConfig() 38 | ni.Config = &c.NetworkInfo 39 | 40 | // Call GetNetInfo to retrieve network information 41 | err := ni.GetNetInfo(url) 42 | 43 | // Check for errors 44 | if err != nil { 45 | t.Errorf("GetNetInfo(%s) returned an error: %v", url, err) 46 | return 47 | } 48 | 49 | // Print the full NetInfo content for debugging purposes 50 | fmt.Printf("NetInfo for URL %s:\n", url) 51 | jsonData, _ := json.MarshalIndent(ni, "", " ") 52 | fmt.Println(string(jsonData)) 53 | 54 | // Validate the returned NetInfo struct 55 | if ni.URL != url { 56 | t.Errorf("Expected URL in NetInfo to be %s, but got %s", url, ni.URL) 57 | } 58 | 59 | // Add more validation as needed for Hosts, IPs, and WHOIS data 60 | } 61 | -------------------------------------------------------------------------------- /pkg/crawler/test_data/example-ruleset.yaml: -------------------------------------------------------------------------------- 1 | ruleset_name: example.com 2 | format_version: "1.0" 3 | rule_groups: 4 | - group_name: "Group1" 5 | is_enabled: true 6 | scraping_rules: 7 | - rule_name: "Rule1" 8 | path: "/articles" 9 | elements: 10 | - key: "title" 11 | selectors: 12 | - selector_type: "css" 13 | selector: "h1.article-title" 14 | - selector_type: "xpath" 15 | selector: 
"//h1[@class='article-title']" 16 | - key: "content" 17 | selectors: 18 | - selector_type: "css" 19 | selector: "div.article-content" 20 | - key: "date" 21 | selectors: 22 | - selector_type: "css" 23 | selector: "span.date" 24 | js_files: true 25 | technology_patterns: 26 | - "jquery" 27 | - "bootstrap" 28 | 29 | - group_name: "Group2" 30 | valid_from: "2024-01-01T00:00:00Z" 31 | valid_to: "2025-01-01T00:00:00Z" 32 | is_enabled: true 33 | scraping_rules: 34 | - rule_name: "Get the News" 35 | path: "/news" 36 | elements: 37 | - key: "headline" 38 | selectors: 39 | - selector_type: "css" 40 | selector: "h1.headline" 41 | - key: "summary" 42 | selectors: 43 | - selector_type: "css" 44 | selector: "p.summary" 45 | js_files: false 46 | 47 | - group_name: "GDPR" 48 | is_enabled: true 49 | action_rules: 50 | - rule_name: "FindAcceptButton" 51 | action_type: "click" 52 | path: "/cookies" 53 | elements: 54 | - key: "accept_button" 55 | selectors: 56 | - selector_type: "css" 57 | selector: "button.accept" 58 | - key: "reject_button" 59 | selectors: 60 | - selector_type: "css" 61 | selector: "button.reject" 62 | js_files: false 63 | -------------------------------------------------------------------------------- /doc/agents.md: -------------------------------------------------------------------------------- 1 | # Using Agents with the CROWler 2 | 3 | The CROWler allows you to run multiple agents (either in series or in parallel). 4 | This is useful when you have a large number of agents that you want to run at 5 | the same time. 6 | The CROWler will automatically distribute the agents across multiple cores on 7 | your machine, allowing you to run many agents at once. 8 | 9 | Agents are also useful when a user does not wish to code complex plugins or 10 | wish to leverage AI models to perform tasks such ass data validation, 11 | enrichment, correction, manipulation etc. and combine it with actions. 
12 | 13 | Agents should be defined in YAML files and stored in the `./agents/` path. 14 | 15 | Below an example of such YAML file. 16 | 17 | ## Examples of configuring Agents 18 | 19 | ```yaml 20 | 21 | # Examples of a set of agents configuration file in YAML format: 22 | 23 | jobs: 24 | - name: "Serial Agent 1" 25 | process: "serial" 26 | trigger_type: event 27 | trigger_name: "event_name" 28 | steps: 29 | - action: "APIRequest" 30 | params: 31 | config: 32 | url: "http://example.com/api/data" 33 | - action: "AIInteraction" 34 | params: 35 | prompt: "Summarize the following data: $response" 36 | config: 37 | url: "https://api.openai.com/v1/completions" 38 | api_key: "your_api_key" 39 | 40 | - name: "Parallel Agent 1" 41 | process: "parallel" 42 | trigger_type: event 43 | trigger_name: "event_name" 44 | steps: 45 | - action: "DBQuery" 46 | params: 47 | query: "INSERT INTO logs (message) VALUES ('Parallel job 1')" 48 | - action: "RunCommand" 49 | params: 50 | command: "echo 'Parallel action $response.status'" 51 | 52 | - name: "Serial Agent 2" 53 | process: "serial" 54 | trigger_type: agent 55 | trigger_name: "agent_name" 56 | steps: 57 | - action: "PluginExecution" 58 | params: 59 | plugin_name: "example_plugin" 60 | 61 | ``` 62 | -------------------------------------------------------------------------------- /rules/AcceptCookies-ruleset.json: -------------------------------------------------------------------------------- 1 | { 2 | "format_version": "1.0", 3 | "author": "Paolo Fabio Zaino", 4 | "created_at": "2024-03-01T12:00:00Z", 5 | "description": "Automatically accepts website cookie policies across different languages, with XPath and regex examples.", 6 | "ruleset_name": "CookiePolicyAcceptanceMultilingual", 7 | "rule_groups": [ 8 | { 9 | "group_name": "CookieAcceptanceRulesExtended", 10 | "is_enabled": true, 11 | "action_rules": [ 12 | { 13 | "rule_name": "ClickAcceptCookiesButton", 14 | "action_type": "click", 15 | "selectors": [ 16 | { 17 | 
"selector_type": "css", 18 | "selector": ".cookie-accept, .accept-cookies, #accept-cookies, #akzeptieren, #aceptar-cookies, #accepter, #accetta, button[name='accept_cookies'], button[class*='cookieAccept'], a[role='button'][href*='acceptCookies'], div[class*='cookie'][id*='accept'], div[id*='cookie'][class*='ok'], div[class*='cookie'][role*='button']" 19 | }, 20 | { 21 | "selector_type": "xpath", 22 | "selector": "//button[contains(text(), 'Accept') or contains(text(), 'Akzeptieren') or contains(text(), 'Aceptar') or contains(text(), 'Accepter') or contains(text(), 'Accetta')]" 23 | }, 24 | { 25 | "selector_type": "class_name", 26 | "selector": "disclaimerOK" 27 | }, 28 | { 29 | "selector_type": "id", 30 | "selector": "cookieOK" 31 | }, 32 | { 33 | "selector_type": "element", 34 | "selector": "div", 35 | "value": "{{accept}}" 36 | } 37 | ], 38 | "wait_conditions": [ 39 | { 40 | "condition_type": "delay", 41 | "value": "2" 42 | } 43 | ], 44 | "error_handling": { 45 | "retry_count": 3, 46 | "retry_delay": 5 47 | } 48 | } 49 | ] 50 | } 51 | ] 52 | } 53 | -------------------------------------------------------------------------------- /pkg/common/url.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package common provides common utilities and functions used across the application. 
16 | package common 17 | 18 | import "strings" 19 | 20 | // NormalizeURL normalizes a URL by trimming trailing slashes and converting it to lowercase. 21 | func NormalizeURL(url string) string { 22 | // Trim spaces 23 | url = strings.TrimSpace(url) 24 | // Trim trailing slash 25 | url = strings.TrimRight(url, "/") 26 | // Convert to lowercase 27 | url = strings.ToLower(url) 28 | return url 29 | } 30 | 31 | // IsURLValid checks if a URL is valid. 32 | func IsURLValid(url string) bool { 33 | // Check if the URL is empty 34 | if url == "" { 35 | return false 36 | } 37 | tURL := strings.ToLower(strings.TrimSpace(url)) 38 | 39 | // Check if the URL starts with http:// or https:// 40 | if !strings.HasPrefix(tURL, "http://") && 41 | !strings.HasPrefix(tURL, "https://") && 42 | !strings.HasPrefix(tURL, "ws://") && 43 | !strings.HasPrefix(tURL, "wss://") && 44 | !strings.HasPrefix(tURL, "ftp://") && 45 | !strings.HasPrefix(tURL, "ftps://") { 46 | return false 47 | } 48 | 49 | // Check if the URL has a valid domain 50 | if strings.Contains(tURL, " ") || strings.Contains(tURL, "\n") || strings.Contains(tURL, "\t") { 51 | return false 52 | } 53 | 54 | // Check if the URL has a valid TLD 55 | if !strings.Contains(tURL, ".") { 56 | return false 57 | } 58 | 59 | // Looks like a valid URL 60 | return true 61 | } 62 | -------------------------------------------------------------------------------- /doc/test_policy.md: -------------------------------------------------------------------------------- 1 | # Test Policy for TheCROWler 2 | 3 | ## Introduction 4 | 5 | This document outlines the testing policy for "TheCROWler" project. The 6 | objective is to maintain high code quality, functionality, and reliability 7 | of the application. 8 | 9 | ## Testing Tools 10 | 11 | - Go Test: Primary tool for running tests. 12 | - Selenium WebDriver: For browser-based tests. 13 | 14 | ## Test Types 15 | 16 | - Unit Testing: To test individual components in isolation. 
17 | - Integration Testing: To ensure modules work together as expected. 18 | - Functional Testing: To verify the software performs its intended 19 | functions. 20 | - Regression Testing: To confirm that a recent program change has not 21 | adversely affected existing features. 22 | - Browser Compatibility Testing: Using Selenium WebDriver to ensure 23 | compatibility across different web browsers. 24 | 25 | ## Test Coverage 26 | 27 | Strive for >80% test coverage. 28 | Include both positive and negative test scenarios. 29 | 30 | ## Test Data 31 | 32 | - Use a combination of real and synthetic data. 33 | - Ensure data privacy and compliance with relevant regulations. 34 | 35 | ## Code Review and Merge Policy 36 | 37 | - All new code must include relevant tests. 38 | - Pull requests must pass all tests before merging. 39 | - Regular code reviews to ensure adherence to testing standards. 40 | 41 | ## Continuous Integration 42 | 43 | - Integrate with a CI tool (e.g., GitHub Actions) for automated testing. 44 | - Tests should run on every commit to the main branch and all pull requests. 45 | 46 | ## Reporting and Documentation 47 | 48 | Document all tests and update regularly. 49 | Use tools for clear reporting of test results. 50 | Track bugs and fixes in a dedicated system (e.g., GitHub Issues). 51 | 52 | ## Responsibility 53 | 54 | All contributors are responsible for writing and maintaining tests for their 55 | code. Project maintainers will oversee adherence to this policy. 56 | 57 | ## Policy Review 58 | 59 | This policy will be reviewed and updated regularly to adapt to project needs 60 | and technological advancements. 
61 | -------------------------------------------------------------------------------- /.github/workflows/120_fix_latest_multiarch.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | name: "Fix Latest Multi-Arch Tags" 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | release_tag: 9 | description: "Tag to rebuild 'latest' from (e.g., v1.5.0)" 10 | required: true 11 | dry_run: 12 | description: "Run without pushing changes" 13 | default: "false" 14 | required: false 15 | 16 | permissions: 17 | contents: read 18 | 19 | env: 20 | REGISTRY: docker.io 21 | IMAGE_OWNER: zfpsystems 22 | 23 | jobs: 24 | retag-latest: 25 | runs-on: ubuntu-latest 26 | 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | service: 31 | - crowler-api 32 | - crowler-events 33 | - crowler-engine 34 | - crowler-db 35 | 36 | steps: 37 | - name: Login to Docker Hub 38 | if: ${{ github.event.inputs.dry_run != 'true' }} 39 | uses: docker/login-action@v3 40 | with: 41 | username: ${{ secrets.DOCKER_USERNAME }} 42 | password: ${{ secrets.DOCKER_PAT }} 43 | 44 | - name: Set up Buildx 45 | uses: docker/setup-buildx-action@v3 46 | 47 | - name: Update latest multi-arch manifest 48 | run: | 49 | NAME="${{ matrix.service }}" 50 | TAG="${{ github.event.inputs.release_tag }}" 51 | 52 | echo "Recreating multi-arch manifest for $NAME:latest from $TAG" 53 | 54 | if [ "${{ github.event.inputs.dry_run }}" = "true" ]; then 55 | echo "[DRY RUN] Would run imagetools create for $REGISTRY/$IMAGE_OWNER/$NAME:$TAG" 56 | else 57 | docker buildx imagetools create \ 58 | --tag $REGISTRY/$IMAGE_OWNER/$NAME:latest \ 59 | $REGISTRY/$IMAGE_OWNER/$NAME:$TAG 60 | fi 61 | 62 | - name: Logout 63 | if: ${{ github.event.inputs.dry_run != 'true' }} 64 | run: docker logout 65 | -------------------------------------------------------------------------------- /pkg/crawler/test_data/test-ruleset.yaml: -------------------------------------------------------------------------------- 1 | --- 2 
| ruleset_name: "Example Items Extraction Ruleset" 3 | format_version: "1.0" 4 | rule_groups: 5 | - group_name: "Group1" 6 | valid_from: "2021-01-01T00:00:00Z" 7 | valid_to: "2029-12-31T00:00:00Z" 8 | is_enabled: true 9 | scraping_rules: 10 | - rule_name: "Articles" 11 | path: "/articles" 12 | elements: 13 | - key: "title" 14 | selectors: 15 | - selector_type: "css" 16 | selector: "h1.article-title" 17 | - selector_type: "xpath" 18 | selector: "//h1[@class='article-title']" 19 | - key: "content" 20 | selectors: 21 | - selector_type: "css" 22 | selector: "div.article-content" 23 | - key: "date" 24 | selectors: 25 | - selector_type: "css" 26 | selector: "span.date" 27 | js_files: true 28 | technology_patterns: 29 | - "jquery" 30 | - "bootstrap" 31 | 32 | - group_name: "Group2" 33 | valid_from: "2021-01-01T00:00:00Z" 34 | valid_to: "2021-12-31T00:00:00Z" 35 | is_enabled: false 36 | scraping_rules: 37 | - rule_name: "News" 38 | path: "/news" 39 | elements: 40 | - key: "headline" 41 | selectors: 42 | - selector_type: "css" 43 | selector: "h1.headline" 44 | - key: "summary" 45 | selectors: 46 | - selector_type: "css" 47 | selector: "p.summary" 48 | js_files: false 49 | 50 | - group_name: "GroupA" 51 | valid_from: "2021-01-01T00:00:00Z" 52 | valid_to: "2023-12-31T00:00:00Z" 53 | is_enabled: true 54 | scraping_rules: 55 | - rule_name: "Products" 56 | path: "/products" 57 | elements: 58 | - key: "name" 59 | selectors: 60 | - selector_type: "css" 61 | selector: "div.product-name" 62 | - key: "price" 63 | selectors: 64 | - selector_type: "css" 65 | selector: "span.price" 66 | -------------------------------------------------------------------------------- /selenium-patches/4.27.0/Makefile-fixed.patch: -------------------------------------------------------------------------------- 1 | --- Makefile 2024-12-22 16:18:14 2 | +++ Makefile_crowler 2024-12-22 16:19:18 3 | @@ -12,7 +12,7 @@ 4 | NAMESPACE := $(or $(NAMESPACE),$(NAMESPACE),$(NAME)) 5 | AUTHORS := $(or 
$(AUTHORS),$(AUTHORS),SeleniumHQ) 6 | PUSH_IMAGE := $(or $(PUSH_IMAGE),$(PUSH_IMAGE),false) 7 | -FROM_IMAGE_ARGS := --build-arg NAMESPACE=$(NAMESPACE) --build-arg VERSION=$(TAG_VERSION) --build-arg AUTHORS=$(AUTHORS) --sbom=true --attest type=provenance,mode=max 8 | +FROM_IMAGE_ARGS := --build-arg NAMESPACE=$(NAMESPACE) --build-arg VERSION=$(TAG_VERSION) --build-arg AUTHORS=$(AUTHORS) 9 | BUILD_ARGS := $(BUILD_ARGS) --progress plain 10 | MAJOR := $(word 1,$(subst ., ,$(TAG_VERSION))) 11 | MINOR := $(word 2,$(subst ., ,$(TAG_VERSION))) 12 | @@ -120,7 +120,7 @@ 13 | 14 | base: prepare_resources gen_certs 15 | cd ./Base && SEL_PASSWD=$(SEL_PASSWD) docker buildx build --platform $(PLATFORMS) $(BUILD_ARGS) --build-arg VERSION=$(BASE_VERSION) --build-arg RELEASE=$(BASE_RELEASE) --build-arg AUTHORS=$(AUTHORS) \ 16 | - --secret id=SEL_PASSWD --sbom=true --attest type=provenance,mode=max -t $(NAME)/base:$(TAG_VERSION) . 17 | + --secret id=SEL_PASSWD -t $(NAME)/base:$(TAG_VERSION) . 18 | 19 | base_nightly: 20 | BASE_VERSION=$(BASE_VERSION_NIGHTLY) BASE_RELEASE=$(BASE_RELEASE_NIGHTLY) make base 21 | @@ -245,7 +245,7 @@ 22 | cd ./Standalone && docker buildx build --platform $(PLATFORMS) $(BUILD_ARGS) --build-arg NAMESPACE=$(NAME) --build-arg VERSION=beta --build-arg BASE=node-edge -t $(NAME)/standalone-edge:beta . 23 | 24 | video: 25 | - cd ./Video && SEL_PASSWD=$(SEL_PASSWD) docker buildx build --platform $(PLATFORMS) $(BUILD_ARGS) --build-arg NAMESPACE=$(FFMPEG_BASED_NAME) --build-arg BASED_TAG=$(FFMPEG_BASED_TAG) --secret id=SEL_PASSWD --sbom=true --attest type=provenance,mode=max -t $(NAME)/video:$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) . 26 | + cd ./Video && SEL_PASSWD=$(SEL_PASSWD) docker buildx build --platform $(PLATFORMS) $(BUILD_ARGS) --build-arg NAMESPACE=$(FFMPEG_BASED_NAME) --build-arg BASED_TAG=$(FFMPEG_BASED_TAG) --secret id=SEL_PASSWD -t $(NAME)/video:$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) . 
//go:build go1.22

package ruleset

import (
	"os"
	"testing"

	"github.com/qri-io/jsonschema"
	"gopkg.in/yaml.v2"
)

// FuzzParseRuleset is a fuzz test for the parseRuleset function.
// It writes the fuzzed bytes to a temporary YAML file, loads the project's
// ruleset JSON schema from ../../schemas/, checks that the bytes are at least
// well-formed YAML, and then feeds them to parseRuleset. Inputs that are not
// valid YAML, or that parseRuleset rejects, are Skip()ed rather than failed:
// the target hunts for panics/crashes only, not for validation errors.
// NOTE(review): the test assumes it runs from this package directory so the
// relative schema path resolves — confirm when moving it.
func FuzzParseRuleset(f *testing.F) {
	// Add initial seed inputs
	f.Add([]byte(`format_version: "1.0"
author: "test"
created_at: "2023-01-01T00:00:00Z"
description: "Test ruleset"
ruleset_name: "TestRuleset"
rule_groups: []
`))

	// Adding interesting cases found by the fuzzer
	f.Add([]byte(`0`)) // Example of an interesting case that caused an error in the past

	f.Fuzz(func(t *testing.T, data []byte) {
		// Create a temporary file to hold the fuzzed ruleset
		tmpFile, err := os.CreateTemp("", "ruleset-*.yaml")
		if err != nil {
			t.Fatalf("failed to create temp file: %v", err)
		}
		defer os.Remove(tmpFile.Name())

		if _, err := tmpFile.Write(data); err != nil {
			t.Fatalf("failed to write to temp file: %v", err)
		}
		if err := tmpFile.Close(); err != nil {
			t.Fatalf("failed to close temp file: %v", err)
		}

		// Load the schema file
		schemaFile := "../../schemas/crowler-ruleset-schema.json"
		schemaData, err := os.ReadFile(schemaFile)
		if err != nil {
			t.Fatalf("failed to read schema file: %v", err)
		}

		// Unmarshal the schema file
		rs := &jsonschema.Schema{}
		if err := rs.UnmarshalJSON(schemaData); err != nil {
			t.Fatalf("failed to unmarshal schema: %v", err)
		}

		// Read the ruleset file
		rulesFile, err := os.ReadFile(tmpFile.Name())
		if err != nil {
			t.Fatalf("failed to read temp file: %v", err)
		}

		// Unmarshal the YAML content to check its validity.
		// Non-YAML inputs are skipped: they are uninteresting for this target.
		var yamlDoc map[string]interface{}
		if err := yaml.Unmarshal(rulesFile, &yamlDoc); err != nil {
			t.Skipf("failed to unmarshal rules file: %v", err)
		}

		// Call parseRuleset with the fuzzed data; validation failures are
		// expected for arbitrary inputs, so an error is a Skip, not a Fail.
		_, err = parseRuleset(rs, &rulesFile, "yaml")
		if err != nil {
			t.Skip()
		}
	})
}
31 | 32 | **DOCKER_SELENIUM_IMAGE** 33 | (default value: selenium/standalone-chromium:4.27.0-20241223) - This is for the 34 | Selenium version to use in the VDI. Current version is 4.27.0 and the date is 35 | the date you'll build the VDI image expressed as `yyyymmdd` (y = year, m = 36 | month number, d = day number). 37 | 38 | **DOCKER_DEFAULT_PLATFORM** (default value: linux/amd64) - The platform to use 39 | to build the CROWler Docker images. This is useful if you are building the 40 | CROWler on an architecture that is not `x86_64`. 41 | 42 | For example: 43 | 44 | ```bash 45 | DOCKER_DB_HOST='crowler-db' 46 | DOCKER_POSTGRES_PASSWORD='your_postgres_password' 47 | DOCKER_CROWLER_DB_USER='crowler' 48 | DOCKER_CROWLER_DB_PASSWORD='your_crowler_password' 49 | 50 | DOCKER_SELENIUM_IMAGE="selenium/standalone-chromium:4.27.0-20241223" 51 | 52 | DOCKER_DEFAULT_PLATFORM="linux/arm64" 53 | ``` 54 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v6.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | exclude: '\.patch$' 7 | - id: end-of-file-fixer 8 | exclude: '\.patch$' 9 | - id: check-yaml 10 | exclude: '\.patch$' 11 | - id: check-added-large-files 12 | # args: ['--maxkb=2048'] 13 | - repo: https://github.com/dnephin/pre-commit-golang 14 | rev: v0.5.1 15 | hooks: 16 | - id: go-fmt 17 | # - id: go-imports 18 | - id: no-go-testing 19 | # - id: golangci-lint 20 | # args: [ "--config", ".golangci.yml" ] 21 | - id: go-unit-tests 22 | - repo: local 23 | hooks: 24 | - id: go-test-coverage 25 | name: Go Test Coverage 26 | description: Checks for go tests coverage 27 | entry: bash -c 'echo >/dev/tty; go test ./... 
-cover &>/dev/tty; echo >/dev/tty;' 28 | language: system 29 | always_run: true 30 | pass_filenames: false 31 | - id: osv-scanner 32 | name: osv-scanner 33 | description: Vulnerability scanner written in Go which uses the data provided by https://osv.dev 34 | entry: bash -c 'echo >/dev/tty;$GOPATH/bin/osv-scanner scan ./* &>/dev/tty; echo >/dev/tty;' 35 | always_run: true 36 | pass_filenames: false 37 | language: golang 38 | - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook 39 | rev: v9.22.0 40 | hooks: 41 | - id: commitlint 42 | stages: [commit-msg] 43 | additional_dependencies: ['@commitlint/config-conventional'] 44 | - repo: https://github.com/gitleaks/gitleaks 45 | rev: v8.28.0 46 | hooks: 47 | - id: gitleaks 48 | #- repo: https://github.com/pre-commit/mirrors-eslint 49 | # rev: v9.10.0 50 | # hooks: 51 | # - id: eslint 52 | - repo: https://github.com/jumanjihouse/pre-commit-hooks 53 | rev: 3.0.0 54 | hooks: 55 | - id: shellcheck 56 | #- repo: https://github.com/golangci/golangci-lint 57 | # rev: v2.4.0 58 | # hooks: 59 | # - id: golangci-lint 60 | #- repo: https://github.com/pre-commit/mirrors-eslint 61 | # rev: v9.33.0 62 | # hooks: 63 | # - id: eslint 64 | -------------------------------------------------------------------------------- /schemas/crowler-source-categories-schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "https://github.com/pzaino/thecrowler/main/schemas/crowler-source-categories-schema.json", 4 | "title": "CategoriesSchema", 5 | "type": "object", 6 | "description": "Schema to define categories and sub-categories for bulk uploading into The CROWler", 7 | "properties": { 8 | "categories": { 9 | "type": "array", 10 | "description": "List of categories", 11 | "items": { 12 | "type": "object", 13 | "properties": { 14 | "name": { 15 | "type": "string", 16 | "description": "Name of the category" 17 | }, 18 | 
"description": { 19 | "type": "string", 20 | "description": "Description of the category" 21 | }, 22 | "subcategories": { 23 | "type": "array", 24 | "description": "List of sub-categories within this category", 25 | "items": { 26 | "type": "object", 27 | "properties": { 28 | "name": { 29 | "type": "string", 30 | "description": "Name of the sub-category" 31 | }, 32 | "description": { 33 | "type": "string", 34 | "description": "Description of the sub-category" 35 | } 36 | }, 37 | "required": [ 38 | "name" 39 | ], 40 | "additionalProperties": false 41 | } 42 | } 43 | }, 44 | "required": [ 45 | "name" 46 | ], 47 | "additionalProperties": false 48 | } 49 | } 50 | }, 51 | "required": [ 52 | "categories" 53 | ], 54 | "additionalProperties": false 55 | } 56 | -------------------------------------------------------------------------------- /pkg/common/env_templates.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import "strings" 4 | 5 | const ( 6 | strType = "string" 7 | optReject = "reject" 8 | optAccept = "accept" 9 | optConsent = "consent" 10 | ) 11 | 12 | var ( 13 | // Lists of button texts in different languages for 'Accept' and 'Consent' 14 | acceptTexts = []string{ 15 | "Accept", "Akzeptieren", "Aceptar", "Accettare", "Accetto", "Accepter", "Aceitar", 16 | "Godta", "Aanvaarden", "Zaakceptuj", "Elfogad", "Принять", "同意", 17 | "承認", "수락", // Add more translations as needed 18 | } 19 | consentTexts = []string{ 20 | "Consent", "Zustimmen", "Consentir", "Consentire", "Consento", "Consentement", "Concordar", 21 | "Samtykke", "Toestemmen", "Zgoda", "Hozzájárulás", "Согласие", "同意する", 22 | "同意", "동의", // Add more translations as needed 23 | } 24 | rejectTexts = []string{ 25 | "Reject", "Ablehnen", "Rechazar", "Rifiutare", "Rifiuto", "Refuser", "Rejeitar", 26 | "Avvise", "Weigeren", "Odrzuć", "Elutasít", "Отклонить", "拒绝", 27 | "拒否", "거부", // Add more translations as needed 28 | } 29 | ) 30 | 31 | // 
ProcessEnvTemplate processes an environment variable template 32 | func ProcessEnvTemplate(envVar, CtxID string) (EnvValue, error) { 33 | var rval EnvValue 34 | if strings.Contains(envVar, "${") { 35 | envVar = InterpolateEnvVars(envVar) 36 | } 37 | envVar = strings.TrimSpace(envVar) 38 | if strings.HasPrefix(envVar, "{{") && strings.HasSuffix(envVar, "}}") { 39 | envVar = strings.TrimPrefix(envVar, "{{") 40 | envVar = strings.TrimSuffix(envVar, "}}") 41 | envVar = strings.TrimSpace(envVar) 42 | switch envVar { 43 | case optAccept: 44 | rval.Name = optAccept 45 | rval.Value = strings.Join(acceptTexts, "|") 46 | rval.Type = strType 47 | case optConsent: 48 | rval.Name = optConsent 49 | rval.Value = strings.Join(consentTexts, "|") 50 | rval.Type = strType 51 | case optReject: 52 | rval.Name = optReject 53 | rval.Value = strings.Join(rejectTexts, "|") 54 | rval.Type = strType 55 | default: 56 | rIface, rProperties, err := KVStore.Get(envVar, CtxID) 57 | if err != nil { 58 | rval.Name = envVar 59 | rval.Value = rIface.(string) 60 | rval.Type = rProperties.Type 61 | return rval, err 62 | } 63 | rval.Name = envVar 64 | rval.Type = rProperties.Type 65 | rval.Value = rIface 66 | } 67 | } 68 | return rval, nil 69 | } 70 | -------------------------------------------------------------------------------- /pkg/fingerprints/minhash.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import ( 19 | "fmt" 20 | "hash/fnv" 21 | "math" 22 | ) 23 | 24 | // MinHash implements the Fingerprint interface for MinHash fingerprints. 25 | type MinHash struct { 26 | numHash int 27 | hashes []uint64 28 | } 29 | 30 | // NewMinHash creates a new MinHash fingerprint with the given number of hashes. 31 | func NewMinHash(numHash int) *MinHash { 32 | hashes := make([]uint64, numHash) 33 | for i := range hashes { 34 | hashes[i] = math.MaxUint64 35 | } 36 | return &MinHash{ 37 | numHash: numHash, 38 | hashes: hashes, 39 | } 40 | } 41 | 42 | // hashFunction computes the hash of the given data with the given seed. 43 | func hashFunction(data []byte, seed uint64) uint64 { 44 | h := fnv.New64a() 45 | _, err := h.Write(data) 46 | if err != nil { 47 | _, err = h.Write([]byte{byte(seed)}) 48 | if err != nil { 49 | return 0 50 | } 51 | } 52 | return h.Sum64() 53 | } 54 | 55 | // Push pushes the given data into the MinHash fingerprint. 56 | func (mh *MinHash) Push(data []byte) { 57 | //nolint:gosec // Disabling G115: We are using the hash function to generate a fingerprint 58 | for i := uint64(0); i < uint64(mh.numHash); i++ { 59 | hashValue := hashFunction(data, i) 60 | if hashValue < mh.hashes[i] { 61 | mh.hashes[i] = hashValue 62 | } 63 | } 64 | } 65 | 66 | // Signature returns the MinHash fingerprint signature. 67 | func (mh *MinHash) Signature() []uint64 { 68 | return mh.hashes 69 | } 70 | 71 | // Compute computes the MinHash fingerprint of a given data. 
72 | func (mh MinHash) Compute(data string) string { 73 | mh = *NewMinHash(200) 74 | mh.Push([]byte(data)) 75 | return fmt.Sprintf("%x", mh.Signature()) 76 | } 77 | -------------------------------------------------------------------------------- /scripts/containerd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to find the daemon.json file 4 | find_docker_daemon_json() { 5 | # Look for daemon.json in standard locations or custom paths 6 | possible_locations=("/etc/docker/daemon.json" "/usr/local/docker/daemon.json" "/etc/default/docker/daemon.json" "$HOME/.docker/daemon.json") 7 | 8 | for location in "${possible_locations[@]}"; do 9 | if [ -f "$location" ]; then 10 | echo "$location" 11 | return 12 | fi 13 | done 14 | 15 | # If no file found, return default location to create a new one 16 | echo "/etc/docker/daemon.json" 17 | } 18 | 19 | # Get the daemon.json file location 20 | DOCKER_DAEMON_CONFIG=$(find_docker_daemon_json) 21 | 22 | # Check if daemon.json exists in the found or default location 23 | enabled=0 24 | if [ ! -f "$DOCKER_DAEMON_CONFIG" ]; then 25 | # If the file doesn't exist, create it with containerd enabled 26 | enabled=1 27 | echo "Creating Docker daemon configuration file with containerd enabled at $DOCKER_DAEMON_CONFIG..." 28 | sudo mkdir -p $(dirname "$DOCKER_DAEMON_CONFIG") # Ensure the directory exists 29 | sudo bash -c "cat > $DOCKER_DAEMON_CONFIG" < /dev/null 51 | fi 52 | fi 53 | 54 | # Restart Docker to apply the changes if any 55 | if [ $enabled -eq 1 ]; then 56 | echo "Restarting Docker service..." 57 | sudo systemctl restart docker 58 | fi 59 | 60 | # Verify that Docker restarted successfully 61 | if [ $? -eq 0 ]; then 62 | echo "Docker has been restarted successfully, and containerd is enabled." 63 | else 64 | echo "Failed to restart Docker. Please check the Docker service." 
65 | fi 66 | -------------------------------------------------------------------------------- /pkg/ruleset/test-ruleset.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | ruleset_name: "Example Items Extraction Ruleset" 3 | format_version: "1.0" 4 | rule_groups: 5 | - group_name: "Group1" 6 | valid_from: "2021-01-01T00:00:00Z" 7 | valid_to: "2029-12-31T00:00:00Z" 8 | is_enabled: true 9 | scraping_rules: 10 | - rule_name: "Articles" 11 | pre_conditions: 12 | - path: "/articles" 13 | elements: 14 | - key: "title" 15 | selectors: 16 | - selector_type: "css" 17 | selector: "h1.article-title" 18 | - selector_type: "xpath" 19 | selector: "//h1[@class='article-title']" 20 | - key: "content" 21 | selectors: 22 | - selector_type: "css" 23 | selector: "div.article-content" 24 | - key: "date" 25 | selectors: 26 | - selector_type: "css" 27 | selector: "span.date" 28 | js_files: true 29 | technology_patterns: 30 | - "jquery" 31 | - "bootstrap" 32 | post_processing: 33 | - step_type: "remove" 34 | selector: "div.ads" 35 | - step_type: "replace" 36 | selector: "div.article-content" 37 | replacement: "div.article-content > p" 38 | - step_type: "plugin_call" 39 | value: "RemoveArticleContent" 40 | 41 | - group_name: "Group2" 42 | valid_from: "2021-01-01T00:00:00Z" 43 | valid_to: "2021-12-31T00:00:00Z" 44 | is_enabled: false 45 | scraping_rules: 46 | - rule_name: "News" 47 | pre_conditions: 48 | - path: "/news" 49 | elements: 50 | - key: "headline" 51 | selectors: 52 | - selector_type: "css" 53 | selector: "h1.headline" 54 | - key: "summary" 55 | selectors: 56 | - selector_type: "css" 57 | selector: "p.summary" 58 | js_files: false 59 | 60 | - group_name: "GroupA" 61 | valid_from: "2021-01-01T00:00:00Z" 62 | valid_to: "2023-12-31T00:00:00Z" 63 | is_enabled: true 64 | scraping_rules: 65 | - rule_name: "Products" 66 | pre_conditions: 67 | - url: "https://www.another-example.com" 68 | path: "/products" 69 | elements: 70 | - key: "name" 71 
| selectors: 72 | - selector_type: "css" 73 | selector: "div.product-name" 74 | - key: "price" 75 | selectors: 76 | - selector_type: "css" 77 | selector: "span.price" 78 | -------------------------------------------------------------------------------- /selenium-patches/4.18.1/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG NAMESPACE 2 | ARG VERSION 3 | ARG AUTHORS 4 | ARG BASE 5 | 6 | # Build the extra browser Automation for the CROWler 7 | FROM ${NAMESPACE}/${BASE}:${VERSION} AS builder 8 | 9 | USER root 10 | 11 | #==================================== 12 | # Install go lang to build "Rbee" for the CROWler 13 | #==================================== 14 | # Add the PPA for up-to-date Go versions and install Go 15 | RUN apt-get update \ 16 | && apt-get install -y software-properties-common \ 17 | && apt-get install -y wget \ 18 | && apt-get install -y git \ 19 | && apt-get install -y unzip 20 | RUN sudo add-apt-repository ppa:longsleep/golang-backports \ 21 | && apt-get update \ 22 | && apt-get install -y golang-go 23 | 24 | # Check Go version to ensure correct installation 25 | RUN go version 26 | 27 | # Install dependecies for the Go project 28 | RUN apt install -y libx11-dev libxtst-dev libxext-dev 29 | 30 | # Copy and build your Go project 31 | WORKDIR /src 32 | COPY ./Rbee/cmd ./cmd 33 | COPY ./Rbee/pkg ./pkg 34 | COPY ./Rbee/go.mod . 35 | COPY ./Rbee/go.sum . 36 | COPY ./Rbee/autobuild.sh . 37 | COPY ./Rbee/browserAutomation.conf . 
38 | RUN chmod +x autobuild.sh 39 | 40 | WORKDIR /src 41 | RUN ./autobuild.sh rb 42 | 43 | # Build the Selenium Standalone image and copy the binary 44 | FROM ${NAMESPACE}/${BASE}:${VERSION} 45 | LABEL authors=${AUTHORS} 46 | 47 | USER ${SEL_UID} 48 | 49 | #==================================== 50 | # Scripts to run Selenium Standalone 51 | #==================================== 52 | COPY --chown="${SEL_UID}:${SEL_GID}" start-selenium-standalone.sh /opt/bin/start-selenium-standalone.sh 53 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/bin/rbee /opt/bin/rbee 54 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/browserAutomation.conf /etc/supervisor/conf.d/browserAutomation.conf 55 | 56 | #============================== 57 | # Supervisor configuration file 58 | #============================== 59 | COPY selenium.conf /etc/supervisor/conf.d/ 60 | 61 | # Copying configuration script generator 62 | COPY --chown="${SEL_UID}:${SEL_GID}" generate_config /opt/bin/generate_config 63 | 64 | # In seconds, maps to "--session-request-timeout" 65 | ENV SE_SESSION_REQUEST_TIMEOUT 300 66 | # In seconds, maps to "--session-retry-interval" 67 | ENV SE_SESSION_RETRY_INTERVAL 15 68 | # In seconds, maps to "--healthcheck-interval" 69 | ENV SE_HEALTHCHECK_INTERVAL 120 70 | # Boolean value, maps "--relax-checks" 71 | ENV SE_RELAX_CHECKS true 72 | 73 | EXPOSE 4444 74 | 75 | ENV SE_OTEL_SERVICE_NAME "selenium-standalone" 76 | -------------------------------------------------------------------------------- /selenium-patches/4.19.1/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG NAMESPACE 2 | ARG VERSION 3 | ARG AUTHORS 4 | ARG BASE 5 | 6 | # Build the extra browser Automation for the CROWler 7 | FROM ${NAMESPACE}/${BASE}:${VERSION} AS builder 8 | 9 | USER root 10 | 11 | #==================================== 12 | # Install go lang to build "Rbee" for the CROWler 13 | #==================================== 14 | # Add the PPA 
for up-to-date Go versions and install Go 15 | RUN apt-get update \ 16 | && apt-get install -y software-properties-common \ 17 | && apt-get install -y wget \ 18 | && apt-get install -y git \ 19 | && apt-get install -y unzip 20 | RUN sudo add-apt-repository ppa:longsleep/golang-backports \ 21 | && apt-get update \ 22 | && apt-get install -y golang-go 23 | 24 | # Check Go version to ensure correct installation 25 | RUN go version 26 | 27 | # Install dependecies for the Go project 28 | RUN apt install -y libx11-dev libxtst-dev libxext-dev 29 | 30 | # Copy and build your Go project 31 | WORKDIR /src 32 | COPY ./Rbee/cmd ./cmd 33 | COPY ./Rbee/pkg ./pkg 34 | COPY ./Rbee/go.mod . 35 | COPY ./Rbee/go.sum . 36 | COPY ./Rbee/autobuild.sh . 37 | COPY ./Rbee/browserAutomation.conf . 38 | RUN chmod +x autobuild.sh 39 | 40 | WORKDIR /src 41 | RUN ./autobuild.sh rb 42 | 43 | # Build the Selenium Standalone image and copy the binary 44 | FROM ${NAMESPACE}/${BASE}:${VERSION} 45 | LABEL authors=${AUTHORS} 46 | 47 | USER ${SEL_UID} 48 | 49 | #==================================== 50 | # Scripts to run Selenium Standalone 51 | #==================================== 52 | COPY --chown="${SEL_UID}:${SEL_GID}" start-selenium-standalone.sh /opt/bin/start-selenium-standalone.sh 53 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/bin/rbee /opt/bin/rbee 54 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/browserAutomation.conf /etc/supervisor/conf.d/browserAutomation.conf 55 | 56 | #============================== 57 | # Supervisor configuration file 58 | #============================== 59 | COPY selenium.conf /etc/supervisor/conf.d/ 60 | 61 | # Copying configuration script generator 62 | COPY --chown="${SEL_UID}:${SEL_GID}" generate_config /opt/bin/generate_config 63 | 64 | # In seconds, maps to "--session-request-timeout" 65 | ENV SE_SESSION_REQUEST_TIMEOUT 300 66 | # In seconds, maps to "--session-retry-interval" 67 | ENV SE_SESSION_RETRY_INTERVAL 15 68 | # In seconds, 
maps to "--healthcheck-interval" 69 | ENV SE_HEALTHCHECK_INTERVAL 120 70 | # Boolean value, maps "--relax-checks" 71 | ENV SE_RELAX_CHECKS true 72 | 73 | EXPOSE 4444 74 | 75 | ENV SE_OTEL_SERVICE_NAME "selenium-standalone" 76 | -------------------------------------------------------------------------------- /pkg/database/queries_types.go: -------------------------------------------------------------------------------- 1 | package database 2 | 3 | import "encoding/json" 4 | 5 | // SourceFilter is a struct to filter sources based on URL and/or SourceID. 6 | type SourceFilter struct { 7 | URL string `json:"url,omitempty" yaml:"url,omitempty"` // Optional, used if no SourceID is provided 8 | SourceID int64 `json:"source_id,omitempty" yaml:"source_id,omitempty"` // Optional, used if no URL is provided 9 | } 10 | 11 | // UpdateSourceRequest represents the structure of the update source request 12 | type UpdateSourceRequest struct { 13 | SourceID int64 `json:"source_id,omitempty"` // Optional, used if no URL is provided 14 | URL string `json:"url,omitempty"` // The URL of the source 15 | Status string `json:"status,omitempty"` // The status of the source (e.g., 'completed', 'pending') 16 | Restricted int `json:"restricted,omitempty"` // Restriction level (0-4) 17 | Disabled bool `json:"disabled,omitempty"` // Whether the source is disabled 18 | Flags int `json:"flags,omitempty"` // Bitwise flags for the source 19 | Config json.RawMessage `json:"config,omitempty"` // JSON configuration for the source 20 | Details json.RawMessage `json:"details,omitempty"` // JSON details about the source's internal state 21 | } 22 | 23 | // OwnerRequest represents the structure of the owner request 24 | type OwnerRequest struct { 25 | OwnerID int64 `json:"owner_id"` // The ID of the owner 26 | CreatedAt string `json:"created_at"` // The creation date of the owner 27 | LastUpdatedAt string `json:"last_updated_at"` // The last update date of the owner 28 | UserID int64 `json:"user_id"` // 
The ID of the user 29 | DetailsHash string `json:"details_hash"` // The SHA256 hash of the details 30 | Details json.RawMessage `json:"details"` // The details of the owner 31 | } 32 | 33 | // CategoryRequest represents the structure of the category request 34 | type CategoryRequest struct { 35 | CategoryID int64 `json:"category_id"` // The ID of the category 36 | CreatedAt string `json:"created_at"` // The creation date of the category 37 | LastUpdatedAt string `json:"last_updated_at"` // The last update date of the category 38 | Name string `json:"name"` // The name of the category 39 | Description string `json:"description"` // The description of the category 40 | ParentID int64 `json:"parent_id"` // The ID of the parent category 41 | } 42 | -------------------------------------------------------------------------------- /selenium-patches/4.20.0/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG NAMESPACE=selenium 2 | ARG VERSION 3 | ARG BASE=node-base 4 | ARG AUTHORS=zfpsystems 5 | 6 | # Build the extra browser Automation for the CROWler 7 | FROM ${NAMESPACE}/${BASE}:${VERSION} AS builder 8 | LABEL authors=${AUTHORS} 9 | 10 | USER root 11 | 12 | #==================================== 13 | # Install go lang to build "Rbee" for the CROWler 14 | #==================================== 15 | # Add the PPA for up-to-date Go versions and install Go 16 | RUN apt-get update \ 17 | && apt-get install -y software-properties-common \ 18 | && apt-get install -y wget \ 19 | && apt-get install -y git \ 20 | && apt-get install -y unzip 21 | RUN sudo add-apt-repository ppa:longsleep/golang-backports \ 22 | && apt-get update \ 23 | && apt-get install -y golang-go 24 | 25 | # Check Go version to ensure correct installation 26 | RUN go version 27 | 28 | # Install dependecies for the Go project 29 | RUN apt install -y libx11-dev libxtst-dev libxext-dev 30 | 31 | # Copy and build your Go project 32 | WORKDIR /src 33 | COPY ./Rbee/cmd 
./cmd 34 | COPY ./Rbee/pkg ./pkg 35 | COPY ./Rbee/go.mod . 36 | COPY ./Rbee/go.sum . 37 | COPY ./Rbee/autobuild.sh . 38 | COPY ./Rbee/browserAutomation.conf . 39 | RUN chmod +x autobuild.sh 40 | 41 | WORKDIR /src 42 | RUN ./autobuild.sh rb 43 | 44 | # Build the Selenium Standalone image and copy the binary 45 | FROM ${NAMESPACE}/${BASE}:${VERSION} 46 | LABEL authors=${AUTHORS} 47 | 48 | USER ${SEL_UID} 49 | 50 | #==================================== 51 | # Scripts to run Selenium Standalone 52 | #==================================== 53 | COPY --chown="${SEL_UID}:${SEL_GID}" start-selenium-standalone.sh /opt/bin/start-selenium-standalone.sh 54 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/bin/rbee /opt/bin/rbee 55 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/browserAutomation.conf /etc/supervisor/conf.d/browserAutomation.conf 56 | 57 | #============================== 58 | # Supervisor configuration file 59 | #============================== 60 | COPY selenium.conf /etc/supervisor/conf.d/ 61 | 62 | # Copying configuration script generator 63 | COPY --chown="${SEL_UID}:${SEL_GID}" generate_config /opt/bin/generate_config 64 | 65 | # In seconds, maps to "--session-request-timeout" 66 | ENV SE_SESSION_REQUEST_TIMEOUT 300 67 | # In seconds, maps to "--session-retry-interval" 68 | ENV SE_SESSION_RETRY_INTERVAL 15 69 | # In seconds, maps to "--healthcheck-interval" 70 | ENV SE_HEALTHCHECK_INTERVAL 120 71 | # Boolean value, maps "--relax-checks" 72 | ENV SE_RELAX_CHECKS true 73 | 74 | EXPOSE 4444 75 | 76 | ENV SE_OTEL_SERVICE_NAME "selenium-standalone" 77 | -------------------------------------------------------------------------------- /selenium-patches/4.21.0/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG NAMESPACE=selenium 2 | ARG VERSION 3 | ARG BASE=node-base 4 | ARG AUTHORS=zfpsystems 5 | 6 | # Build the extra browser Automation for the CROWler 7 | FROM ${NAMESPACE}/${BASE}:${VERSION} AS 
builder 8 | LABEL authors=${AUTHORS} 9 | 10 | USER root 11 | 12 | #==================================== 13 | # Install go lang to build "Rbee" for the CROWler 14 | #==================================== 15 | # Add the PPA for up-to-date Go versions and install Go 16 | RUN apt-get update \ 17 | && apt-get install -y software-properties-common \ 18 | && apt-get install -y wget \ 19 | && apt-get install -y git \ 20 | && apt-get install -y unzip 21 | RUN sudo add-apt-repository ppa:longsleep/golang-backports \ 22 | && apt-get update \ 23 | && apt-get install -y golang-go 24 | 25 | # Check Go version to ensure correct installation 26 | RUN go version 27 | 28 | # Install dependecies for the Go project 29 | RUN apt install -y libx11-dev libxtst-dev libxext-dev 30 | 31 | # Copy and build your Go project 32 | WORKDIR /src 33 | COPY ./Rbee/cmd ./cmd 34 | COPY ./Rbee/pkg ./pkg 35 | COPY ./Rbee/go.mod . 36 | COPY ./Rbee/go.sum . 37 | COPY ./Rbee/autobuild.sh . 38 | COPY ./Rbee/browserAutomation.conf . 
39 | RUN chmod +x autobuild.sh 40 | 41 | WORKDIR /src 42 | RUN ./autobuild.sh rb 43 | 44 | # Build the Selenium Standalone image and copy the binary 45 | FROM ${NAMESPACE}/${BASE}:${VERSION} 46 | LABEL authors=${AUTHORS} 47 | 48 | USER ${SEL_UID} 49 | 50 | #==================================== 51 | # Scripts to run Selenium Standalone 52 | #==================================== 53 | COPY --chown="${SEL_UID}:${SEL_GID}" start-selenium-standalone.sh /opt/bin/start-selenium-standalone.sh 54 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/bin/rbee /opt/bin/rbee 55 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/browserAutomation.conf /etc/supervisor/conf.d/browserAutomation.conf 56 | 57 | #============================== 58 | # Supervisor configuration file 59 | #============================== 60 | COPY selenium.conf /etc/supervisor/conf.d/ 61 | 62 | # Copying configuration script generator 63 | COPY --chown="${SEL_UID}:${SEL_GID}" generate_config /opt/bin/generate_config 64 | 65 | # In seconds, maps to "--session-request-timeout" 66 | ENV SE_SESSION_REQUEST_TIMEOUT 300 67 | # In seconds, maps to "--session-retry-interval" 68 | ENV SE_SESSION_RETRY_INTERVAL 15 69 | # In seconds, maps to "--healthcheck-interval" 70 | ENV SE_HEALTHCHECK_INTERVAL 120 71 | # Boolean value, maps "--relax-checks" 72 | ENV SE_RELAX_CHECKS true 73 | 74 | EXPOSE 4444 75 | 76 | ENV SE_OTEL_SERVICE_NAME "selenium-standalone" 77 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | We take the security of our project seriously and appreciate your efforts to responsibly disclose vulnerabilities. If you believe you have found a security vulnerability in the **CROWler** project, please report it by following the steps below. 
6 | 7 | ### What Constitutes a Vulnerability 8 | 9 | A security vulnerability in the **CROWler** project is any issue that can potentially allow an attacker to compromise the integrity, availability, or confidentiality of the data and functionalities provided by the application. Examples include but are not limited to: 10 | 11 | - Buffer Overflows 12 | - Denial of Service (DoS) 13 | - Improper Handling of User Input 14 | - Insecure Use of Cryptographic Algorithms 15 | - Memory Leaks 16 | - Race Conditions 17 | - Unsafe Concurrency Practices 18 | - SQL Injection 19 | - Cross-Site Scripting (XSS) (for plugins, given they are written in JavaScript and can be executed on a browser) 20 | - Cross-Site Request Forgery (CSRF) (for plugins, given they are written in JavaScript and can be executed on a browser) 21 | - Directory Traversal 22 | - Authentication and Authorization Flaws 23 | - Insecure Deserialization 24 | 25 | ### How to Report 26 | 27 | Please report vulnerabilities by opening a private issue on our GitHub repository: 28 | 29 | 1. **GitHub Issue Tracker:** Open a private issue [here](https://github.com/pzaino/thecrowler/issues). Make sure the issue is marked as confidential and contains detailed information about the vulnerability and steps to reproduce it. 30 | 31 | ### Coordinated Vulnerability Disclosure Guidelines 32 | 33 | - **Initial Acknowledgment:** We will acknowledge receipt of your report within 2 business days. 34 | - **Assessment:** We will assess the vulnerability and determine its impact. This process may take up to 5 business days. 35 | - **Mitigation:** If the vulnerability is confirmed, we will work on a mitigation plan and provide an estimated timeline for the fix. This typically takes between 15 and 30 days. 36 | - **Disclosure:** We will notify you when the vulnerability is fixed and coordinate a public disclosure, ensuring you receive credit for the discovery if you wish. 
37 | 38 | ## Security Contacts 39 | 40 | - **GitHub Issue Tracker:** [Report an issue](https://github.com/pzaino/thecrowler/issues) 41 | 42 | ## Supported Versions 43 | 44 | Use this section to verify if the version of **CROWler** you are using is currently supported and eligible for security updates. 45 | 46 | | Version | Supported | 47 | | ------- | ------------------ | 48 | | 1.x.y | :white_check_mark: | 49 | | 0.x.y | :x: | 50 | -------------------------------------------------------------------------------- /selenium-patches/4.24.0/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG NAMESPACE=selenium 2 | ARG VERSION=latest 3 | ARG BASE=node-base 4 | #FROM ${NAMESPACE}/${BASE}:${VERSION} 5 | ARG AUTHORS 6 | #LABEL authors=${AUTHORS} 7 | 8 | # Build the extra browser Automation for the CROWler 9 | FROM ${NAMESPACE}/${BASE}:${VERSION} AS builder 10 | LABEL authors=${AUTHORS} 11 | 12 | USER root 13 | 14 | #==================================== 15 | # Install go lang to build "Rbee" for the CROWler 16 | #==================================== 17 | # Add the PPA for up-to-date Go versions and install Go 18 | RUN apt-get update \ 19 | && apt-get install -y software-properties-common \ 20 | && apt-get install -y wget \ 21 | && apt-get install -y git \ 22 | && apt-get install -y unzip 23 | RUN sudo add-apt-repository ppa:longsleep/golang-backports \ 24 | && apt-get update \ 25 | && apt-get install -y golang-go 26 | 27 | # Check Go version to ensure correct installation 28 | RUN go version 29 | 30 | # Install dependecies for the Go project 31 | RUN apt install -y libx11-dev libxtst-dev libxext-dev 32 | 33 | # Copy and build your Go project 34 | WORKDIR /src 35 | COPY ./Rbee/cmd ./cmd 36 | COPY ./Rbee/pkg ./pkg 37 | COPY ./Rbee/go.mod . 38 | COPY ./Rbee/go.sum . 39 | COPY ./Rbee/autobuild.sh . 40 | COPY ./Rbee/browserAutomation.conf . 
41 | RUN chmod +x autobuild.sh 42 | 43 | WORKDIR /src 44 | RUN ./autobuild.sh rb 45 | 46 | # Build the Selenium Standalone image and copy the binary 47 | FROM ${NAMESPACE}/${BASE}:${VERSION} 48 | LABEL authors=${AUTHORS} 49 | 50 | USER ${SEL_UID} 51 | 52 | #==================================== 53 | # Scripts to run Selenium Standalone 54 | #==================================== 55 | COPY --chown="${SEL_UID}:${SEL_GID}" start-selenium-standalone.sh /opt/bin/start-selenium-standalone.sh 56 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/bin/rbee /opt/bin/rbee 57 | COPY --from=builder --chown="${SEL_UID}:${SEL_GID}" /src/browserAutomation.conf /etc/supervisor/conf.d/browserAutomation.conf 58 | 59 | #============================== 60 | # Supervisor configuration file 61 | #============================== 62 | COPY selenium.conf /etc/supervisor/conf.d/ 63 | 64 | # Copying configuration script generator 65 | COPY --chown="${SEL_UID}:${SEL_GID}" generate_config /opt/bin/generate_config 66 | 67 | # In seconds, maps to "--session-request-timeout" 68 | ENV SE_SESSION_REQUEST_TIMEOUT=300 \ 69 | # In seconds, maps to "--session-retry-interval" 70 | SE_SESSION_RETRY_INTERVAL=15 \ 71 | # In seconds, maps to "--healthcheck-interval" 72 | SE_HEALTHCHECK_INTERVAL=120 \ 73 | # Boolean value, maps "--relax-checks" 74 | SE_RELAX_CHECKS=true \ 75 | SE_REJECT_UNSUPPORTED_CAPS=true \ 76 | SE_OTEL_SERVICE_NAME="selenium-standalone" 77 | 78 | EXPOSE 4444 79 | -------------------------------------------------------------------------------- /docker-rebuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shellcheck disable=SC2124 4 | pars="$@" 5 | 6 | # default settings 7 | preserve_volumes=0 8 | 9 | # Process the arguments in pars 10 | for arg in ${pars}; do 11 | case ${arg} in 12 | --volumes) 13 | preserve_volumes=1 14 | # remove "--volumes" from pars 15 | #pars=$(echo "${pars}" | sed 's/--volumes//') 16 | 
pars=${pars/--volumes/} 17 | ;; 18 | esac 19 | done 20 | 21 | # start cleaning up 22 | echo "Cleaning up..." 23 | 24 | if [ "${preserve_volumes}" -eq 1 ]; then 25 | echo "Preserving volumes" 26 | # Stop and remove containers, networks, and volumes 27 | docker compose down --remove-orphans 28 | echo "Stopping all crowler-* containers..." 29 | docker ps -a --format '{{.ID}} {{.Names}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker stop 30 | 31 | echo "Removing all crowler-* containers..." 32 | docker ps -a --format '{{.ID}} {{.Names}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker rm -f 33 | 34 | echo "Removing all crowler-* networks..." 35 | docker network ls --format '{{.ID}} {{.Name}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker network rm 36 | 37 | echo "Removing all crowler-* volumes..." 38 | docker volume ls --format '{{.Name}}' | awk '$1 ~ /^crowler-/ {print $1}' | xargs -r docker volume rm 39 | 40 | echo "Removing all crowler-* images..." 41 | docker images --format '{{.ID}} {{.Repository}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker rmi -f 42 | 43 | echo "Cleanup complete! All crowler-* resources have been removed." 44 | # Prune dangling images matching the naming pattern "crowler-*" 45 | docker images | grep -E "crowler-|selenium/" | awk '{print $3}' | xargs docker rmi 46 | else 47 | echo "Removing volumes" 48 | # Stop and remove containers, networks, and volumes 49 | docker compose down -v --remove-orphans 50 | # remove crowler network 51 | echo "Stopping all crowler-* containers..." 52 | docker ps -a --format '{{.ID}} {{.Names}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker stop 53 | 54 | echo "Removing all crowler-* containers..." 55 | docker ps -a --format '{{.ID}} {{.Names}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker rm -f 56 | 57 | echo "Removing all crowler-* networks..." 
58 | docker network ls --format '{{.ID}} {{.Name}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker network rm 59 | 60 | echo "Removing all crowler-* images..." 61 | docker images --format '{{.ID}} {{.Repository}}' | awk '$2 ~ /^crowler-/ {print $1}' | xargs -r docker rmi -f 62 | 63 | echo "Cleanup complete! All crowler-* resources have been removed." 64 | fi 65 | 66 | 67 | # Rebuild and start containers 68 | ./docker-build.sh "${pars}" 69 | -------------------------------------------------------------------------------- /pkg/fingerprints/factory.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package fingerprints implements the fingerprints library for the Crowler 16 | package fingerprints 17 | 18 | import "fmt" 19 | 20 | // FingerprintType represents the type of fingerprint algorithm. 21 | type FingerprintType int 22 | 23 | const ( 24 | // TypeJA3 represents the JA3 fingerprint type. 25 | TypeJA3 FingerprintType = iota 26 | // TypeJA3S represents the JA3S fingerprint type. 27 | TypeJA3S 28 | // TypeHASSH represents the HASSH fingerprint type. 29 | TypeHASSH 30 | // TypeHASSHServer represents the HASSHServer fingerprint type. 31 | TypeHASSHServer 32 | // TypeTLSH represents the TLSH fingerprint type. 33 | TypeTLSH 34 | // TypeSimHash represents the SimHash fingerprint type. 
35 | TypeSimHash 36 | // TypeMinHash represents the MinHash fingerprint type. 37 | TypeMinHash 38 | // TypeBLAKE2 represents the BLAKE2 fingerprint type. 39 | TypeBLAKE2 40 | // TypeSHA256 represents the SHA256 fingerprint type. 41 | TypeSHA256 42 | // TypeCityHash represents the CityHash fingerprint type. 43 | TypeCityHash 44 | // TypeMurmurHash represents the MurmurHash fingerprint type. 45 | TypeMurmurHash 46 | // TypeCustomTLS represents the CustomTLS fingerprint type. 47 | TypeCustomTLS 48 | // TypeJARM represents the JARM fingerprint type. 49 | TypeJARM 50 | ) 51 | 52 | // FingerprintFactory creates an instance of a Fingerprint implementation. 53 | func FingerprintFactory(fType FingerprintType) (Fingerprint, error) { 54 | switch fType { 55 | case TypeJA3: 56 | return &JA3{}, nil 57 | case TypeJA3S: 58 | return &JA3S{}, nil 59 | case TypeHASSH: 60 | return &HASSH{}, nil 61 | case TypeHASSHServer: 62 | return &HASSHServer{}, nil 63 | case TypeTLSH: 64 | return &TLSH{}, nil 65 | case TypeSimHash: 66 | return &SimHash{}, nil 67 | case TypeMinHash: 68 | return &MinHash{}, nil 69 | case TypeBLAKE2: 70 | return &BLAKE2{}, nil 71 | case TypeSHA256: 72 | return &SHA256{}, nil 73 | case TypeCityHash: 74 | return &CityHash{}, nil 75 | case TypeMurmurHash: 76 | return &MurmurHash{}, nil 77 | case TypeCustomTLS: 78 | return &CustomTLS{}, nil 79 | case TypeJARM: 80 | return &JARM{}, nil 81 | default: 82 | return nil, fmt.Errorf("unknown fingerprint type") 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /Dockerfile.searchapi: -------------------------------------------------------------------------------- 1 | # Build stage (using golang:1.23.6) 2 | #FROM golang@sha256:f8113c4b13e2a8b3a168dceaee88ac27743cc84e959f43b9dbd2291e9c3f57a0 AS builder 3 | # Build stage (using golang:1.23.8-alpine3.21) 4 | #FROM golang@sha256:b7486658b87d34ecf95125e5b97e8dfe86c21f712aa36fc0c702e5dc41dc63e1 AS builder 5 | # Build stage (using 
golang:1.23.10-alpine3.22) 6 | #FROM golang@sha256:9a425d78a8257fc92d41ad979d38cb54005bac3fdefbdadde868e004eccbb898 AS builder 7 | # Build stage (using golang:1.23.11-alpine3.22) 8 | #FROM golang@sha256:ddcd26ec6b109c838725a1d93e3bec6d8b9c47f1fdc696b58820c63c70349c9a AS builder 9 | # Build stage (using golang:1.23.12-alpine3.22) 10 | #FROM golang@sha256:383395b794dffa5b53012a212365d40c8e37109a626ca30d6151c8348d380b5f AS builder 11 | # Build stage (using golang:1.24.7-alpine3.22) 12 | #FROM golang@sha256:fc2cff6625f3c1c92e6c85938ac5bd09034ad0d4bc2dfb08278020b68540dbb5 AS builder 13 | # Build stage (using golang:1.24.11-alpine-3.22) 14 | FROM golang@sha256:fb828ef85a4c4140fae45f145a84ca9c0a83fd0baa437a301b35b551e91ceed5 AS Builder 15 | 16 | 17 | RUN apk update && apk add ca-certificates && rm -rf /var/cache/apk/* 18 | RUN apk add --no-cache bash 19 | 20 | WORKDIR /app 21 | 22 | COPY ./cmd ./cmd 23 | COPY ./pkg ./pkg 24 | COPY ./services ./services 25 | COPY ./go.mod . 26 | COPY ./go.sum . 27 | COPY ./main.go . 28 | COPY ./config.yaml . 29 | COPY ./schemas/ ./schemas 30 | COPY ./autobuild.sh . 
31 | 32 | # Ensure the script has correct permissions and check its presence 33 | RUN chmod +x autobuild.sh 34 | RUN ls -la 35 | 36 | # Run the build script using shell 37 | RUN bash ./autobuild.sh 38 | 39 | # Run stage (using alpine:3.20) 40 | #FROM alpine@sha256:e1c082e3d3c45cccac829840a25941e679c25d438cc8412c2fa221cf1a824e6a 41 | #FROM alpine@sha256:77726ef6b57ddf65bb551896826ec38bc3e53f75cdde31354fbffb4f25238ebd 42 | # alpine 3.21.3 43 | FROM alpine@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c 44 | 45 | WORKDIR /app 46 | 47 | # Add tzdata 48 | RUN apk add --no-cache tzdata 49 | 50 | # Create a non-root user and switch to it 51 | RUN adduser -D apiuser 52 | 53 | COPY --from=builder /app/bin/api /app/ 54 | COPY --from=builder /app/bin/addSource /app/ 55 | COPY --from=builder /app/bin/removeSource /app/ 56 | COPY --from=builder /app/bin/healthCheck /app/ 57 | COPY --from=builder /app/config.yaml /app/ 58 | COPY --from=builder /app/schemas /app/schemas 59 | 60 | # Ensure the executables have correct permissions 61 | RUN chmod +x api 62 | RUN chmod +x addSource 63 | RUN chmod +x removeSource 64 | RUN chmod +x healthCheck 65 | 66 | # Create the data directory with appropriate permissions 67 | RUN mkdir /app/data 68 | RUN chmod 755 /app/data 69 | RUN chown -R apiuser:apiuser /app 70 | 71 | USER apiuser 72 | 73 | # Expose port 8080 to the outside world 74 | EXPOSE 8080 75 | 76 | # Command to run the executable 77 | WORKDIR /app 78 | CMD ["./api"] 79 | -------------------------------------------------------------------------------- /pkg/database/database.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package database is responsible for handling the database setup, configuration and abstraction. 16 | package database 17 | 18 | import ( 19 | "context" 20 | "database/sql" 21 | "time" 22 | 23 | cfg "github.com/pzaino/thecrowler/pkg/config" 24 | ) 25 | 26 | // ListenerEventType represents the type of event that the listener has received. 27 | type ListenerEventType int 28 | 29 | const ( 30 | // ListenerEventUnknown represents an unknown event. 31 | ListenerEventUnknown ListenerEventType = iota 32 | // ListenerEventConnected represents a notification event. 33 | ListenerEventConnected 34 | // ListenerEventDisconnected represents a disconnected event. 35 | ListenerEventDisconnected 36 | // ListenerEventReconnected represents a reconnected event. 37 | ListenerEventReconnected 38 | ) 39 | 40 | // Handler is the interface that wraps the basic methods 41 | // to interact with the database. 
42 | type Handler interface { 43 | Connect(c cfg.Config) error 44 | Close() error 45 | Ping() error 46 | ExecuteQuery(query string, args ...interface{}) (*sql.Rows, error) 47 | Exec(query string, args ...interface{}) (sql.Result, error) 48 | ExecContext(ctx context.Context, query string, args ...interface{}) (sql.Result, error) 49 | DBMS() string 50 | Begin() (*sql.Tx, error) 51 | BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) 52 | Commit(tx *sql.Tx) error 53 | Rollback(tx *sql.Tx) error 54 | QueryRow(query string, args ...interface{}) *sql.Row 55 | CheckConnection(c cfg.Config) error 56 | NewListener() Listener 57 | } 58 | 59 | // Listener is the interface that wraps the basic methods 60 | // to interact with the database listener. 61 | type Listener interface { 62 | Connect(c cfg.Config, minReconnectInterval, maxReconnectInterval time.Duration, eventCallback func(ev ListenerEventType, err error)) error 63 | ConnectWithDBHandler(dbh *Handler, channel string) error 64 | Ping() error 65 | Close() error 66 | Listen(channel string) error 67 | Notify() <-chan Notification 68 | UnlistenAll() error 69 | } 70 | 71 | // Notification is the interface that wraps the basic methods 72 | // to interact with the database notification. 
73 | type Notification interface { 74 | Channel() string 75 | Extra() string 76 | } 77 | -------------------------------------------------------------------------------- /Dockerfile.events: -------------------------------------------------------------------------------- 1 | # Build stage (using golang:1.23.6) 2 | #FROM golang@sha256:f8113c4b13e2a8b3a168dceaee88ac27743cc84e959f43b9dbd2291e9c3f57a0 AS builder 3 | # Build stage (using golang:1.23.8-alpine3.21) 4 | #FROM golang@sha256:b7486658b87d34ecf95125e5b97e8dfe86c21f712aa36fc0c702e5dc41dc63e1 AS builder 5 | # Build stage (using golang:1.23.10-alpine3.22) 6 | #FROM golang@sha256:9a425d78a8257fc92d41ad979d38cb54005bac3fdefbdadde868e004eccbb898 AS builder 7 | # Build stage (using golang:1.23.11-alpine3.22) 8 | # FROM golang@sha256:ddcd26ec6b109c838725a1d93e3bec6d8b9c47f1fdc696b58820c63c70349c9a AS builder 9 | # Build stage (using golang:1.23.12-alpine3.22) 10 | #FROM golang@sha256:383395b794dffa5b53012a212365d40c8e37109a626ca30d6151c8348d380b5f AS builder 11 | # Build stage (using golang:1.24.7-alpine3.22) 12 | #FROM golang@sha256:fc2cff6625f3c1c92e6c85938ac5bd09034ad0d4bc2dfb08278020b68540dbb5 AS builder 13 | # Build stage (using golang:1.24.11-alpine-3.22) 14 | FROM golang@sha256:fb828ef85a4c4140fae45f145a84ca9c0a83fd0baa437a301b35b551e91ceed5 AS Builder 15 | 16 | RUN apk update && apk add ca-certificates && rm -rf /var/cache/apk/* 17 | RUN apk add --no-cache bash 18 | 19 | WORKDIR /app 20 | 21 | COPY ./cmd ./cmd 22 | COPY ./pkg ./pkg 23 | COPY ./services ./services 24 | COPY ./go.mod . 25 | COPY ./go.sum . 26 | COPY ./main.go . 27 | COPY ./config.yaml . 28 | COPY ./schemas/ ./schemas 29 | COPY ./plugins/ ./plugins 30 | COPY ./agents/ ./agents 31 | COPY ./support/ ./support 32 | COPY ./autobuild.sh . 
33 | 34 | # Ensure the script has correct permissions and check its presence 35 | RUN chmod +x autobuild.sh 36 | RUN ls -la 37 | 38 | # Run the build script using shell 39 | RUN bash ./autobuild.sh 40 | 41 | # Run stage (using alpine:3.20) 42 | #FROM alpine@sha256:e1c082e3d3c45cccac829840a25941e679c25d438cc8412c2fa221cf1a824e6a 43 | #FROM alpine@sha256:77726ef6b57ddf65bb551896826ec38bc3e53f75cdde31354fbffb4f25238ebd 44 | # alpine 3.21.3 45 | FROM alpine@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c 46 | 47 | WORKDIR /app 48 | 49 | # Add tzdata 50 | RUN apk add --no-cache tzdata 51 | 52 | # Create a non-root user and switch to it 53 | RUN adduser -D eventsuser 54 | 55 | COPY --from=builder /app/bin/events /app/ 56 | COPY --from=builder /app/bin/healthCheck /app/ 57 | COPY --from=builder /app/config.yaml /app/ 58 | COPY --from=builder /app/schemas /app/schemas 59 | COPY --from=builder /app/plugins /app/plugins 60 | COPY --from=builder /app/agents /app/agents 61 | COPY --from=builder /app/support /app/support 62 | 63 | # Ensure the executables have correct permissions 64 | RUN chmod +x events 65 | RUN chmod +x healthCheck 66 | 67 | # Create the data directory with appropriate permissions 68 | RUN mkdir /app/data 69 | RUN chmod 755 /app/data 70 | RUN chown -R eventsuser:eventsuser /app 71 | 72 | USER eventsuser 73 | 74 | # Expose port 8080 to the outside world 75 | EXPOSE 8080 76 | 77 | # Command to run the executable 78 | WORKDIR /app 79 | CMD ["./events"] 80 | -------------------------------------------------------------------------------- /pkg/ruleset/actionrule.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package ruleset implements the ruleset library for the Crowler and 16 | // the scrapper. 17 | package ruleset 18 | 19 | import ( 20 | "strings" 21 | ) 22 | 23 | ///// --------------------- ActionRule ------------------------------- ///// 24 | 25 | // GetActionType returns the action type for the specified action rule. 26 | func (r *ActionRule) GetActionType() string { 27 | return strings.ToLower(strings.TrimSpace(r.ActionType)) 28 | } 29 | 30 | // GetRuleName returns the rule name for the specified action rule. 31 | func (r *ActionRule) GetRuleName() string { 32 | return strings.TrimSpace(r.RuleName) 33 | } 34 | 35 | // GetURL returns the URL for the specified action rule. 36 | func (r *ActionRule) GetURL() string { 37 | return strings.TrimSpace(r.URL) 38 | } 39 | 40 | // GetSelectors returns the selectors for the specified action rule. 41 | func (r *ActionRule) GetSelectors() []Selector { 42 | return r.Selectors 43 | } 44 | 45 | // GetValue returns the value for the specified action rule. 46 | func (r *ActionRule) GetValue() string { 47 | return strings.TrimSpace(r.Value) 48 | } 49 | 50 | // GetWaitConditions returns the wait conditions for the specified action rule. 51 | func (r *ActionRule) GetWaitConditions() []WaitCondition { 52 | return r.WaitConditions 53 | } 54 | 55 | // GetConditions returns the conditions for the specified action rule. 
56 | func (r *ActionRule) GetConditions() map[string]interface{} { 57 | return r.Conditions 58 | } 59 | 60 | // GetErrorHandling returns the error handling configuration for the specified action rule. 61 | func (r *ActionRule) GetErrorHandling() ErrorHandling { 62 | return r.ErrorHandling 63 | } 64 | 65 | ///// ------------------------ Selector ---------------------------- ///// 66 | 67 | // GetSelectorType returns the selector type for the specified selector. 68 | func (s *Selector) GetSelectorType() string { 69 | return strings.ToLower(strings.TrimSpace(s.SelectorType)) 70 | } 71 | 72 | // GetSelector returns the selector for the specified selector. 73 | func (s *Selector) GetSelector() string { 74 | return strings.TrimSpace(s.Selector) 75 | } 76 | 77 | // GetAttribute returns the attribute for the specified selector. 78 | func (s *Selector) GetAttribute() (string, string) { 79 | return strings.TrimSpace(s.Attribute.Name), strings.TrimSpace(s.Attribute.Value) 80 | } 81 | -------------------------------------------------------------------------------- /pkg/agent/setup_test.go: -------------------------------------------------------------------------------- 1 | package agent 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestInitialize(t *testing.T) { 8 | // Reset the global variables before each test 9 | AgentsEngine = nil 10 | 11 | // Call the Initialize function (global variable AgentsEngine should be initialized) 12 | Initialize() 13 | 14 | // Check if AgentsEngine is not nil 15 | if AgentsEngine == nil { 16 | t.Errorf("Expected AgentsEngine to be initialized, but it is nil") 17 | } 18 | 19 | // Check if actions are registered 20 | expectedActions := []string{ 21 | "APIRequest", 22 | "CreateEvent", 23 | "RunCommand", 24 | "AIInteraction", 25 | "DBQuery", 26 | "PluginExecution", 27 | "Decision", 28 | } 29 | 30 | for _, action := range expectedActions { 31 | if _, exists := AgentsEngine.actions[action]; !exists { 32 | t.Errorf("Expected action %s to be 
registered, but it is not", action) 33 | } 34 | } 35 | } 36 | 37 | func TestRegisterActions(t *testing.T) { 38 | // Create a new JobEngine instance 39 | engine := NewJobEngine() 40 | 41 | // Call the RegisterActions function 42 | RegisterActions(engine) 43 | 44 | // Check if actions are registered 45 | expectedActions := []string{ 46 | "APIRequest", 47 | "CreateEvent", 48 | "RunCommand", 49 | "AIInteraction", 50 | "DBQuery", 51 | "PluginExecution", 52 | "Decision", 53 | } 54 | 55 | for _, action := range expectedActions { 56 | if _, exists := engine.actions[action]; !exists { 57 | t.Errorf("Expected action %s to be registered, but it is not", action) 58 | } 59 | } 60 | } 61 | 62 | func TestRegisterActionsWithNilEngine(t *testing.T) { 63 | // Call the RegisterActions function with nil engine 64 | RegisterActions(nil) 65 | 66 | // Check if a new engine is created and actions are registered 67 | if AgentsEngine == nil { 68 | t.Errorf("Expected a new JobEngine to be created, but it is nil") 69 | } 70 | 71 | expectedActions := []string{ 72 | "APIRequest", 73 | "CreateEvent", 74 | "RunCommand", 75 | "AIInteraction", 76 | "DBQuery", 77 | "PluginExecution", 78 | "Decision", 79 | } 80 | 81 | for _, action := range expectedActions { 82 | if _, exists := AgentsEngine.GetAction(action); !exists { 83 | t.Errorf("Expected action %s to be registered, but it is not", action) 84 | } 85 | } 86 | } 87 | 88 | func TestNewJobConfig(t *testing.T) { 89 | // Call the NewJobConfig function 90 | jobConfig := NewJobConfig() 91 | 92 | // Check if the returned JobConfig is not nil 93 | if jobConfig == nil { 94 | t.Errorf("Expected JobConfig to be initialized, but it is nil") 95 | } 96 | 97 | // Check if the Jobs slice is initialized and empty 98 | if jobConfig.Jobs != nil { 99 | t.Errorf("Expected Jobs slice to be nil after initialization, but it is not") 100 | } 101 | if len(jobConfig.Jobs) != 0 { 102 | t.Errorf("Expected Jobs slice to be empty, but it has %d elements", len(jobConfig.Jobs)) 
103 | } 104 | } 105 | -------------------------------------------------------------------------------- /pkg/common/network.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package common package is used to store common functions and variables 16 | package common 17 | 18 | import ( 19 | "fmt" 20 | "net" 21 | "os" 22 | ) 23 | 24 | // DetectLocalNetwork finds the local network the machine is connected to 25 | func DetectLocalNetwork() (string, error) { 26 | interfaces, err := net.Interfaces() 27 | if err != nil { 28 | return "", fmt.Errorf("failed to get network interfaces: %w", err) 29 | } 30 | 31 | for _, iface := range interfaces { 32 | // Skip interfaces that are down 33 | if iface.Flags&net.FlagUp == 0 { 34 | continue 35 | } 36 | 37 | addrs, err := getAddrs(iface) 38 | if err != nil { 39 | continue 40 | } 41 | 42 | for _, addr := range addrs { 43 | ipNet, ok := addr.(*net.IPNet) 44 | if ok && ipNet.IP.To4() != nil { 45 | if isPrivateIP(ipNet.IP) { 46 | // Calculate the network address 47 | networkIP := ipNet.IP.Mask(ipNet.Mask) 48 | return fmt.Sprintf("%s/%d", networkIP.String(), maskToCIDR(ipNet.Mask)), nil 49 | } 50 | } 51 | } 52 | } 53 | 54 | return "", fmt.Errorf("no local network detected") 55 | } 56 | 57 | func isPrivateIP(ip net.IP) bool { 58 | privateBlocks := []string{ 59 
// GetHostName returns the machine's hostname. If os.Hostname fails it
// falls back to resolving "localhost"; if that also fails (or yields no
// names, or the hostname is empty) it returns "unknown".
func GetHostName() string {
	const unknown = "unknown"

	name, err := os.Hostname()
	if err != nil {
		// Fall back to a reverse lookup of "localhost".
		names, lookupErr := net.LookupHost("localhost")
		if lookupErr != nil || len(names) == 0 {
			return unknown
		}
		return names[0]
	}

	if name == "" {
		return unknown
	}
	return name
}
14 | 15 | // Package database is responsible for handling the database 16 | // setup, configuration and abstraction. 17 | package database 18 | 19 | import ( 20 | "database/sql" 21 | "fmt" 22 | "strings" 23 | 24 | cmn "github.com/pzaino/thecrowler/pkg/common" 25 | ) 26 | 27 | // GetSourceID retrieves the source ID from the database based on the provided filter. 28 | func GetSourceID(filter SourceFilter, db *Handler) (uint64, error) { 29 | var sourceID int64 // Use int64 here since PostgreSQL BIGSERIAL maps to int64 30 | var whereClauses []string 31 | var args []interface{} 32 | parID := 1 33 | 34 | // Dynamically build the WHERE clause based on the input struct 35 | if filter.URL != "" { 36 | whereClauses = append(whereClauses, "url = $"+fmt.Sprint(parID)) 37 | args = append(args, cmn.NormalizeURL(filter.URL)) 38 | parID++ 39 | } 40 | if filter.SourceID > 0 { 41 | whereClauses = append(whereClauses, "source_id = $"+fmt.Sprint(parID)) 42 | args = append(args, filter.SourceID) 43 | } 44 | 45 | if len(whereClauses) == 0 { 46 | return 0, fmt.Errorf("at least one filter (URL or SourceID) must be provided") 47 | } 48 | 49 | query := fmt.Sprintf(` 50 | SELECT source_id 51 | FROM Sources 52 | WHERE %s 53 | LIMIT 1 54 | `, strings.Join(whereClauses, " AND ")) 55 | 56 | // Query the database 57 | err := (*db).QueryRow(query, args...).Scan(&sourceID) 58 | if err == sql.ErrNoRows { 59 | return 0, fmt.Errorf("no source found matching the provided filters") 60 | } else if err != nil { 61 | return 0, fmt.Errorf("error querying the source ID: %w", err) 62 | } 63 | 64 | // Ensure the value is non-negative 65 | if sourceID < 0 { 66 | return 0, fmt.Errorf("invalid source ID retrieved from database: %d", sourceID) 67 | } 68 | 69 | // Convert to uint64 (safe because of the check above) 70 | return uint64(sourceID), nil //nolint:gosec // This is a read-only operation and the int64 is never negative 71 | } 72 | 73 | // IsURLKnown checks if a URL is already present in Sources or 
// IsURLKnown reports whether the given URL is already present in the
// SearchIndex table.
// NOTE(review): the original comment claimed this also checks the Sources
// table, but the query below only consults SearchIndex — confirm whether
// Sources should be included in the check.
func IsURLKnown(url string, db *Handler) (bool, error) {
	const query = `
		SELECT EXISTS (
			SELECT 1 FROM SearchIndex WHERE page_url = $1
		);
	`

	var exists bool
	err := (*db).QueryRow(query, url).Scan(&exists)
	if err != nil {
		return false, fmt.Errorf("failed to check URL existence: %w", err)
	}

	return exists, nil
}
To add a source with `addSource`, write its configuration in a
YAML file and then run the `addSource` command, passing it the path to that
YAML file.
75 | -------------------------------------------------------------------------------- /selenium-patches/4.21.0/Dockerfile_Chromium_ARM64_4.21.0.patch: -------------------------------------------------------------------------------- 1 | --- ./docker-selenium/NodeChromium/Dockerfile 2024-09-07 00:06:57 2 | +++ ./docker-selenium/NodeChromium/Dockerfile_Chromium_ARM64_4.21.0 2024-09-07 00:07:30 3 | @@ -7,12 +7,37 @@ 4 | USER root 5 | 6 | # Install Chromium 7 | -RUN echo "deb http://deb.debian.org/debian/ sid main" >> /etc/apt/sources.list \ 8 | - && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 0E98404D386FA1D9 6ED0E7B82643E131 \ 9 | - && apt-get update -qqy \ 10 | - && apt-get -qqy install chromium \ 11 | - && rm -rf /var/lib/apt/lists/* /var/cache/apt/* 12 | +#RUN echo "deb http://deb.debian.org/debian/ sid main" >> /etc/apt/sources.list \ 13 | +# && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 0E98404D386FA1D9 6ED0E7B82643E131 \ 14 | +# && apt-get update -qqy \ 15 | +# && apt-get -qqy install chromium \ 16 | +# && rm -rf /var/lib/apt/lists/* /var/cache/apt/* 17 | 18 | +# Add the Debian buster repositories 19 | +RUN echo "deb [arch=arm64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main" > /etc/apt/sources.list.d/debian-buster.list \ 20 | + && echo "deb [arch=arm64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main" >> /etc/apt/sources.list.d/debian-buster.list \ 21 | + && echo "deb [arch=arm64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main" >> /etc/apt/sources.list.d/debian-buster.list 22 | + 23 | +# Add the Debian GPG keys 24 | +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517 648ACFD622F3D138 112695A0E562B32A \ 25 | + && apt-key export DCC9EFBF77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg \ 26 | + && apt-key export 648ACFD622F3D138 
| gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg \ 27 | + && apt-key export 112695A0E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg 28 | + 29 | +# Configure apt pinning for Chromium 30 | +RUN echo "Package: *" > /etc/apt/preferences.d/chromium.pref \ 31 | + && echo "Pin: release a=jammy" >> /etc/apt/preferences.d/chromium.pref \ 32 | + && echo "Pin-Priority: 500" >> /etc/apt/preferences.d/chromium.pref \ 33 | + && echo "" >> /etc/apt/preferences.d/chromium.pref \ 34 | + && echo "Package: chromium*" >> /etc/apt/preferences.d/chromium.pref \ 35 | + && echo "Pin: origin deb.debian.org" >> /etc/apt/preferences.d/chromium.pref \ 36 | + && echo "Pin-Priority: 700" >> /etc/apt/preferences.d/chromium.pref 37 | + 38 | +# Install Chromium from Debian buster 39 | +RUN apt-get update \ 40 | + && apt-get install -y chromium \ 41 | + && rm -rf /var/lib/apt/lists/* /var/cache/apt/* 42 | + 43 | #================================= 44 | # Chromium Launch Script Wrapper 45 | #================================= 46 | @@ -32,6 +57,9 @@ 47 | COPY chrome-cleanup.sh /opt/bin/chrome-cleanup.sh 48 | COPY chrome-cleanup.conf /etc/supervisor/conf.d/chrome-cleanup.conf 49 | 50 | +# Disable Debian repositories after installing Chromium 51 | +RUN rm /etc/apt/sources.list.d/debian-buster.list 52 | + 53 | USER ${SEL_UID} 54 | 55 | #============================================ 56 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. 
Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: ["main"] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: ["main"] 20 | schedule: 21 | - cron: "0 0 * * 1" 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | analyze: 28 | name: Analyze 29 | runs-on: ubuntu-latest 30 | permissions: 31 | actions: read 32 | contents: read 33 | security-events: write 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: ["go", "javascript"] 39 | # CodeQL supports [ $supported-codeql-languages ] 40 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 41 | 42 | steps: 43 | - name: Harden Runner 44 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 45 | with: 46 | egress-policy: audit 47 | 48 | - name: Checkout repository 49 | uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 50 | 51 | # Initializes the CodeQL tools for scanning. 52 | - name: Initialize CodeQL 53 | uses: github/codeql-action/init@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 54 | with: 55 | languages: ${{ matrix.language }} 56 | # If you wish to specify custom queries, you can do so here or in a config file. 57 | # By default, queries listed here will override any specified in a config file. 58 | # Prefix the list here with "+" to use these queries and those in the config file. 59 | 60 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 61 | # If this step fails, then you should remove it and run the build manually (see below) 62 | - name: Autobuild 63 | uses: github/codeql-action/autobuild@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 64 | 65 | # ℹ️ Command-line programs to run using the OS shell. 
# If the Autobuild step fails above, remove it and uncomment the following three lines,
# modifying them (or adding more) as needed to build your project; please refer to the example below for guidance.
// urlToHost extracts the host portion of a URL string: it drops any
// scheme prefix (everything up to and including "://"), truncates at the
// first "/", and trims a trailing slash and surrounding whitespace.
func urlToHost(url string) string {
	host := url
	if idx := strings.Index(host, "://"); idx >= 0 {
		host = host[idx+3:]
	}
	if idx := strings.IndexByte(host, '/'); idx >= 0 {
		host = host[:idx]
	}
	host = strings.TrimSuffix(host, "/")
	return strings.TrimSpace(host)
}
// fieldsQuotes splits s around runs of white space characters (as defined
// by unicode.IsSpace), treating double-quoted substrings as single fields.
// A quote preceded by a backslash does not toggle quoting. The quote
// characters themselves are not included in the returned fields.
func fieldsQuotes(s string) []string {
	var fields []string
	var buf []rune
	inQuotes := false

	var prev rune
	for _, r := range s {
		switch {
		case r == '"' && prev != '\\':
			inQuotes = !inQuotes // Toggle the inQuotes state
		case unicode.IsSpace(r) && !inQuotes:
			if len(buf) > 0 {
				fields = append(fields, string(buf))
				buf = buf[:0] // Reset buffer
			}
		default:
			buf = append(buf, r)
		}
		prev = r
	}

	// Flush the last field, if any. BUG FIX: the original returned an
	// empty slice whenever the input ended in whitespace (buf empty at
	// loop exit), discarding every field collected so far; always return
	// the accumulated fields instead.
	if len(buf) > 0 {
		fields = append(fields, string(buf))
	}
	if fields == nil {
		return []string{} // preserve the original non-nil empty result
	}
	return fields
}
30 | type TriggerDefinition struct { 31 | Type string `yaml:"type" json:"type"` // event, api, cron 32 | Name string `yaml:"name" json:"name"` 33 | } 34 | 35 | // IOField represents an input/output field in the agent's manifest. 36 | type IOField struct { 37 | Name string `yaml:"name" json:"name"` 38 | Type string `yaml:"type" json:"type"` 39 | Required bool `yaml:"required,omitempty" json:"required,omitempty"` 40 | } 41 | 42 | // SandboxConfig defines the sandboxing options for the agent. 43 | type SandboxConfig struct { 44 | AllowedNetwork bool `yaml:"allowed_network" json:"allowed_network"` 45 | MaxDuration string `yaml:"max_duration" json:"max_duration"` 46 | } 47 | 48 | // LoadManifest loads a manifest file from disk (YAML only for now) 49 | func LoadManifest(path string) (*Manifest, error) { 50 | f, err := os.Open(path) //nolint:gosec // this path is controlled by the system owner 51 | if err != nil { 52 | return nil, err 53 | } 54 | defer f.Close() // nolint:errcheck // ignore close error 55 | 56 | var manifest Manifest 57 | yamlDecoder := yaml.NewDecoder(f) 58 | if err := yamlDecoder.Decode(&manifest); err != nil { 59 | return nil, fmt.Errorf("failed to parse manifest file %s: %w", path, err) 60 | } 61 | return &manifest, nil 62 | } 63 | 64 | // LoadAllManifests walks a directory and loads all manifest.yaml or .yml files 65 | func LoadAllManifests(dir string) ([]*Manifest, error) { 66 | var manifests []*Manifest 67 | err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 68 | if err != nil { 69 | return err 70 | } 71 | if info.IsDir() { 72 | return nil 73 | } 74 | ext := filepath.Ext(path) 75 | if ext == ".yaml" || ext == ".yml" { 76 | m, err := LoadManifest(path) 77 | if err != nil { 78 | return fmt.Errorf("failed loading manifest %s: %w", path, err) 79 | } 80 | manifests = append(manifests, m) 81 | } 82 | return nil 83 | }) 84 | if err != nil { 85 | return nil, err 86 | } 87 | return manifests, nil 88 | } 89 | 
-------------------------------------------------------------------------------- /pkg/agent/action_run_db_query.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package agent provides the agent functionality for the CROWler. 16 | package agent 17 | 18 | import ( 19 | "fmt" 20 | 21 | cmn "github.com/pzaino/thecrowler/pkg/common" 22 | cdb "github.com/pzaino/thecrowler/pkg/database" 23 | ) 24 | 25 | // DBQueryAction performs database queries or operations 26 | type DBQueryAction struct{} 27 | 28 | // Name returns the name of the action 29 | func (d *DBQueryAction) Name() string { 30 | return "DBQuery" 31 | } 32 | 33 | // Execute runs a database query or operation 34 | func (d *DBQueryAction) Execute(params map[string]interface{}) (map[string]interface{}, error) { 35 | rval := make(map[string]interface{}) 36 | rval[StrResponse] = nil 37 | rval[StrConfig] = nil 38 | 39 | // Check if params has a field called config 40 | config, err := getConfig(params) 41 | if err != nil { 42 | rval[StrStatus] = StatusError 43 | rval[StrMessage] = err.Error() 44 | return rval, err 45 | } 46 | rval[StrConfig] = config 47 | 48 | // Extract dbHandler from config 49 | dbHandler, ok := config["db_handler"].(cdb.Handler) 50 | if !ok { 51 | rval[StrStatus] = StatusError 52 | rval[StrMessage] = "missing 
'dbHandler' in config" 53 | return rval, fmt.Errorf("missing 'dbHandler' in config") 54 | } 55 | 56 | // Extract query string 57 | inputRaw, err := getInput(params) 58 | if err != nil { 59 | rval[StrStatus] = StatusError 60 | rval[StrMessage] = err.Error() 61 | return rval, err 62 | } 63 | 64 | // Check if there is a params field called query 65 | if params["query"] == nil { 66 | rval[StrStatus] = StatusError 67 | rval[StrMessage] = "missing 'query' parameter" 68 | return rval, fmt.Errorf("missing 'query' parameter") 69 | } 70 | query, _ := params["query"].(string) 71 | // Check if query needs to be resolved 72 | query = resolveResponseString(inputRaw, query) 73 | 74 | // Execute the query based on type 75 | var result interface{} 76 | result, err = dbHandler.ExecuteQuery(query) 77 | if err != nil { 78 | rval[StrStatus] = StatusError 79 | rval[StrMessage] = fmt.Sprintf("database operation failed: %v", err) 80 | return rval, fmt.Errorf("database operation failed: %v", err) 81 | } 82 | 83 | // Transform the result into a JSON document where the headers are they keys 84 | // and the values are the values 85 | resultMap := cmn.ConvertMapToJSON(cmn.ConvertInfToMap(result)) // This converts result into a map[string]interface{} and then into a JSON document 86 | 87 | // Return the result 88 | rval[StrResponse] = resultMap 89 | rval[StrStatus] = StatusSuccess 90 | rval[StrMessage] = "database operation successful" 91 | 92 | return rval, nil 93 | } 94 | -------------------------------------------------------------------------------- /pkg/database/database_test.go: -------------------------------------------------------------------------------- 1 | package database 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | cfg "github.com/pzaino/thecrowler/pkg/config" 8 | ) 9 | 10 | func TestBuildConnectionString(t *testing.T) { 11 | // Test cases 12 | tests := []struct { 13 | name string 14 | config cfg.Config 15 | expected string 16 | }{ 17 | { 18 | name: "Test case 1: Default 
values", 19 | config: cfg.Config{ 20 | Database: cfg.Database{}, 21 | }, 22 | expected: "host=localhost port=5432 user=crowler password= dbname=SitesIndex sslmode=disable", 23 | }, 24 | { 25 | name: "Test case 2: Custom values", 26 | config: cfg.Config{ 27 | Database: cfg.Database{ 28 | Port: 5433, 29 | Host: "example.com", 30 | User: "customuser", 31 | Password: "custompassword", 32 | DBName: "customdb", 33 | SSLMode: "require", 34 | }, 35 | }, 36 | expected: "host=example.com port=5433 user=customuser password=custompassword dbname=customdb sslmode=require", 37 | }, 38 | } 39 | 40 | // Run tests 41 | for _, test := range tests { 42 | t.Run(test.name, func(t *testing.T) { 43 | result := buildConnectionString(test.config) 44 | if result != test.expected { 45 | t.Errorf("expected '%s', got '%s'", test.expected, result) 46 | } 47 | }) 48 | } 49 | } 50 | 51 | func TestNewHandler(t *testing.T) { 52 | // Test cases 53 | tests := []struct { 54 | name string 55 | config cfg.Config 56 | expectedType interface{} 57 | expectedErr error 58 | }{ 59 | { 60 | name: "Test case 1: Postgres", 61 | config: cfg.Config{ 62 | Database: cfg.Database{ 63 | Type: DBPostgresStr, 64 | }, 65 | }, 66 | expectedType: &PostgresHandler{}, 67 | expectedErr: nil, 68 | }, 69 | { 70 | name: "Test case 2: SQLite", 71 | config: cfg.Config{ 72 | Database: cfg.Database{ 73 | Type: DBSQLiteStr, 74 | }, 75 | }, 76 | expectedType: &SQLiteHandler{}, 77 | expectedErr: nil, 78 | }, 79 | { 80 | name: "Test case 3: Unsupported database type", 81 | config: cfg.Config{ 82 | Database: cfg.Database{ 83 | Type: "mysql", 84 | }, 85 | }, 86 | expectedType: nil, 87 | expectedErr: fmt.Errorf("unsupported database type: 'mysql'"), 88 | }, 89 | } 90 | 91 | // Run tests 92 | for _, test := range tests { 93 | t.Run(test.name, func(t *testing.T) { 94 | handler, err := NewHandler(test.config) 95 | 96 | if (err != nil && test.expectedErr == nil) || (err == nil && test.expectedErr != nil) || (err != nil && err.Error() != 
test.expectedErr.Error()) { 97 | t.Errorf("expected error '%v', got '%v'", test.expectedErr, err) 98 | } 99 | 100 | if test.expectedType != nil { 101 | switch test.expectedType.(type) { 102 | case *PostgresHandler: 103 | if _, ok := handler.(*PostgresHandler); !ok { 104 | t.Errorf("expected type *PostgresHandler, got %T", handler) 105 | } 106 | case *SQLiteHandler: 107 | if _, ok := handler.(*SQLiteHandler); !ok { 108 | t.Errorf("expected type *SQLiteHandler, got %T", handler) 109 | } 110 | } 111 | } else if handler != nil { 112 | t.Errorf("expected nil handler, got %T", handler) 113 | } 114 | }) 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /pkg/common/slices.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Paolo Fabio Zaino 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package common package is used to store common functions and variables 16 | package common 17 | 18 | import ( 19 | "strconv" 20 | "strings" 21 | ) 22 | 23 | // PrepareSlice trims spaces from all elements of a slice. 24 | // PrepareSlice prepares a slice of strings by trimming and lowercasing each element. 
func PrepareSlice(slice *[]string, flags uint32) []string {
	// Named bit flags instead of the original octal literals 01/02,
	// which read like decimal but are octal in Go.
	const (
		flagTrim  uint32 = 0x1 // bit 0: trim leading/trailing whitespace
		flagLower uint32 = 0x2 // bit 1: convert to lower case
	)
	prepared := make([]string, len(*slice)) // Pre-allocate slice to required size
	for i, s := range *slice {
		if flags&flagTrim != 0 {
			s = strings.TrimSpace(s)
		}
		if flags&flagLower != 0 {
			s = strings.ToLower(s)
		}
		prepared[i] = s // Direct assignment to pre-allocated slice
	}
	return prepared
}

// SliceContains checks if a slice contains a specific item.
func SliceContains(slice []string, item string) bool {
	// After some benchmarking tests, this is the fastest way to check if a slice contains an item.
	// the performance resulted better than using "range" and pre-unrolled loops.
	for i := 0; i < len(slice); i++ {
		if slice[i] == item {
			return true
		}
	}
	return false
}

// IntSliceToString converts a slice of integers to a string, joining the
// decimal representations with joinStr.
func IntSliceToString(slice []int, joinStr string) string {
	// Convert the slice of integers to a slice of strings
	strSlice := make([]string, len(slice))
	for i, v := range slice {
		strSlice[i] = strconv.Itoa(v)
	}
	return strings.Join(strSlice, joinStr)
}

// Float64SliceToString converts a slice of float64 to a string, joining the
// shortest round-tripping decimal representations with joinStr.
func Float64SliceToString(slice []float64, joinStr string) string {
	// Convert the slice of float64 to a slice of strings
	strSlice := make([]string, len(slice))
	for i, v := range slice {
		strSlice[i] = strconv.FormatFloat(v, 'f', -1, 64)
	}
	return strings.Join(strSlice, joinStr)
}
// Float32SliceToString converts a slice of float32 to a string, joining the
// formatted values with joinStr.
func Float32SliceToString(slice []float32, joinStr string) string {
	// Convert the slice of float32 to a slice of strings.
	// FormatFloat requires a float64; bitSize 32 keeps the shortest
	// representation that round-trips as a float32.
	strSlice := make([]string, len(slice))
	for i, v := range slice {
		strSlice[i] = strconv.FormatFloat(float64(v), 'f', -1, 32)
	}
	return strings.Join(strSlice, joinStr)
}

// BoolSliceToString converts a slice of bool to a string, joining the
// "true"/"false" representations with joinStr.
func BoolSliceToString(slice []bool, joinStr string) string {
	// Convert the slice of bool to a slice of strings
	strSlice := make([]string, len(slice))
	for i, v := range slice {
		strSlice[i] = strconv.FormatBool(v)
	}
	return strings.Join(strSlice, joinStr)
}
--------------------------------------------------------------------------------
/pkg/ruleset/crawlingrule.go:
--------------------------------------------------------------------------------
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package ruleset implements the ruleset library for the Crowler and
// the scrapper.
package ruleset

import (
	"strings"
)

///// --------------------- CrawlingRule ------------------------------- /////
26 | func (c *CrawlingRule) GetRuleName() string { 27 | return strings.TrimSpace(c.RuleName) 28 | } 29 | 30 | // GetRequestType returns the request type for the specified crawling rule. 31 | // GET or POST etc. 32 | func (c *CrawlingRule) GetRequestType() string { 33 | return strings.ToUpper(strings.TrimSpace(c.RequestType)) 34 | } 35 | 36 | // GetTargetElements returns the target elements for the specified crawling rule. 37 | func (c *CrawlingRule) GetTargetElements() []TargetElement { 38 | return c.TargetElements 39 | } 40 | 41 | // GetFuzzingParameters returns the fuzzing parameters for the specified crawling rule. 42 | func (c *CrawlingRule) GetFuzzingParameters() []FuzzingParameter { 43 | return c.FuzzingParameters 44 | } 45 | 46 | ///// --------------------- TargetElement ------------------------------- ///// 47 | 48 | // GetSelectorType returns the selector type for the specified target element. 49 | func (t *TargetElement) GetSelectorType() string { 50 | return strings.ToLower(strings.TrimSpace(t.SelectorType)) 51 | } 52 | 53 | // GetSelector returns the selector for the specified target element. 54 | func (t *TargetElement) GetSelector() string { 55 | return strings.TrimSpace(t.Selector) 56 | } 57 | 58 | ///// --------------------- FuzzingParameter ------------------------------- ///// 59 | 60 | // GetParameterName returns the parameter name for the specified fuzzing parameter. 61 | func (f *FuzzingParameter) GetParameterName() string { 62 | return strings.TrimSpace(f.ParameterName) 63 | } 64 | 65 | // GetFuzzingType returns the fuzzing type for the specified fuzzing parameter. 66 | func (f *FuzzingParameter) GetFuzzingType() string { 67 | return strings.ToLower(strings.TrimSpace(f.FuzzingType)) 68 | } 69 | 70 | // GetValues returns the list of values for the specified fuzzing parameter. 
// GetValues returns the list of values for the specified fuzzing parameter,
// each trimmed of surrounding whitespace. Always returns a non-nil slice.
func (f *FuzzingParameter) GetValues() []string {
	trimmedValues := []string{}
	for _, v := range f.Values {
		trimmedValues = append(trimmedValues, strings.TrimSpace(v))
	}
	return trimmedValues
}

// GetPattern returns the pattern for the specified fuzzing parameter,
// trimmed of surrounding whitespace.
func (f *FuzzingParameter) GetPattern() string {
	return strings.TrimSpace(f.Pattern)
}

// GetSelector returns the selector for the specified fuzzing parameter,
// trimmed of surrounding whitespace.
func (f *FuzzingParameter) GetSelector() string {
	return strings.TrimSpace(f.Selector)
}
--------------------------------------------------------------------------------
/pkg/ruleset/scrapingrule.go:
--------------------------------------------------------------------------------
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package ruleset implements the ruleset library for the Crowler and
// the scrapper.
package ruleset

import (
	"strings"
)

///// ------------------------ ScrapingRule ---------------------------- /////

// GetRuleName returns the rule name for the specified scraping rule,
// trimmed of surrounding whitespace.
func (r *ScrapingRule) GetRuleName() string {
	return strings.TrimSpace(r.RuleName)
}
31 | func (r *ScrapingRule) GetPaths() []string { 32 | var paths []string 33 | for _, p := range r.PreConditions { 34 | if strings.TrimSpace(p.Path) == "" { 35 | continue 36 | } 37 | paths = append(paths, strings.TrimSpace(p.Path)) 38 | } 39 | return paths 40 | } 41 | 42 | // GetURLs returns the URL for the specified scraping rule. 43 | func (r *ScrapingRule) GetURLs() []string { 44 | var urls []string 45 | for _, u := range r.PreConditions { 46 | if strings.TrimSpace(u.URL) == "" { 47 | continue 48 | } 49 | urls = append(urls, strings.TrimSpace(u.URL)) 50 | } 51 | return urls 52 | } 53 | 54 | // GetElements returns the elements for the specified scraping rule. 55 | func (r *ScrapingRule) GetElements() []Element { 56 | return r.Elements 57 | } 58 | 59 | // GetJsFiles returns the js_files flag for the specified scraping rule. 60 | func (r *ScrapingRule) GetJsFiles() bool { 61 | return r.JsFiles 62 | } 63 | 64 | // GetJSONFieldMappings returns the JSON field mappings for the specified scraping rule. 65 | func (r *ScrapingRule) GetJSONFieldMappings() map[string]string { 66 | return r.JSONFieldMappings 67 | } 68 | 69 | // GetWaitConditions returns the wait conditions for the specified scraping rule. 70 | func (r *ScrapingRule) GetWaitConditions() []WaitCondition { 71 | return r.WaitConditions 72 | } 73 | 74 | // GetPostProcessing returns the post-processing steps for the specified scraping rule. 75 | func (r *ScrapingRule) GetPostProcessing() []PostProcessingStep { 76 | return r.PostProcessing 77 | } 78 | 79 | // GetConditionType returns the condition type for the specified wait condition. 80 | func (w *WaitCondition) GetConditionType() string { 81 | return strings.ToLower(strings.TrimSpace(w.ConditionType)) 82 | } 83 | 84 | // GetSelector returns the selector for the specified wait condition. 85 | func (w *WaitCondition) GetSelector() Selector { 86 | return w.Selector 87 | } 88 | 89 | // GetStepType returns the step type for the specified post-processing step. 
// GetStepType returns the normalized (trimmed, lower-case) step type for the
// specified post-processing step.
func (p *PostProcessingStep) GetStepType() string {
	return strings.ToLower(strings.TrimSpace(p.Type))
}

// GetDetails returns the free-form details map for the specified
// post-processing step.
func (p *PostProcessingStep) GetDetails() map[string]interface{} {
	return p.Details
}
--------------------------------------------------------------------------------
/pkg/crawler/fuzzing_test.go:
--------------------------------------------------------------------------------
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package crawler implements the crawling logic of the application.
// It's responsible for crawling a website and extracting information from it.
package crawler

import (
	"testing"

	rules "github.com/pzaino/thecrowler/pkg/ruleset"
)

// Mock data for tests: each case pairs a CrawlingRule with the number of
// fuzzed URLs FuzzURL is expected to produce for the fixed base URL used
// in TestFuzzURL.
var testRules = []struct {
	name             string
	rule             rules.CrawlingRule
	expectedURLCount int
}{
	{
		// One "query" fuzzing parameter with three fixed values -> 3 URLs.
		name: "FuzzQueryParameter",
		rule: rules.CrawlingRule{
			RequestType: "GET",
			TargetElements: []rules.TargetElement{
				{
					SelectorType: "query",
					Selector:     "q",
				},
			},
			FuzzingParameters: []rules.FuzzingParameter{
				{
					ParameterName: "query",
					FuzzingType:   "fixed_list",
					Values:        []string{"fuzz1", "fuzz2", "fuzz3"},
				},
			},
		},
		expectedURLCount: 3,
	},
	{
		// One "path" fuzzing parameter with two fixed values -> 2 URLs.
		name: "FuzzURLPath",
		rule: rules.CrawlingRule{
			RequestType: "GET",
			TargetElements: []rules.TargetElement{
				{
					SelectorType: "path",
					Selector:     "to-be-fuzzed",
				},
			},
			FuzzingParameters: []rules.FuzzingParameter{
				{
					ParameterName: "path",
					FuzzingType:   "fixed_list",
					Values:        []string{"fuzzedPath1", "fuzzedPath2"},
				},
			},
		},
		expectedURLCount: 2,
	},
	{
		// Query and path fuzzing combined; counts are additive, not
		// multiplicative.
		name: "FuzzBothQueryAndPath",
		rule: rules.CrawlingRule{
			RequestType: "GET",
			TargetElements: []rules.TargetElement{
				{
					SelectorType: "query",
					Selector:     "page",
				},
				{
					SelectorType: "path",
					Selector:     "to-be-fuzzed",
				},
			},
			FuzzingParameters: []rules.FuzzingParameter{
				{
					ParameterName: "query",
					FuzzingType:   "fixed_list",
					Values:        []string{"10", "20"},
				},
				{
					ParameterName: "path",
					FuzzingType:   "fixed_list",
					Values:        []string{"fuzzedPath1", "fuzzedPath2"},
				},
			},
		},
		expectedURLCount: 4, // 2 for the query "page" parameter and 2 for the path
	},
}

// TestFuzzURL checks that FuzzURL produces the expected number of fuzzed
// URLs for each rule in testRules against a fixed base URL. Only the count
// is asserted, not the URL contents.
func TestFuzzURL(t *testing.T) {
	baseURL := "http://example.com/search?q=test&page=1"

	for _, tc := range testRules {
		t.Run(tc.name, func(t *testing.T) {
			fuzzedURLs, err := FuzzURL(baseURL, tc.rule)
			if err != nil {
				t.Errorf("FuzzURL returned an error: %v", err)
			}
			if len(fuzzedURLs) != tc.expectedURLCount {
				t.Errorf("Expected %d fuzzed URLs, got %d", tc.expectedURLCount, len(fuzzedURLs))
			}
		})
	}
}
--------------------------------------------------------------------------------
/pkg/fingerprints/ja4.go:
--------------------------------------------------------------------------------
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fingerprints implements the fingerprints library for the Crowler
package fingerprints

import (
	//nolint:gosec // Disabling G501: Md5 is required for backward compatibility, we do not use it for security purposes
	"crypto/md5"
	"encoding/hex"
	"strconv"
	"strings"
)

// JA4 implements the Fingerprint interface for JA4 fingerprints.
// The fields hold the TLS handshake values that Compute folds into a
// comma-separated summary string before hashing.
type JA4 struct {
	Version             uint16   // TLS version
	Ciphers             []uint16 // cipher suites
	Extensions          []uint16 // extension IDs
	SupportedGroups     []uint16 // supported groups (curves)
	SignatureAlgorithms []uint16 // signature algorithms
	SNI                 string   // server name indication
	ALPN                []string // ALPN protocol list
}
38 | func compute(data string) string { 39 | // Split data string to retrieve all fields 40 | fields := strings.Split(data, ",") 41 | 42 | // Join fields relevant to JA4, e.g., Version, Ciphers, Extensions, etc. 43 | // Example: "Version,Ciphers,Extensions,Groups,Signatures,SNI,ALPN" 44 | fingerprint := strings.Join(fields, ",") 45 | 46 | // Compute the MD5 hash of the joined fields 47 | //nolint:gosec // Disabling G401: Md5 is required for backward compatibility, we do not use it for security purposes 48 | hash := md5.Sum([]byte(fingerprint)) 49 | return hex.EncodeToString(hash[:]) 50 | } 51 | 52 | // Compute computes the JA4 fingerprint of a given TLS data string. 53 | // The data string should be constructed from the TLS handshake fields relevant to JA4. 54 | func (j JA4) Compute(data string) string { 55 | if data == "" { 56 | // generate data using j's fields 57 | data = strings.Join([]string{ 58 | strconv.Itoa(int(j.Version)), 59 | strconv.Itoa(len(j.Ciphers)), 60 | strconv.Itoa(len(j.Extensions)), 61 | strconv.Itoa(len(j.SupportedGroups)), 62 | strconv.Itoa(len(j.SignatureAlgorithms)), 63 | j.SNI, 64 | strconv.Itoa(len(j.ALPN)), 65 | }, ",") 66 | } 67 | return compute(data) 68 | } 69 | 70 | // JA4S implements the Fingerprint interface for JA4S fingerprints. 71 | type JA4S struct { 72 | Version uint16 73 | Ciphers []uint16 74 | Extensions []uint16 75 | SupportedGroups []uint16 76 | SignatureAlgorithms []uint16 77 | SNI string 78 | ALPN []string 79 | } 80 | 81 | // Compute computes the JA4S fingerprint of a given TLS data string. 82 | // The data string should be constructed from the TLS server handshake fields. 
// Compute computes the JA4S fingerprint of a given TLS data string.
// If data is empty, a summary string is built from the receiver's fields
// (same layout as JA4.Compute: version, field counts, SNI) and hashed
// instead via the package-level compute helper.
func (j JA4S) Compute(data string) string {
	if data == "" {
		// generate data using j's fields
		data = strings.Join([]string{
			strconv.Itoa(int(j.Version)),
			strconv.Itoa(len(j.Ciphers)),
			strconv.Itoa(len(j.Extensions)),
			strconv.Itoa(len(j.SupportedGroups)),
			strconv.Itoa(len(j.SignatureAlgorithms)),
			j.SNI,
			strconv.Itoa(len(j.ALPN)),
		}, ",")
	}
	return compute(data)
}
--------------------------------------------------------------------------------
/pkg/agent/execute_isolated_unix_other.go:
--------------------------------------------------------------------------------
//go:build darwin || freebsd || openbsd || netbsd
// +build darwin freebsd openbsd netbsd

// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package agent provides the agent functionality for the CROWler.
package agent

import (
	"bytes"
	"context"
	"fmt"
	"os"
	"os/exec"
	"syscall"
	"time"
)

// This is the implementation of executeIsolatedCommand for Unix-like systems
// like macOS, BSD etc.
// executeIsolatedCommand (macOS/BSD) runs command with args in a restricted
// child process and returns its captured stdout, stderr, exit code and error.
//   - UID/GID drop if requested (requires privilege). A uid/gid of 0 means
//     "keep current" — NOTE(review): this makes it impossible to request an
//     explicit drop to uid 0 / gid 0; confirm this is intended.
//   - Captures stdout/stderr + exit code.
//   - timeout > 0 enforces a deadline; on expiry the whole process group is
//     SIGKILLed and a timeout error is returned.
//   - chroot not supported here via child-only SysProcAttr; reject if requested.
func executeIsolatedCommand(
	command string,
	args []string,
	chrootDir string,
	uid, gid uint32,
	timeout time.Duration,
) (stdout string, stderr string, exitCode int, err error) {
	if command == "" {
		return "", "", -1, fmt.Errorf("empty command")
	}
	if chrootDir != "" {
		return "", "", -1, fmt.Errorf("chroot is not supported on this platform via child-only SysProcAttr")
	}

	// Optional deadline for the child; a zero timeout means "wait forever".
	ctx := context.Background()
	if timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, timeout)
		defer cancel()
	}

	cmd := exec.CommandContext(ctx, command, args...)

	var outBuf, errBuf bytes.Buffer
	cmd.Stdout = &outBuf
	cmd.Stderr = &errBuf
	// Minimal, fixed environment: the child inherits only a safe PATH.
	cmd.Env = []string{"PATH=/usr/bin:/bin"}

	// Setpgid places the child in its own process group so the timeout path
	// can kill the child and anything it spawned in one shot.
	sys := &syscall.SysProcAttr{
		Setpgid: true,
	}
	// only set Credential if caller asked for a change
	curUID, curGID := os.Geteuid(), os.Getegid()
	useCred := false
	credUID, credGID := uint32(curUID), uint32(curGID) // nolint:gosec // this is fine here, go seems to return int for actual uint32
	if uid != 0 {
		useCred, credUID = true, uint32(uid)
	}
	if gid != 0 {
		useCred, credGID = true, uint32(gid)
	}
	if useCred {
		sys.Credential = &syscall.Credential{Uid: credUID, Gid: credGID}
	}
	cmd.SysProcAttr = sys

	if err := cmd.Start(); err != nil {
		return "", "", -1, fmt.Errorf("start failed: %w", err)
	}

	// Wait in a goroutine so the select below can race process completion
	// against the context deadline.
	done := make(chan struct{})
	go func() { _ = cmd.Wait(); close(done) }()

	select {
	case <-done:
	case <-ctx.Done():
		// Negative PID targets the whole process group created via Setpgid.
		_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
		<-done
	}

	// Extract the raw wait status; -1 when the process state is unavailable.
	exit := -1
	if ps := cmd.ProcessState; ps != nil {
		if ws, ok := ps.Sys().(syscall.WaitStatus); ok {
			exit = ws.ExitStatus()
		}
	}

	if ctx.Err() == context.DeadlineExceeded {
		return outBuf.String(), errBuf.String(), exit, fmt.Errorf("command timeout after %s", timeout)
	}
	if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() != 0 {
		return outBuf.String(), errBuf.String(), exit, fmt.Errorf("command exited with status %d", exit)
	}

	return outBuf.String(), errBuf.String(), exit, nil
}
--------------------------------------------------------------------------------
/pkg/crawler/fuzzing_rules.go:
--------------------------------------------------------------------------------
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package crawler implements the crawling logic of the application.
// It's responsible for crawling a website and extracting information from it.
package crawler

import (
	"net/url"
	"strings"

	rules "github.com/pzaino/thecrowler/pkg/ruleset"
)
// FuzzURL takes a base URL and a CrawlingRule, generating fuzzed URLs based
// on the rule's parameters. Query-fuzzed URLs come first, then path-fuzzed
// ones. The error is non-nil only when baseURL cannot be parsed.
func FuzzURL(baseURL string, rule rules.CrawlingRule) ([]string, error) {
	var fuzzedURLs []string

	parsedURL, err := url.Parse(baseURL)
	if err != nil {
		return nil, err
	}

	fuzzedURLs = fuzzQueryParameters(parsedURL, rule, fuzzedURLs)
	fuzzedURLs = fuzzURLPath(parsedURL, rule, fuzzedURLs)

	return fuzzedURLs, nil
}

// fuzzQueryParameters appends one fuzzed URL per value of every fuzzing
// parameter named "query". It only runs when the base URL already carries a
// query string; parsedURL itself is never mutated (copies are taken).
func fuzzQueryParameters(parsedURL *url.URL, rule rules.CrawlingRule, fuzzedURLs []string) []string {
	if parsedURL.RawQuery != "" {
		originalQuery := parsedURL.Query()

		for _, fuzzParam := range rule.GetFuzzingParameters() {
			// Check if this parameter is meant to be fuzzed
			if fuzzParam.GetParameterName() == "query" {
				values := generateFuzzValues(fuzzParam)
				for _, value := range values {
					fuzzedQuery := cloneQueryValues(originalQuery)
					// NOTE(review): the query key comes from the fuzzing
					// parameter's own Selector, not from the rule's
					// TargetElements; with an empty Selector this sets an
					// empty-named parameter — confirm this is intended.
					selector := fuzzParam.GetSelector() // Declare the selector variable
					fuzzedQuery.Set(selector, value)
					fuzzedURL := *parsedURL
					fuzzedURL.RawQuery = fuzzedQuery.Encode()
					fuzzedURLs = append(fuzzedURLs, fuzzedURL.String())
				}
			}
		}
	}

	return fuzzedURLs
}

// fuzzURLPath appends one fuzzed URL per value of every fuzzing parameter
// named "path", replacing the first occurrence of each path-type target
// selector inside the URL path.
func fuzzURLPath(parsedURL *url.URL, rule rules.CrawlingRule, fuzzedURLs []string) []string {
	const strPath = "path"
	for _, target := range rule.GetTargetElements() {
		selectorType := target.GetSelectorType()
		selector := target.GetSelector()

		if selectorType == strPath {
			for _, fuzzParam := range rule.GetFuzzingParameters() {
				if fuzzParam.GetParameterName() == strPath {
					values := generateFuzzValues(fuzzParam)
					for _, value := range values {
						fuzzedURL := *parsedURL
						// Only the first occurrence of the selector in the
						// path is replaced.
						fuzzedURL.Path = strings.Replace(fuzzedURL.Path, selector, value, 1)
						fuzzedURLs = append(fuzzedURLs, fuzzedURL.String())
					}
				}
			}
		}
	}

	return fuzzedURLs
}

// Helper function to generate fuzz values based on the fuzzing parameter.
// For "pattern_based" parameters the raw pattern itself is returned as the
// single value (a simplification — no pattern expansion is performed);
// otherwise the parameter's fixed value list is used.
func generateFuzzValues(fuzzParam rules.FuzzingParameter) []string {
	var values []string
	if fuzzParam.GetFuzzingType() == "pattern_based" {
		// Implement pattern-based value generation logic
		values = append(values, fuzzParam.GetPattern()) // Simplification
	} else {
		values = fuzzParam.GetValues()
	}
	return values
}

// Helper function to clone query values to avoid mutating the original.
// This is a shallow copy: the per-key value slices are shared, which is safe
// here because callers replace whole entries via Set rather than appending.
func cloneQueryValues(originalQuery url.Values) url.Values {
	fuzzedQuery := url.Values{}
	for k, v := range originalQuery {
		fuzzedQuery[k] = v
	}
	return fuzzedQuery
}
--------------------------------------------------------------------------------
/pkg/common/types.go:
--------------------------------------------------------------------------------
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 | 15 | // Package common package is used to store common functions and variables 16 | package common 17 | 18 | // DbgLogType is an enum to represent the debug log type 19 | type DbgLogType int 20 | 21 | const ( 22 | // DbgLogTypeStdout is the standard output 23 | DbgLogTypeStdout = 0 24 | // DbgLogTypeFile is the file output 25 | DbgLogTypeFile = 1 26 | // DbgLogTypeSyslog is the syslog output 27 | DbgLogTypeSyslog = 2 28 | ) 29 | 30 | // LoggerCfg is the logger configuration 31 | type LoggerCfg struct { 32 | // Type is the type of logger to use (stdout, file, syslog) 33 | Type DbgLogType 34 | // File is the file to write the logs to 35 | File string 36 | // Host is the syslog server to send the logs to 37 | Host string 38 | // Port is the syslog server port 39 | Port int 40 | // Tag is the syslog tag 41 | Tag string 42 | // Facility is the syslog facility 43 | Facility string 44 | // Priority is the syslog priority 45 | Priority string 46 | } 47 | 48 | // DbgLevel is an enum to represent the debug level type 49 | type DbgLevel int 50 | 51 | const ( 52 | // DbgLvlDebug5 is the debug level 53 | DbgLvlDebug5 = 5 54 | // DbgLvlDebug4 is the debug level 55 | DbgLvlDebug4 = 4 56 | // DbgLvlDebug3 is the debug level 57 | DbgLvlDebug3 = 3 58 | // DbgLvlDebug2 is the debug level 59 | DbgLvlDebug2 = 2 60 | // DbgLvlDebug1 is the debug level 61 | DbgLvlDebug1 = 1 62 | // DbgLvlDebug is the debug level 63 | DbgLvlDebug = 1 64 | // DbgLvlInfo is the info debug level 65 | DbgLvlInfo = 0 66 | // DbgLvlWarn is the warning debug level 67 | DbgLvlWarn = -1 68 | // DbgLvlError is the error debug level 69 | DbgLvlError = -2 70 | // DbgLvlFatal is the fatal debug level (this will also exit the program!) 
71 | DbgLvlFatal = -3 72 | ) 73 | 74 | // EnvValue is a struct to represent an environment variable 75 | type EnvValue struct { 76 | // Name is the name of the environment variable 77 | Name string 78 | // Value is the value of the environment variable 79 | Value interface{} 80 | // Type is the type of the environment variable 81 | Type string 82 | } 83 | 84 | var ( 85 | // UsrAgentStrMap is a list of valid user agent strings. 86 | UsrAgentStrMap = map[string]string{ 87 | "chrome-desktop01": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36", 88 | "chrome-mobile01": "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.112 Mobile Safari/537.36", 89 | "firefox-desktop01": "Mozilla/5.0 (X11; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0", 90 | "firefox-mobile01": "Mozilla/5.0 (Android 10; Mobile; rv:85.0) Gecko/20100101 Firefox/85.0", 91 | "chromium-desktop01": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.112 Safari/537.36", 92 | "chromium-mobile01": "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.112 Mobile Safari/537.36", 93 | } 94 | ) 95 | 96 | const ( 97 | // DefaultFilePerms is the default file permissions 98 | DefaultFilePerms = 0644 99 | // DefaultDirPerms is the default directory permissions 100 | DefaultDirPerms = 0755 101 | ) 102 | -------------------------------------------------------------------------------- /tests/fuzz/api_fuzz_payloads.txt: -------------------------------------------------------------------------------- 1 | value1 2 | ex' 1 = 1 3 | 4 | null 5 | string_with_special_chars_!@#$% 6 | ' 7 | '' 8 | ` 9 | `` 10 | " 11 | "" 12 | -- 13 | # 14 | ; 15 | ' OR '1'='1 16 | " OR "1"="1 17 | ' OR '1'='1' -- 18 | " OR "1"="1" -- 19 | ' OR '1'='1' /* 20 | " OR "1"="1" /* 21 | ') OR ('1'='1' -- 22 | ") OR ("1"="1" -- 23 | ') OR ('1'='1' /* 24 | ") OR 
("1"="1" /* 25 | ' OR 1=1 -- 26 | " OR 1=1 -- 27 | ') OR 1=1 -- 28 | ") OR 1=1 -- 29 | ' OR 'x'='x 30 | " OR "x"="x 31 | ') OR ('x'='x 32 | ") OR ("x"="x 33 | ' OR 1=1# 34 | ' OR 1=1-- 35 | ' OR 1=1; -- 36 | ' OR 'x'='x'; -- 37 | %00 38 | foo%00bar 39 | ../../../../%00/etc/passwd 40 | ../../../../etc/passwd%00 41 | %00/etc/passwd 42 | %00 43 | %22 44 | %27 45 | %2b 46 | %2d 47 | %2e 48 | %2f 49 | %3c 50 | %3e 51 | %5c 52 | %60 53 | < > % & ; $ ( ) \ ' " 54 | \0 \\ 55 | %00 %25 %2e %2f %5c %7e 56 | %2e%2e%2f %2e%2e\ %c0%af %c1%9c 57 | A*5000 58 | A*10000 59 | A*20000 60 | %u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141%u4141 61 | 
%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090%u9090 62 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 63 | -------------------------------------------------------------------------------- /doc/ruleset_architecture.md: -------------------------------------------------------------------------------- 1 | # The Ruleset Architecture 2 | 3 | The crowler uses a rules engine to determine which: 4 | 5 | - URLs to visit (crawl) 6 | - Actions to take (how to interact with the page) 7 | - Data to collect (scraping and indexing) 8 | - Data to store (saving to a database, filesystem, etc.) 9 | - Entities to identify/detect (e.g. products, technologies, etc.) 10 | 11 | One way to describe what the CROWler is at its essence, 12 | is: **The CROWler is as smart as your ruleset is.** 13 | 14 | Rulesets can be expressed either as JSON files or YAML files. They 15 | can be provided locally with the crowler engine or fetched from a 16 | remote distribution server. 17 | 18 | Rules are generally declarative, however some rules type (like for 19 | example the Action rules) may be extended via imperative code (in 20 | Javascript). 21 | 22 | The combination of the CROWler configuration, the Source configuration 23 | and the rulesets gives the CROWler the ability to adapt to a large set 24 | of scenarios, together with the ability to be easily extended. 25 | 26 | Scraping rules, for example, can be used to also extend the CROWler's 27 | data model, by defining new entities and relationships between collected 28 | data. 29 | 30 | ## Rules Architecture Hierarchy 31 | 32 | ### Rules Engine 33 | 34 | At the top of the hierarchy is the Rules Engine (Rulesengine). The Rules 35 | Engine is responsible for orchestrating all the rulesets and provides 36 | methods to access them all. 37 | 38 | A rule engine is fundamentally a collection of rulesets. 
39 | 40 | The Rules Engine is responsible for: 41 | 42 | - Loading all the rulesets 43 | - Providing methods for easy access to all the rulesets from all the 44 | CROWler components that require it 45 | 46 | ### Ruleset 47 | 48 | The Ruleset is a collection of rule groups. 49 | 50 | A ruleset is a single file on the filesystem. The way you should think 51 | of a ruleset is a collection of rules that are related to each other and 52 | organized in groups. 53 | 54 | The Ruleset is responsible for: 55 | 56 | - Provide methods to access all the rule groups 57 | - Provide methods to access all the rules 58 | 59 | ### Rule Group 60 | 61 | The Rule Group is a collection of rules. It usually represents the 62 | concept that a set of rules are trying to achieve. 63 | 64 | A rule group can contain rules of different types, but they should be 65 | related to each other conceptually. 66 | 67 | The Rule Group is responsible for: 68 | 69 | - Provide methods to access all the rules 70 | - Provide properties that are common to all the rules in the group (like 71 | the group name, description, etc.) 72 | - Provide properties that would easily determine if a set of rules are 73 | enabled or disabled, valid in a certain context, etc. 74 | 75 | ### Rule 76 | 77 | The Rule is the smallest unit of the ruleset hierarchy. Each rule has a 78 | **rule type** and a set of conditions. 
79 | 80 | The Rule is responsible for: 81 | 82 | - Representing a single activity that the CROWler should perform 83 | - Providing "what" the CROWler should do (not how it should do it) 84 | - Providing the conditions that must be met for the rule to be executed 85 | 86 | #### Rule Types 87 | 88 | The CROWler supports the following rule types: 89 | 90 | - Crawling rules (describe what we wish to crawl on a given site) 91 | - Action rules (describe what we wish to interact with on a page) 92 | - Scraping rules (describe what we wish to scrape on a page) 93 | - Detection rules (describe what we wish to detect on a page) 94 | 95 | #### Conditions 96 | 97 | Conditions are the criteria that must be met for the rule to be executed. 98 | Each rule type may present different types of conditions. 99 | 100 | ## Ruleset Reference 101 | 102 | Check this [link](./ruleset_reference.md) for a detailed reference of 103 | the ruleset schema. 104 | --------------------------------------------------------------------------------