├── .github └── workflows │ ├── cookieless.yml │ ├── deploy.yml │ ├── integration_tests │ └── telemetry │ │ ├── micro_config │ │ ├── config.hocon │ │ └── iglu.json │ │ ├── sender_config │ │ ├── config.hocon │ │ └── config_disabled.hocon │ │ └── verify_micro_content.py │ ├── ssc-collector-config │ └── config.hocon │ ├── telemetryIntegTest.yml │ └── test.yml ├── .gitignore ├── .gitleaksignore ├── .scalafmt.conf ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── build.sbt ├── core └── src │ ├── it │ └── scala │ │ └── com │ │ └── snowplowanalytics │ │ └── snowplow │ │ └── collectors │ │ └── scalastream │ │ └── it │ │ ├── CollectorContainer.scala │ │ ├── CollectorOutput.scala │ │ ├── EventGenerator.scala │ │ ├── Http.scala │ │ └── utils.scala │ ├── main │ ├── resources │ │ └── reference.conf │ └── scala │ │ └── com.snowplowanalytics.snowplow.collector.core │ │ ├── App.scala │ │ ├── AppInfo.scala │ │ ├── Config.scala │ │ ├── ConfigParser.scala │ │ ├── HttpServer.scala │ │ ├── Rfc6265Cookie.scala │ │ ├── Routes.scala │ │ ├── Run.scala │ │ ├── Service.scala │ │ ├── Sink.scala │ │ ├── SplitBatch.scala │ │ ├── Telemetry.scala │ │ └── model.scala │ └── test │ ├── resources │ ├── test-config-new-style.hocon │ └── test-config-old-style.hocon │ └── scala │ └── com.snowplowanalytics.snowplow.collector.core │ ├── ConfigParserSpec.scala │ ├── HttpServerSpec.scala │ ├── Rfc6265CookieSpec.scala │ ├── RoutesSpec.scala │ ├── ServiceSpec.scala │ ├── SplitBatchSpec.scala │ ├── TelemetrySpec.scala │ ├── TestSink.scala │ └── TestUtils.scala ├── examples ├── config.kafka.extended.hocon ├── config.kafka.minimal.hocon ├── config.kinesis.extended.hocon ├── config.kinesis.minimal.hocon ├── config.nsq.extended.hocon ├── config.nsq.minimal.hocon ├── config.pubsub.extended.hocon ├── config.pubsub.minimal.hocon ├── config.sqs.extended.hocon ├── config.sqs.minimal.hocon ├── config.stdout.extended.hocon └── config.stdout.minimal.hocon ├── flake.lock ├── flake.nix ├── http4s └── src │ └── it │ └── scala │ └── com │ └── snowplowanalytics │ └── snowplow │ └── collectors │ └── scalastream │ └── it │ └── CollectorContainer.scala ├── kafka └── src │ ├── it │ ├── resources │ │ └── collector.hocon │ └── scala │ │ └── com │ │ └── snowplowanalytics │ │ └── snowplow │ │ └── collectors │ │ └── scalastream │ │ └── it │ │ └── kafka │ │ ├── Containers.scala │ │ ├── KafkaCollectorSpec.scala │ │ └── KafkaUtils.scala │ ├── main │ ├── resources │ │ └── application.conf │ └── scala │ │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ │ ├── KafkaCollector.scala │ │ ├── TelemetryUtils.scala │ │ └── sinks │ │ ├── AzureAuthenticationCallbackHandler.scala │ │ ├── KafkaSink.scala │ │ └── KafkaSinkConfig.scala │ └── test │ └── scala │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ └── KafkaConfigSpec.scala ├── kinesis └── src │ ├── it │ ├── resources │ │ ├── collector-client-cookie.hocon │ │ ├── collector-cookie-anonymous.hocon │ │ ├── collector-cookie-attributes-1.hocon │ │ ├── collector-cookie-attributes-2.hocon │ │ ├── collector-cookie-domain.hocon │ │ ├── collector-cookie-fallback.hocon │ │ ├── collector-cookie-no-domain.hocon │ │ ├── collector-custom-paths.hocon │ │ ├── collector-doNotTrackCookie-disabled.hocon │ │ ├── collector-doNotTrackCookie-enabled.hocon │ │ └── collector.hocon │ └── scala │ │ └── com │ │ └── snowplowanalytics │ │ └── snowplow │ │ └── collectors │ │ └── scalastream │ │ └── it │ │ ├── core │ │ ├── CookieSpec.scala │ │ ├── CustomPathsSpec.scala │ │ ├── DoNotTrackCookieSpec.scala │ │ ├── 
HealthEndpointSpec.scala │ │ ├── RobotsSpec.scala │ │ └── XForwardedForSpec.scala │ │ └── kinesis │ │ ├── Kinesis.scala │ │ ├── KinesisCollectorSpec.scala │ │ └── containers │ │ ├── Collector.scala │ │ └── Localstack.scala │ ├── main │ ├── resources │ │ └── application.conf │ └── scala │ │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ │ ├── KinesisCollector.scala │ │ ├── TelemetryUtils.scala │ │ └── sinks │ │ ├── KinesisSink.scala │ │ └── KinesisSinkConfig.scala │ └── test │ └── scala │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ ├── TelemetryUtilsSpec.scala │ └── sinks │ ├── KinesisConfigSpec.scala │ └── KinesisSinkSpec.scala ├── nsq └── src │ ├── main │ ├── resources │ │ └── application.conf │ └── scala │ │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ │ ├── NsqCollector.scala │ │ └── sinks │ │ ├── NsqSink.scala │ │ └── NsqSinkConfig.scala │ └── test │ └── scala │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ └── NsqConfigSpec.scala ├── project ├── BuildSettings.scala ├── Dependencies.scala ├── build.properties └── plugins.sbt ├── pubsub └── src │ ├── it │ ├── resources │ │ └── collector.hocon │ └── scala │ │ └── com │ │ └── snowplowanalytics │ │ └── snowplow │ │ └── collectors │ │ └── scalastream │ │ └── it │ │ └── pubsub │ │ ├── Containers.scala │ │ ├── GooglePubSubCollectorSpec.scala │ │ └── PubSub.scala │ ├── main │ ├── resources │ │ └── application.conf │ └── scala │ │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ │ ├── PubSubCollector.scala │ │ └── sinks │ │ ├── BuilderOps.scala │ │ ├── PubSubHealthCheck.scala │ │ ├── PubSubSink.scala │ │ └── PubSubSinkConfig.scala │ └── test │ └── scala │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ ├── ConfigSpec.scala │ └── sinks │ └── GcpUserAgentSpec.scala ├── sqs └── src │ ├── main │ ├── resources │ │ └── application.conf │ └── scala │ │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ │ ├── SqsCollector.scala │ │ ├── TelemetryUtils.scala │ │ └── sinks │ │ ├── SqsSink.scala │ │ └── SqsSinkConfig.scala │ └── test │ └── scala │ └── com.snowplowanalytics.snowplow.collectors.scalastream │ ├── SqsConfigSpec.scala │ └── TelemetryUtilsSpec.scala └── stdout └── src ├── main ├── resources │ └── application.conf └── scala │ └── com.snowplowanalytics.snowplow.collector.stdout │ ├── PrintingSink.scala │ ├── SinkConfig.scala │ └── StdoutCollector.scala └── test └── scala └── com.snowplowanalytics.snowplow.collectors.scalastream └── sinks └── PrintingSinkSpec.scala /.github/workflows/cookieless.yml: -------------------------------------------------------------------------------- 1 | name: Test cookieless tracking 2 | 3 | on: push 4 | 5 | jobs: 6 | run_test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: coursier/cache-action@v3 11 | - name: Set up JDK 12 | uses: actions/setup-java@v1 13 | with: 14 | java-version: 11 15 | - name: Install sbt 16 | uses: sbt/setup-sbt@v1 17 | - name: Publish Docker image 18 | run: sbt 'project stdout; set Docker / version := "0.0.0"' docker:publishLocal 19 | - name: Run Docker image 20 | run: docker run -d -v "$PWD"/.github/workflows/ssc-collector-config:/snowplow/config -p 12345:12345 snowplow/scala-stream-collector-stdout:0.0.0 --config /snowplow/config/config.hocon 21 | - name: Allow time for collector to start 22 | run: sleep 30 23 | - name: Test non-anonymous tracking 24 | id: non-anonymous 25 | run: | 26 | output=$(curl -X POST -i http://0.0.0.0:12345/com.snowplowanalytics.snowplow/tp2 -d 
'{}' 2>&1 | grep -q 'Set-Cookie') 27 | echo "exit_code=$?" >> $GITHUB_OUTPUT 28 | - name: Test anonymous tracking 29 | id: anonymous 30 | if: ${{ steps.non-anonymous.outputs.exit_code == 0 }} 31 | run: | 32 | set +e 33 | output=$(curl -X POST -i http://0.0.0.0:12345/com.snowplowanalytics.snowplow/tp2 -H 'SP-Anonymous: *' -d '{}' 2>&1 | grep -q 'Set-Cookie') 34 | echo "exit_code=$?" >> $GITHUB_OUTPUT 35 | - name: Report outcome 36 | if: ${{ steps.non-anonymous.outputs.exit_code == 0 && steps.anonymous.outputs.exit_code == 1 }} 37 | run: echo "All tests successful!" 38 | - name: Stop Docker container 39 | run: docker stop $(docker ps -aq) 40 | -------------------------------------------------------------------------------- /.github/workflows/integration_tests/telemetry/micro_config/config.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | interface = "0.0.0.0" 3 | port = 9191 4 | ssl { 5 | enable = true 6 | redirect = false 7 | port = 9092 8 | } 9 | paths {} 10 | p3p { 11 | policyRef = "/w3c/p3p.xml" 12 | CP = "NOI DSP COR NID PSA OUR IND COM NAV STA" 13 | } 14 | crossDomain { 15 | enabled = false 16 | domains = [ "*" ] 17 | secure = true 18 | } 19 | cookie { 20 | enabled = true 21 | expiration = 365 days 22 | name = "mycookiename" 23 | domains = [ ] 24 | secure = false 25 | httpOnly = false 26 | } 27 | doNotTrackCookie { 28 | enabled = false 29 | name = mydntcname 30 | value = mydntcvalue 31 | } 32 | cookieBounce { 33 | enabled = false 34 | name = "n3pc" 35 | fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000" 36 | forwardedProtocolHeader = "X-Forwarded-Proto" 37 | } 38 | enableDefaultRedirect = false 39 | redirectMacro { 40 | enabled = false 41 | placeholder = "[TOKEN]" 42 | } 43 | rootResponse { 44 | enabled = false 45 | statusCode = 302 46 | headers = { 47 | Location = "https://127.0.0.1/", 48 | X-Custom = "something" 49 | } 50 | body = "302, redirecting" 51 | } 52 | cors { 53 | accessControlMaxAge = 5 seconds 54 | } 55 | prometheusMetrics { 56 | enabled = false 57 | } 58 | streams { 59 | good = mygood 60 | bad = mybad 61 | useIpAddressAsPartitionKey = false 62 | sink { 63 | enabled = stdout 64 | } 65 | buffer { 66 | byteLimit = 1 67 | recordLimit = 1 68 | timeLimit = 1 69 | } 70 | } 71 | } 72 | 73 | akka { 74 | loglevel = DEBUG 75 | loggers = ["akka.event.slf4j.Slf4jLogger"] 76 | http.server { 77 | remote-address-header = on 78 | raw-request-uri-header = on 79 | parsing { 80 | max-uri-length = 32768 81 | uri-parsing-mode = relaxed 82 | } 83 | } 84 | } 85 | 86 | -------------------------------------------------------------------------------- /.github/workflows/integration_tests/telemetry/micro_config/iglu.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema": "iglu:com.snowplowanalytics.iglu/resolver-config/jsonschema/1-0-1", 3 | "data": { 4 | "cacheSize": 500, 5 | "repositories": [ 6 | { 7 | "name": "Iglu Central", 8 | "priority": 1, 9 | "vendorPrefixes": [ "com.snowplowanalytics" ], 10 | "connection": { 11 | "http": { 12 | "uri": "http://iglucentral.com" 13 | } 14 | } 15 | } 16 | ] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /.github/workflows/integration_tests/telemetry/verify_micro_content.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import requests 3 | import unittest 4 | 5 | 6 | class TestTelemetry(unittest.TestCase): 7 | all = 
requests.request("GET", "http://127.0.0.1:9191/micro/all").json() 8 | # { 9 | # "total": 55, 10 | # "good": 55, 11 | # "bad": 0 12 | # } 13 | good = requests.request("GET", "http://127.0.0.1:9191/micro/good").json() 14 | # [ ...{ 15 | # "unstruct_event": { 16 | # "schema": "iglu:com.snowplowanalytics.snowplow/unstruct_event/jsonschema/1-0-0", 17 | # "data": 18 | # "schema": "iglu:com.snowplowanalytics.oss/oss_context/jsonschema/1-0-1", 19 | # "data": { 20 | # "userProvidedId": "userProvidedIdValue", 21 | # "moduleName": "moduleNameValue", 22 | # "moduleVersion": null, 23 | # "instanceId": null, 24 | # "region": null, 25 | # "cloud": null, 26 | # "applicationName": "snowplow-stream-collector-stdout", 27 | # "applicationVersion": "2.3.1", 28 | # "appGeneratedId": "00968dc0-26de-4378-abcf-00329c8020b6" 29 | # } 30 | # } 31 | # } 32 | event_data = [entry['event']['unstruct_event']['data'] for entry in good] 33 | 34 | def test_no_bad_events(self): 35 | self.assertEqual(self.all["bad"], 0) 36 | 37 | def test_frequency(self): 38 | # should be around 30 - 1 per second over 30 seconds 39 | self.assertGreater(self.all["good"], 20) 40 | 41 | def test_consistency(self): 42 | # Test that we got the same events. Comparing first to last. 43 | self.assertDictEqual(self.event_data[0], self.event_data[-1]) 44 | 45 | def test_no_events_when_disabled(self): 46 | # disabled collector should not send any events. 47 | self.assertFalse(any(e['data']['moduleName'] == 'Disabled' for e in self.event_data)) 48 | 49 | def test_collector_name_is_taken_from_build(self): 50 | # Version correctly taken from BuildInfo 51 | self.assertTrue( 52 | all(e['data']['applicationName'] == "snowplow-stream-collector-stdout" for e in self.event_data)) 53 | 54 | 55 | if __name__ == '__main__': 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /.github/workflows/ssc-collector-config/config.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = 0.0.0.0 4 | port = 12345 5 | 6 | paths { } 7 | 8 | p3p { 9 | policyRef = "/w3c/p3p.xml" 10 | CP = "NOI DSP COR NID PSA OUR IND COM NAV STA" 11 | } 12 | 13 | crossDomain { 14 | enabled = false 15 | domains = [ "*" ] 16 | secure = true 17 | } 18 | 19 | cookie { 20 | enabled = true 21 | expiration = "365 days" 22 | name = "sp" 23 | secure = false 24 | httpOnly = false 25 | } 26 | 27 | doNotTrackCookie { 28 | enabled = false 29 | name = "dnt" 30 | value = "dnt" 31 | } 32 | 33 | cookieBounce { 34 | enabled = false 35 | name = "n3pc" 36 | fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000" 37 | } 38 | 39 | redirectMacro { 40 | enabled = false 41 | } 42 | 43 | rootResponse { 44 | enabled = false 45 | statusCode = 302 46 | } 47 | 48 | cors { 49 | accessControlMaxAge = 5 seconds 50 | } 51 | 52 | streams { 53 | good = "good" 54 | bad = "bad" 55 | useIpAddressAsPartitionKey = true 56 | 57 | sink { 58 | enabled = "stdout" 59 | } 60 | 61 | buffer { 62 | byteLimit = 1024 63 | recordLimit = 1 64 | timeLimit = 30 65 | } 66 | } 67 | 68 | telemetry{ 69 | disabled = true 70 | } 71 | } 72 | 73 | akka { 74 | loglevel = DEBUG 75 | loggers = ["akka.event.slf4j.Slf4jLogger"] 76 | 77 | http.server { 78 | remote-address-header = on 79 | raw-request-uri-header = on 80 | 81 | parsing { 82 | max-uri-length = 32768 83 | uri-parsing-mode = relaxed 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- 
/.github/workflows/telemetryIntegTest.yml: -------------------------------------------------------------------------------- 1 | name: telemetryIntegTest 2 | 3 | on: push 4 | 5 | jobs: 6 | integ_test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: coursier/cache-action@v3 11 | - name: Set up JDK 12 | uses: actions/setup-java@v1 13 | with: 14 | java-version: 11 15 | - name: Install sbt 16 | uses: sbt/setup-sbt@v1 17 | - name: Set up python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: build 22 | run: | 23 | sbt "project stdout" assembly 24 | find $(pwd) -name "*.jar" 25 | - name: run micro 26 | run: > 27 | docker run -d 28 | --name micro 29 | --mount type=bind,source=$(pwd)/.github/workflows/integration_tests/telemetry/micro_config,destination=/config 30 | -p 9191:9191 snowplow/snowplow-micro:1.2.1 31 | --collector-config /config/config.hocon 32 | --iglu /config/iglu.json 33 | - name: run collectors 34 | run: | 35 | java -jar $(pwd)/stdout/target/scala-*/*.jar --config $(pwd)/.github/workflows/integration_tests/telemetry/sender_config/config.hocon & 36 | PID_C1=$! 37 | java -jar $(pwd)/stdout/target/scala-*/*.jar --config $(pwd)/.github/workflows/integration_tests/telemetry/sender_config/config_disabled.hocon & 38 | PID_C2=$! 39 | sleep 30 40 | kill $PID_C1 $PID_C2 41 | - name: assess result 42 | run: | 43 | pip install requests 44 | python3 .github/workflows/integration_tests/telemetry/verify_micro_content.py 45 | - name: clean up 46 | run: docker stop micro 47 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: push 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: coursier/cache-action@v3 11 | - name: Set up JDK 12 | uses: actions/setup-java@v1 13 | with: 14 | java-version: 11 15 | - name: Install sbt 16 | uses: sbt/setup-sbt@v1 17 | - name: Check formatting 18 | run: sbt scalafmtCheckAll 19 | - name: Run unit tests 20 | run: sbt +test 21 | - name: Run integration tests Kinesis 22 | run: sbt "project kinesisDistroless" IntegrationTest/test 23 | - name: Run integration tests PubSub 24 | run: sbt "project pubsubDistroless" IntegrationTest/test 25 | - name: Run integration tests Kafka 26 | run: sbt "project kafkaDistroless" IntegrationTest/test 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | project/target 2 | project/project 3 | target 4 | 5 | src/main/resources/application.conf 6 | 7 | .bsp 8 | -------------------------------------------------------------------------------- /.gitleaksignore: -------------------------------------------------------------------------------- 1 | examples/config.hocon.sample:hashicorp-tf-password:365 2 | .github/workflows/integration_tests/telemetry/sender_config/config_disabled.hocon:hashicorp-tf-password:234 3 | .github/workflows/integration_tests/telemetry/sender_config/config.hocon:hashicorp-tf-password:235 4 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 2.3.2 2 | style = default 3 | maxColumn = 120 4 | optIn.breakChainOnFirstMethodDot = false 5 | assumeStandardLibraryStripMargin = 
true 6 | align = most 7 | align.tokens.add = ["|", "!", "!!", "||", "=>", "=", "->", "<-", "|@|", "//", "/", "+", "%", "%%"] 8 | continuationIndent.defnSite = 2 9 | rewrite.rules = [ 10 | AsciiSortImports, 11 | AvoidInfix, 12 | PreferCurlyFors, 13 | RedundantBraces, 14 | RedundantParens, 15 | SortModifiers 16 | ] 17 | project.git = true 18 | includeNoParensInSelectChains = true -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # Snowplow Limited Use License Agreement 2 | 3 | _Version 1.1, November, 2024_ 4 | 5 | This Snowplow Limited Use License Agreement, Version 1.1 (the “Agreement”) sets forth the terms on which Snowplow Analytics, Ltd. (“Snowplow”) makes available certain software (the “Software”). BY INSTALLING, DOWNLOADING, ACCESSING, OR USING ANY OF THE SOFTWARE, YOU AGREE TO THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE TO SUCH TERMS AND CONDITIONS, YOU MUST NOT USE THE SOFTWARE. IF YOU ARE RECEIVING THE SOFTWARE ON BEHALF OF A LEGAL ENTITY, YOU REPRESENT AND WARRANT THAT YOU HAVE THE ACTUAL AUTHORITY TO AGREE TO THE TERMS AND CONDITIONS OF THIS AGREEMENT ON BEHALF OF SUCH ENTITY. “Licensee” means you, an individual, or the entity on whose behalf you are receiving the Software. 6 | 7 | ## 1. LICENSE GRANT AND CONDITIONS 8 | 9 | **1.1 License.** Subject to the terms and conditions of this Agreement, Snowplow hereby grants to Licensee a non-exclusive, royalty-free, worldwide, non-transferable, non-sublicensable license during the term of this Agreement to: (a) use the Software; (b) prepare modifications and derivative works of the Software; and (c) reproduce copies of the Software (the “License”). No right to distribute or make available the Software is granted under this License. Licensee is not granted the right to, and Licensee shall not, exercise the License for any Competing Use, and Licensee may exercise the License only for Non-Production Use or Non-Commercial Use. 10 | 11 | **1.2 Definitions.** For purposes of this Agreement: 12 | 13 | * **1.2.1** “Competing Use” is making available any on-premises or distributed software product, or any software-as-a-service, platform-as-a-service, infrastructure-as-a-service, or other similar online service, that competes with any products or services that Snowplow or any of its affiliates provides using the Software. 14 | 15 | * **1.2.2** “Non-Production Use” means any use of the Software to process test or synthetic data to evaluate the sufficiency of the Software for use by Licensee. 16 | 17 | * **1.2.3** “Non-Commercial Use” is only: (a) personal use for research, experiment, personal study, or hobby projects, without any anticipated commercial application, or (b) use for teaching purposes by lecturers of a school or university. 18 | 19 | **1.3 Conditions.** In consideration of the License, Licensee’s use of the Software is subject to the following conditions: 20 | 21 | * **a.** Licensee must cause any Software modified by Licensee to carry prominent notices stating that Licensee modified the Software. 22 | 23 | * **b.** On each Software copy, Licensee shall reproduce and not remove or alter all Snowplow or third party copyright or other proprietary notices contained in the Software, and Licensee must include the notice below on each copy. 
24 | 25 | ``` 26 | This software is made available by Snowplow Analytics, Ltd., 27 | under the terms of the Snowplow Limited Use License Agreement, Version 1.1 28 | located at https://docs.snowplow.io/limited-use-license-1.1 29 | BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 30 | OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 31 | ``` 32 | 33 | **1.4 Licensee Modifications.** Licensee may add its own copyright notices to modifications made by Licensee. 34 | 35 | **1.5 No Sublicensing.** The License does not include the right to sublicense the Software, however, each recipient to which Licensee provides the Software may exercise the Licenses so long as such recipient agrees to the terms and conditions of this Agreement. 36 | 37 | ## 2. TERM AND TERMINATION 38 | 39 | This Agreement will continue unless and until earlier terminated as set forth herein. If Licensee breaches any of its conditions or obligations under this Agreement, this Agreement will terminate automatically and the License will terminate automatically and permanently. 40 | 41 | ## 3. INTELLECTUAL PROPERTY 42 | 43 | As between the parties, Snowplow will retain all right, title, and interest in the Software, and all intellectual property rights therein. Snowplow hereby reserves all rights not expressly granted to Licensee in this Agreement. Snowplow hereby reserves all rights in its trademarks and service marks, and no licenses therein are granted in this Agreement. 44 | 45 | ## 4. DISCLAIMER 46 | 47 | SNOWPLOW HEREBY DISCLAIMS ANY AND ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, AND SPECIFICALLY DISCLAIMS ANY WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, WITH RESPECT TO THE SOFTWARE. 48 | 49 | ## 5. LIMITATION OF LIABILITY 50 | 51 | SNOWPLOW WILL NOT BE LIABLE FOR ANY DAMAGES OF ANY KIND, INCLUDING BUT NOT LIMITED TO LOST PROFITS OR ANY CONSEQUENTIAL, SPECIAL, INCIDENTAL, INDIRECT, OR DIRECT DAMAGES, HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ARISING OUT OF THIS AGREEMENT. THE FOREGOING SHALL APPLY TO THE EXTENT PERMITTED BY APPLICABLE LAW. 52 | 53 | ## 6. GENERAL 54 | 55 | **6.1 Governing Law.** This Agreement will be governed by and interpreted in accordance with the laws of the state of Delaware, without reference to its conflict of laws principles. If Licensee is located within the United States, all disputes arising out of this Agreement are subject to the exclusive jurisdiction of courts located in Delaware, USA. If Licensee is located outside of the United States, any dispute, controversy or claim arising out of or relating to this Agreement will be referred to and finally determined by arbitration in accordance with the JAMS International Arbitration Rules. The tribunal will consist of one arbitrator. The place of arbitration will be in the State of Delaware, USA. The language to be used in the arbitral proceedings will be English. Judgment upon the award rendered by the arbitrator may be entered in any court having jurisdiction thereof. 56 | 57 | **6.2. Assignment.** Licensee is not authorized to assign its rights under this Agreement to any third party. Snowplow may freely assign its rights under this Agreement to any third party. 58 | 59 | **6.3. Other.** This Agreement is the entire agreement between the parties regarding the subject matter hereof. 
No amendment or modification of this Agreement will be valid or binding upon the parties unless made in writing and signed by the duly authorized representatives of both parties. In the event that any provision, including without limitation any condition, of this Agreement is held to be unenforceable, this Agreement and all licenses and rights granted hereunder will immediately terminate. Waiver by Snowplow of a breach of any provision of this Agreement or the failure by Snowplow to exercise any right hereunder will not be construed as a waiver of any subsequent breach of that right or as a waiver of any other right. 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scala Stream Collector 2 | [![Build Status][build-image]][build-wf] 3 | [![Release][release-image]][releases] 4 | [![License][license-image]][license] 5 | 6 | 7 | ## Introduction 8 | 9 | Stream Collector receives raw [Snowplow][snowplow] events sent over HTTP by trackers or webhooks. It serializes them to a [Thrift][thrift] record format, and then writes them to one of the supported sinks like [Amazon Kinesis][kinesis], [Google PubSub][pubsub], [Apache Kafka][kafka], [Amazon SQS][sqs], [NSQ][nsq]. 10 | The Stream Collector supports cross-domain Snowplow deployments, setting a `user_id` (used to identify unique visitors) server side to reliably identify the same user across domains. 11 | 12 | ## Find out more 13 | 14 | | Technical Docs | Setup Guide | Contributing | 15 | |----------------------------|----------------------|------------------------------| 16 | | ![i1][techdocs-image] | ![i2][setup-image] | ![i4][contributing-image] | 17 | | [Technical Docs][techdocs] | [Setup Guide][setup] | [Contributing][contributing] | 18 | 19 | ## Copyright and license 20 | 21 | Copyright (c) 2023-present Snowplow Analytics Ltd. All rights reserved. 22 | 23 | Licensed under the [Snowplow Limited Use License Agreement][license]. 
_(If you are uncertain how it applies to your use case, check our answers to [frequently asked questions][faq].)_ 24 | 25 | [snowplow]: https://snowplow.io/ 26 | 27 | [thrift]: http://thrift.apache.org 28 | [kinesis]: http://aws.amazon.com/kinesis 29 | [pubsub]: https://cloud.google.com/pubsub/ 30 | [kafka]: http://kafka.apache.org 31 | [sqs]: https://aws.amazon.com/sqs/ 32 | [nsq]: http://nsq.io/ 33 | 34 | [techdocs-image]: https://d3i6fms1cm1j0i.cloudfront.net/github/images/techdocs.png 35 | [setup-image]: https://d3i6fms1cm1j0i.cloudfront.net/github/images/setup.png 36 | [contributing-image]: https://d3i6fms1cm1j0i.cloudfront.net/github/images/contributing.png 37 | 38 | [techdocs]: https://docs.snowplow.io/docs/pipeline-components-and-applications/stream-collector/ 39 | [setup]: https://docs.snowplow.io/docs/getting-started-on-community-edition/ 40 | [contributing]: https://docs.snowplow.io/docs/contributing/ 41 | 42 | [build-image]: https://github.com/snowplow/stream-collector/workflows/build/badge.svg 43 | [build-wf]: https://github.com/snowplow/stream-collector/actions?query=workflow%3Abuild 44 | 45 | [release-image]: https://img.shields.io/github/v/release/snowplow/stream-collector?sort=semver&style=flat 46 | [releases]: https://github.com/snowplow/stream-collector 47 | 48 | [license]: https://docs.snowplow.io/limited-use-license-1.1 49 | [license-image]: https://img.shields.io/badge/license-Snowplow--Limited--Use-blue.svg?style=flat 50 | 51 | [faq]: https://docs.snowplow.io/docs/contributing/limited-use-license-faq/ 52 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | 12 | lazy val root = project 13 | .in(file(".")) 14 | .aggregate(kinesis, pubsub, kafka, nsq, stdout, sqs, core) 15 | 16 | lazy val core = project 17 | .settings(moduleName := "snowplow-stream-collector-http4s-core") 18 | .settings(BuildSettings.coreHttp4sSettings) 19 | .settings( 20 | libraryDependencies ++= Seq( 21 | Dependencies.Libraries.http4sDsl, 22 | Dependencies.Libraries.http4sBlaze, 23 | Dependencies.Libraries.http4sClient, 24 | Dependencies.Libraries.log4cats, 25 | Dependencies.Libraries.thrift, 26 | Dependencies.Libraries.badRows, 27 | Dependencies.Libraries.collectorPayload, 28 | Dependencies.Libraries.slf4j, 29 | Dependencies.Libraries.decline, 30 | Dependencies.Libraries.circeGeneric, 31 | Dependencies.Libraries.circeConfig, 32 | Dependencies.Libraries.trackerCore, 33 | Dependencies.Libraries.emitterHttps, 34 | Dependencies.Libraries.datadogHttp4s, 35 | Dependencies.Libraries.datadogStatsd, 36 | Dependencies.Libraries.specs2, 37 | Dependencies.Libraries.specs2CE, 38 | Dependencies.Libraries.ceTestkit, 39 | Dependencies.Libraries.jnrPosix, 40 | Dependencies.Libraries.httpClient, 41 | 42 | //Integration tests 43 | Dependencies.Libraries.IntegrationTests.testcontainers, 44 | Dependencies.Libraries.IntegrationTests.http4sClient, 45 | Dependencies.Libraries.IntegrationTests.catsRetry 46 | 47 | ) 48 | ) 49 | .configs(IntegrationTest) 50 | 51 | lazy val kinesis = project 52 | .settings(BuildSettings.kinesisSettings) 53 | .enablePlugins(JavaAppPackaging, SnowplowDockerPlugin, BuildInfoPlugin) 54 | .dependsOn(core % "test->test;compile->compile;it->it") 55 | .configs(IntegrationTest) 56 | 57 | lazy val kinesisDistroless = project 58 | .in(file("distroless/kinesis")) 59 | .settings(sourceDirectory := (kinesis / sourceDirectory).value) 60 | .settings(BuildSettings.kinesisSettings) 61 | .enablePlugins(JavaAppPackaging, SnowplowDistrolessDockerPlugin, BuildInfoPlugin) 62 | .dependsOn(core % "test->test;compile->compile;it->it") 63 | .configs(IntegrationTest) 64 | 65 | lazy val sqs = project 66 | .settings(BuildSettings.sqsSettings) 67 | .enablePlugins(JavaAppPackaging, SnowplowDockerPlugin, BuildInfoPlugin) 68 | .dependsOn(core % "test->test;compile->compile") 69 | 70 | lazy val sqsDistroless = project 71 | .in(file("distroless/sqs")) 72 | .settings(sourceDirectory := (sqs / sourceDirectory).value) 73 | .settings(BuildSettings.sqsSettings) 74 | .enablePlugins(JavaAppPackaging, SnowplowDistrolessDockerPlugin, BuildInfoPlugin) 75 | .dependsOn(core % "test->test;compile->compile") 76 | 77 | lazy val pubsub = project 78 | .settings(BuildSettings.pubsubSettings) 79 | .enablePlugins(JavaAppPackaging, SnowplowDockerPlugin, BuildInfoPlugin) 80 | .dependsOn(core % "test->test;compile->compile;it->it") 81 | .configs(IntegrationTest) 82 | 83 | lazy val pubsubDistroless = project 84 | .in(file("distroless/pubsub")) 85 | .settings(sourceDirectory := (pubsub / sourceDirectory).value) 86 | .settings(BuildSettings.pubsubSettings) 87 | .enablePlugins(JavaAppPackaging, SnowplowDistrolessDockerPlugin, BuildInfoPlugin) 88 | .dependsOn(core % "test->test;compile->compile;it->it") 89 | .configs(IntegrationTest) 90 | 91 | lazy val kafka = project 92 | .settings(BuildSettings.kafkaSettings) 93 | .enablePlugins(JavaAppPackaging, SnowplowDockerPlugin, BuildInfoPlugin) 94 | .dependsOn(core % "test->test;compile->compile;it->it") 95 | .configs(IntegrationTest) 96 | 97 | lazy val kafkaDistroless = project 98 | .in(file("distroless/kafka")) 99 | .settings(sourceDirectory := (kafka / 
sourceDirectory).value) 100 | .settings(BuildSettings.kafkaSettings) 101 | .enablePlugins(JavaAppPackaging, SnowplowDistrolessDockerPlugin, BuildInfoPlugin) 102 | .dependsOn(core % "test->test;compile->compile;it->it") 103 | .configs(IntegrationTest) 104 | 105 | lazy val nsq = project 106 | .settings(BuildSettings.nsqSettings) 107 | .enablePlugins(JavaAppPackaging, SnowplowDockerPlugin, BuildInfoPlugin) 108 | .dependsOn(core % "test->test;compile->compile") 109 | 110 | lazy val nsqDistroless = project 111 | .in(file("distroless/nsq")) 112 | .settings(sourceDirectory := (nsq / sourceDirectory).value) 113 | .settings(BuildSettings.nsqSettings) 114 | .enablePlugins(JavaAppPackaging, SnowplowDistrolessDockerPlugin, BuildInfoPlugin) 115 | .dependsOn(core % "test->test;compile->compile") 116 | 117 | lazy val stdout = project 118 | .settings(BuildSettings.stdoutSettings) 119 | .enablePlugins(JavaAppPackaging, SnowplowDockerPlugin, BuildInfoPlugin) 120 | .dependsOn(core % "test->test;compile->compile") 121 | 122 | lazy val stdoutDistroless = project 123 | .in(file("distroless/stdout")) 124 | .settings(sourceDirectory := (stdout / sourceDirectory).value) 125 | .settings(BuildSettings.stdoutSettings) 126 | .enablePlugins(JavaAppPackaging, SnowplowDistrolessDockerPlugin, BuildInfoPlugin) 127 | .dependsOn(core % "test->test;compile->compile") -------------------------------------------------------------------------------- /core/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/CollectorContainer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it 12 | 13 | import org.testcontainers.containers.GenericContainer 14 | 15 | case class CollectorContainer( 16 | container: GenericContainer[_], 17 | host: String, 18 | port: Int 19 | ) 20 | -------------------------------------------------------------------------------- /core/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/CollectorOutput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it 12 | 13 | import com.snowplowanalytics.snowplow.badrows.BadRow 14 | 15 | import com.snowplowanalytics.snowplow.CollectorPayload.thrift.model1.CollectorPayload 16 | 17 | case class CollectorOutput( 18 | good: List[CollectorPayload], 19 | bad: List[BadRow] 20 | ) 21 | -------------------------------------------------------------------------------- /core/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/EventGenerator.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it 12 | 13 | import cats.effect.IO 14 | 15 | import org.http4s.{Method, Request, Uri} 16 | 17 | object EventGenerator { 18 | 19 | def sendEvents( 20 | collectorHost: String, 21 | collectorPort: Int, 22 | nbGood: Int, 23 | nbBad: Int, 24 | maxBytes: Int 25 | ): IO[Unit] = { 26 | val requests = generateEvents(collectorHost, collectorPort, nbGood, nbBad, maxBytes) 27 | Http.statuses(requests) 28 | .flatMap { responses => 29 | responses.collect { case resp if resp.code != 200 => resp.reason } match { 30 | case Nil => IO.unit 31 | case errors => IO.raiseError(new RuntimeException(s"${errors.size} requests were not successful. Example error: ${errors.head}")) 32 | } 33 | } 34 | } 35 | 36 | def generateEvents( 37 | collectorHost: String, 38 | collectorPort: Int, 39 | nbGood: Int, 40 | nbBad: Int, 41 | maxBytes: Int 42 | ): List[Request[IO]] = { 43 | val good = List.fill(nbGood)(mkTp2Event(collectorHost, collectorPort, valid = true, maxBytes)) 44 | val bad = List.fill(nbBad)(mkTp2Event(collectorHost, collectorPort, valid = false, maxBytes)) 45 | good ++ bad 46 | } 47 | 48 | def mkTp2Event( 49 | collectorHost: String, 50 | collectorPort: Int, 51 | valid: Boolean = true, 52 | maxBytes: Int = 100 53 | ): Request[IO] = { 54 | val uri = Uri.unsafeFromString(s"http://$collectorHost:$collectorPort/com.snowplowanalytics.snowplow/tp2") 55 | val body = if (valid) "foo" else "a" * (maxBytes + 1) 56 | Request[IO](Method.POST, uri).withEntity(body) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /core/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/Http.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it 12 | 13 | import cats.effect.{IO, Resource} 14 | import cats.implicits._ 15 | import org.http4s.blaze.client.BlazeClientBuilder 16 | import org.http4s.client.Client 17 | import org.http4s.{Request, Response, Status} 18 | 19 | object Http { 20 | 21 | def statuses(requests: List[Request[IO]]): IO[List[Status]] = 22 | mkClient.use { client => requests.traverse(client.status) } 23 | 24 | def status(request: Request[IO]): IO[Status] = 25 | mkClient.use { client => client.status(request) } 26 | 27 | def response(request: Request[IO]): IO[Response[IO]] = 28 | mkClient.use(c => c.run(request).use(resp => IO.pure(resp))) 29 | 30 | def responses(requests: List[Request[IO]]): IO[List[Response[IO]]] = 31 | mkClient.use(c => requests.traverse(r => c.run(r).use(resp => IO.pure(resp)))) 32 | 33 | def mkClient: Resource[IO, Client[IO]] = 34 | BlazeClientBuilder.apply[IO].resource 35 | } 36 | -------------------------------------------------------------------------------- /core/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/utils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it 12 | 13 | import scala.concurrent.duration._ 14 | 15 | import org.apache.thrift.TDeserializer 16 | 17 | import org.slf4j.LoggerFactory 18 | 19 | import org.testcontainers.containers.GenericContainer 20 | import org.testcontainers.containers.output.Slf4jLogConsumer 21 | 22 | import io.circe.parser 23 | 24 | import cats.implicits._ 25 | 26 | import cats.effect.IO 27 | 28 | import retry.syntax.all._ 29 | import retry.RetryPolicies 30 | 31 | import com.snowplowanalytics.snowplow.badrows.BadRow 32 | 33 | import com.snowplowanalytics.iglu.core.SelfDescribingData 34 | import com.snowplowanalytics.iglu.core.circe.implicits._ 35 | 36 | import com.snowplowanalytics.snowplow.CollectorPayload.thrift.model1.CollectorPayload 37 | 38 | object utils { 39 | 40 | def parseCollectorPayload(bytes: Array[Byte]): CollectorPayload = { 41 | val deserializer = new TDeserializer() 42 | val target = new CollectorPayload() 43 | deserializer.deserialize(target, bytes) 44 | target 45 | } 46 | 47 | def parseBadRow(bytes: Array[Byte]): BadRow = { 48 | val str = new String(bytes) 49 | val parsed = for { 50 | json <- parser.parse(str).leftMap(_.message) 51 | sdj <- SelfDescribingData.parse(json).leftMap(_.message("Can't decode JSON as SDJ")) 52 | br <- sdj.data.as[BadRow].leftMap(_.getMessage()) 53 | } yield br 54 | parsed match { 55 | case Right(br) => br 56 | case Left(err) => throw new RuntimeException(s"Can't parse bad row. 
Error: $err") 57 | } 58 | } 59 | 60 | def printBadRows(testName: String, badRows: List[BadRow]): IO[Unit] = { 61 | log(testName, "Bad rows:") *> 62 | badRows.traverse_(br => log(testName, br.compact)) 63 | } 64 | 65 | def log(testName: String, line: String): IO[Unit] = 66 | IO(println(s"[$testName] $line")) 67 | 68 | def startContainerWithLogs( 69 | container: GenericContainer[_], 70 | loggerName: String 71 | ): GenericContainer[_] = { 72 | container.start() 73 | val logger = LoggerFactory.getLogger(loggerName) 74 | val logs = new Slf4jLogConsumer(logger) 75 | container.followOutput(logs) 76 | container 77 | } 78 | 79 | def waitWhile[A]( 80 | a: A, 81 | condition: A => Boolean, 82 | maxDelay: FiniteDuration 83 | ): IO[Boolean] = { 84 | val retryPolicy = RetryPolicies.limitRetriesByCumulativeDelay( 85 | maxDelay, 86 | RetryPolicies.capDelay[IO]( 87 | 2.second, 88 | RetryPolicies.fullJitter[IO](1.second) 89 | ) 90 | ) 91 | 92 | IO(condition(a)).retryingOnFailures( 93 | result => IO(!result), 94 | retryPolicy, 95 | (_, _) => IO.unit 96 | ) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /core/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | { 2 | license { 3 | accept = false 4 | accept = ${?ACCEPT_LIMITED_USE_LICENSE} 5 | } 6 | 7 | paths {} 8 | 9 | p3p { 10 | policyRef = "/w3c/p3p.xml" 11 | CP = "NOI DSP COR NID PSA OUR IND COM NAV STA" 12 | } 13 | 14 | crossDomain { 15 | enabled = false 16 | domains = [ "*" ] 17 | secure = true 18 | } 19 | 20 | cookie { 21 | enabled = true 22 | expiration = 365 days 23 | domains = [] 24 | name = sp 25 | secure = true 26 | httpOnly = true 27 | sameSite = "None" 28 | } 29 | 30 | doNotTrackCookie { 31 | enabled = false 32 | name = "" 33 | value = "" 34 | } 35 | 36 | cookieBounce { 37 | enabled = false 38 | name = "n3pc" 39 | fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000" 40 | } 41 | 42 | redirectMacro { 43 | enabled = false 44 | } 45 | 46 | rootResponse { 47 | enabled = false 48 | statusCode = 302 49 | headers = {} 50 | body = "" 51 | } 52 | 53 | cors { 54 | accessControlMaxAge = 60 minutes 55 | } 56 | 57 | streams { 58 | useIpAddressAsPartitionKey = false 59 | } 60 | 61 | telemetry { 62 | disable = false 63 | interval = 60 minutes 64 | method = POST 65 | url = telemetry-g.snowplowanalytics.com 66 | port = 443 67 | secure = true 68 | } 69 | 70 | monitoring { 71 | metrics { 72 | statsd { 73 | enabled = false 74 | hostname = localhost 75 | port = 8125 76 | period = 10 seconds 77 | prefix = snowplow.collector 78 | tags = { } 79 | } 80 | } 81 | } 82 | 83 | ssl { 84 | enable = false 85 | redirect = false 86 | port = 443 87 | } 88 | 89 | hsts { 90 | enable = false 91 | maxAge = 365 days 92 | } 93 | 94 | networking { 95 | maxConnections = 1024 96 | idleTimeout = 610 seconds 97 | responseHeaderTimeout = 30 seconds 98 | maxRequestLineLength = 20480 99 | maxHeadersLength = 40960 100 | maxPayloadSize = 1048576 # 1MB 101 | dropPayloadSize = 2097152 # 2MB 102 | } 103 | 104 | enableDefaultRedirect = false 105 | preTerminationPeriod = 10 seconds 106 | 107 | redirectDomains = [] 108 | 109 | preTerminationPeriod = 10 seconds 110 | } 111 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/App.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 
3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | import scala.concurrent.duration._ 14 | 15 | import cats.effect.{ExitCode, IO} 16 | import cats.effect.kernel.Resource 17 | import cats.effect.metrics.CpuStarvationWarningMetrics 18 | 19 | import com.monovore.decline.effect.CommandIOApp 20 | import com.monovore.decline.Opts 21 | 22 | import io.circe.Decoder 23 | 24 | import org.typelevel.log4cats.Logger 25 | import org.typelevel.log4cats.slf4j.Slf4jLogger 26 | 27 | import com.snowplowanalytics.snowplow.scalatracker.emitters.http4s.ceTracking 28 | 29 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 30 | 31 | abstract class App[SinkConfig: Decoder](appInfo: AppInfo) 32 | extends CommandIOApp( 33 | name = App.helpCommand(appInfo), 34 | header = "Snowplow application that collects tracking events", 35 | version = appInfo.version 36 | ) { 37 | 38 | implicit private val logger: Logger[IO] = Slf4jLogger.getLogger[IO] 39 | 40 | override def runtimeConfig = 41 | super.runtimeConfig.copy(cpuStarvationCheckInterval = 10.seconds) 42 | 43 | override def onCpuStarvationWarn(metrics: CpuStarvationWarningMetrics): IO[Unit] = 44 | Logger[IO].debug( 45 | s"Cats Effect measured responsiveness in excess of ${metrics.starvationInterval * metrics.starvationThreshold}" 46 | ) 47 | 48 | def mkSinks(config: Config.Streams[SinkConfig]): Resource[IO, Sinks[IO]] 49 | 50 | def telemetryInfo(config: Config.Streams[SinkConfig]): IO[Telemetry.TelemetryInfo] 51 | 52 | final def main: Opts[IO[ExitCode]] = Run.fromCli[IO, SinkConfig](appInfo, mkSinks, telemetryInfo) 53 | } 54 | 55 | object App { 56 | private def helpCommand(appInfo: AppInfo) = s"docker run ${appInfo.dockerAlias}" 57 | } 58 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/AppInfo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | trait AppInfo { 14 | def name: String 15 | def moduleName: String 16 | def version: String 17 | def dockerAlias: String 18 | def shortName: String 19 | } 20 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/ConfigParser.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | import java.nio.file.{Files, Path} 14 | import org.typelevel.log4cats.Logger 15 | import org.typelevel.log4cats.slf4j.Slf4jLogger 16 | import com.typesafe.config.{ConfigFactory, Config => TypesafeConfig} 17 | import io.circe.Decoder 18 | import io.circe.config.syntax.CirceConfigOps 19 | import cats.implicits._ 20 | import cats.data.EitherT 21 | import cats.effect.{ExitCode, Sync} 22 | 23 | import scala.jdk.CollectionConverters._ 24 | 25 | object ConfigParser { 26 | 27 | implicit private def logger[F[_]: Sync]: Logger[F] = Slf4jLogger.getLogger[F] 28 | 29 | def fromPath[F[_]: Sync, SinkConfig: Decoder]( 30 | configPath: Option[Path] 31 | ): EitherT[F, ExitCode, Config[SinkConfig]] = { 32 | val eitherT = configPath match { 33 | case Some(path) => 34 | for { 35 | text <- EitherT(readTextFrom[F](path)) 36 | hocon <- EitherT.fromEither[F](hoconFromString(text)) 37 | result <- EitherT.fromEither[F](resolve[Config[SinkConfig]](hocon)) 38 | } yield result 39 | case None => 40 | EitherT.fromEither[F]( 41 | for { 42 | config <- Either 43 | .catchNonFatal(namespaced(ConfigFactory.load())) 44 | .leftMap(e => s"Error loading the configuration (without config file): ${e.getMessage}") 45 | parsed <- config.as[Config[SinkConfig]].leftMap(_.show) 46 | } yield parsed 47 | ) 48 | } 49 | 50 | eitherT.leftSemiflatMap { str => 51 | Logger[F].error(str).as(ExitCode.Error) 52 | } 53 | } 54 | 55 | private def readTextFrom[F[_]: Sync](path: Path): F[Either[String, String]] = 56 | Sync[F].blocking { 57 | Either 58 | .catchNonFatal(Files.readAllLines(path).asScala.mkString("\n")) 59 | .leftMap(e => s"Error reading ${path.toAbsolutePath} file from filesystem: ${e.getMessage}") 60 | } 61 | 62 | private def hoconFromString(str: String): Either[String, TypesafeConfig] = 63 | Either.catchNonFatal(ConfigFactory.parseString(str)).leftMap(_.getMessage) 64 | 65 | private def resolve[A: Decoder](hocon: TypesafeConfig): Either[String, A] = { 66 | val either = for { 67 | resolved <- Either.catchNonFatal(hocon.resolve()).leftMap(_.getMessage) 68 | resolved <- Either.catchNonFatal(loadAll(resolved)).leftMap(_.getMessage) 69 | parsed <- resolved.as[A].leftMap(_.show) 70 | } yield parsed 71 | either.leftMap(e => s"Cannot resolve config: $e") 72 | } 73 | 74 | private def loadAll(config: TypesafeConfig): TypesafeConfig = 75 | namespaced(ConfigFactory.load(namespaced(config.withFallback(namespaced(ConfigFactory.load()))))) 76 | 77 | private def namespaced(config: TypesafeConfig): TypesafeConfig = { 78 | val namespace = "collector" 79 | if (config.hasPath(namespace)) 80 | config.getConfig(namespace).withFallback(config.withoutPath(namespace)) 81 | else 82 | config 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/HttpServer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | import cats.effect.{Async, Resource} 14 | import cats.implicits._ 15 | import com.avast.datadog4s.api.Tag 16 | import com.avast.datadog4s.extension.http4s.DatadogMetricsOps 17 | import com.avast.datadog4s.{StatsDMetricFactory, StatsDMetricFactoryConfig} 18 | import org.http4s.{HttpApp, HttpRoutes} 19 | import org.http4s.blaze.server.BlazeServerBuilder 20 | import org.http4s.headers.`Strict-Transport-Security` 21 | import org.http4s.server.Server 22 | import org.http4s.server.middleware.{EntityLimiter, HSTS, Metrics, Timeout} 23 | import org.typelevel.log4cats.Logger 24 | import org.typelevel.log4cats.slf4j.Slf4jLogger 25 | 26 | import java.net.InetSocketAddress 27 | import javax.net.ssl.SSLContext 28 | import org.http4s.Response 29 | import org.http4s.Status 30 | 31 | object HttpServer { 32 | 33 | implicit private def logger[F[_]: Async]: Logger[F] = Slf4jLogger.getLogger[F] 34 | 35 | def build[F[_]: Async]( 36 | routes: HttpRoutes[F], 37 | healthRoutes: HttpRoutes[F], 38 | port: Int, 39 | secure: Boolean, 40 | hsts: Config.HSTS, 41 | networking: Config.Networking, 42 | metricsConfig: Config.Metrics 43 | )( 44 | mkServer: ((HttpApp[F], Int, Boolean, Config.Networking) => Resource[F, Server]) 45 | ): Resource[F, Server] = 46 | for { 47 | withMetricsMiddleware <- createMetricsMiddleware(routes, metricsConfig) 48 | httpApp <- Resource.pure(httpApp(withMetricsMiddleware, healthRoutes, hsts, networking)) 49 | server <- mkServer(httpApp, port, secure, networking) 50 | } yield server 51 | 52 | def buildBlazeServer[F[_]: Async]( 53 | httpApp: HttpApp[F], 54 | port: Int, 55 | secure: Boolean, 56 | networking: Config.Networking 57 | ): Resource[F, Server] = 58 | Resource.eval(Logger[F].info("Building blaze server")) >> 59 | BlazeServerBuilder[F] 60 | .bindSocketAddress(new InetSocketAddress(port)) 61 | .withHttpApp(httpApp) 62 | .withIdleTimeout(networking.idleTimeout) 63 | .withMaxConnections(networking.maxConnections) 64 | .withResponseHeaderTimeout(networking.responseHeaderTimeout) 65 | .withLengthLimits( 66 | maxRequestLineLen = networking.maxRequestLineLength, 67 | maxHeadersLen = networking.maxHeadersLength 68 | ) 69 | .cond(secure, _.withSslContext(SSLContext.getDefault)) 70 | .resource 71 | 72 | def httpApp[F[_]: Async]( 73 | routes: HttpRoutes[F], 74 | healthRoutes: HttpRoutes[F], 75 | hsts: Config.HSTS, 76 | networking: Config.Networking 77 | ): HttpApp[F] = hstsApp( 78 | hsts, 79 | timeoutMiddleware(entityLimiter(routes, networking.dropPayloadSize), networking) <+> healthRoutes 80 | ) 81 | 82 | private def createMetricsMiddleware[F[_]: Async]( 83 | routes: HttpRoutes[F], 84 | metricsConfig: Config.Metrics 85 | ): Resource[F, HttpRoutes[F]] = 86 | if (metricsConfig.statsd.enabled) { 87 | val metricsFactory = StatsDMetricFactory.make(createStatsdConfig(metricsConfig)) 88 | metricsFactory.evalMap(DatadogMetricsOps.builder[F](_).useDistributionBasedTimers().build()).map { metricsOps => 89 | Metrics[F](metricsOps)(routes) 90 | } 91 | } else { 92 | Resource.pure(routes) 93 | } 94 | 95 | private def createStatsdConfig(metricsConfig: Config.Metrics): 
StatsDMetricFactoryConfig = { 96 | val server = InetSocketAddress.createUnresolved(metricsConfig.statsd.hostname, metricsConfig.statsd.port) 97 | val tags = metricsConfig.statsd.tags.toVector.map { case (name, value) => Tag.of(name, value) } 98 | StatsDMetricFactoryConfig(Some(metricsConfig.statsd.prefix), server, defaultTags = tags) 99 | } 100 | 101 | private[core] def hstsApp[F[_]: Async](hsts: Config.HSTS, routes: HttpRoutes[F]): HttpApp[F] = 102 | if (hsts.enable) 103 | HSTS(routes.orNotFound, `Strict-Transport-Security`.unsafeFromDuration(hsts.maxAge)) 104 | else routes.orNotFound 105 | 106 | private def entityLimiter[F[_]: Async](routes: HttpRoutes[F], dropPayloadSize: Long): HttpRoutes[F] = 107 | EntityLimiter.httpRoutes[F](routes, dropPayloadSize).recover { 108 | case _: EntityLimiter.EntityTooLarge => 109 | Response[F](Status.PayloadTooLarge) 110 | } 111 | 112 | private def timeoutMiddleware[F[_]: Async](routes: HttpRoutes[F], networking: Config.Networking): HttpRoutes[F] = 113 | Timeout.httpRoutes[F](timeout = networking.responseHeaderTimeout)(routes).map { 114 | case Response(Status.ServiceUnavailable, httpVersion, headers, body, attributes) => 115 | Response[F](Status.RequestTimeout, httpVersion, headers, body, attributes) 116 | case response => response 117 | } 118 | 119 | implicit class ConditionalAction[A](item: A) { 120 | def cond(cond: Boolean, action: A => A): A = 121 | if (cond) action(item) else item 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/Rfc6265Cookie.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | object Rfc6265Cookie { 14 | 15 | // See https://www.ietf.org/rfc/rfc6265.txt 16 | private val allowedChars = Set(0x21.toChar) ++ 17 | Set(0x23.toChar to 0x2b.toChar: _*) ++ 18 | Set(0x2d.toChar to 0x3a.toChar: _*) ++ 19 | Set(0x3c.toChar to 0x5b.toChar: _*) ++ 20 | Set(0x5d.toChar to 0x7e.toChar: _*) 21 | 22 | // Remove all the sub-parts (between two ';') that contain unauthorized characters 23 | def parse(rawCookie: String): Option[String] = 24 | rawCookie.replaceAll(" ", "").split(";").filter(_.forall(allowedChars.contains)).mkString(";") match { 25 | case s if s.nonEmpty => Some(s) 26 | case _ => None 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/Routes.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | import cats.implicits._ 14 | import cats.effect.{Async, Sync} 15 | import org.http4s._ 16 | import org.http4s.dsl.Http4sDsl 17 | import org.http4s.implicits._ 18 | import com.comcast.ip4s.Dns 19 | 20 | class Routes[F[_]: Async]( 21 | enableDefaultRedirect: Boolean, 22 | enableRootResponse: Boolean, 23 | enableCrossdomainTracking: Boolean, 24 | service: IService[F] 25 | ) extends Http4sDsl[F] { 26 | 27 | implicit val dns: Dns[F] = Dns.forSync[F] 28 | 29 | private val corsRoute = HttpRoutes.of[F] { 30 | case req @ OPTIONS -> _ => 31 | service.preflightResponse(req) 32 | } 33 | 34 | private val cookieRoutes = HttpRoutes.of[F] { 35 | case req @ POST -> Root / vendor / version => 36 | val path = service.determinePath(vendor, version) 37 | service.cookie( 38 | body = req.bodyText.compile.string.map(Some(_)), 39 | path = path, 40 | request = req, 41 | pixelExpected = false, 42 | contentType = req.contentType.map(_.value.toLowerCase) 43 | ) 44 | 45 | case req @ (GET | HEAD) -> Root / vendor / version => 46 | val path = service.determinePath(vendor, version) 47 | service.cookie( 48 | body = Sync[F].pure(None), 49 | path = path, 50 | request = req, 51 | pixelExpected = true, 52 | contentType = None 53 | ) 54 | 55 | case req @ (GET | HEAD) -> Root / ("ice.png" | "i") => 56 | service.cookie( 57 | body = Sync[F].pure(None), 58 | path = req.pathInfo.renderString, 59 | request = req, 60 | pixelExpected = true, 61 | contentType = None 62 | ) 63 | } 64 | 65 | def rejectRedirect = HttpRoutes.of[F] { 66 | case _ -> Root / "r" / _ => 67 | NotFound("redirects disabled") 68 | } 69 | 70 | private val rootRoute = HttpRoutes.of[F] { 71 | case GET -> Root if enableRootResponse => 72 | service.rootResponse 73 | } 74 | 75 | private val crossdomainRoute = HttpRoutes.of[F] { 76 | case GET -> Root / "crossdomain.xml" if enableCrossdomainTracking => 77 | service.crossdomainResponse 78 | } 79 | 80 | val health = HttpRoutes.of[F] { 81 | case GET -> Root / "health" => 82 | Ok("ok") 83 | case GET -> Root / "sink-health" => 84 | service 85 | .sinksHealthy 86 | .ifM( 87 | ifTrue = Ok("ok"), 88 | ifFalse = ServiceUnavailable("Service Unavailable") 89 | ) 90 | case GET -> Root / "robots.txt" => 91 | Ok("User-agent: *\nDisallow: /\n\nUser-agent: Googlebot\nDisallow: /\n\nUser-agent: AdsBot-Google\nDisallow: /") 92 | } 93 | 94 | val value: HttpRoutes[F] = { 95 | val routes = corsRoute <+> cookieRoutes <+> rootRoute <+> crossdomainRoute 96 | if (enableDefaultRedirect) routes else rejectRedirect <+> routes 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/Run.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | import java.nio.file.Path 14 | 15 | import org.typelevel.log4cats.Logger 16 | import org.typelevel.log4cats.slf4j.Slf4jLogger 17 | 18 | import scala.concurrent.duration.FiniteDuration 19 | 20 | import cats.implicits._ 21 | import cats.data.EitherT 22 | 23 | import cats.effect.{Async, ExitCode, Sync} 24 | import cats.effect.kernel.Resource 25 | 26 | import org.http4s.blaze.client.BlazeClientBuilder 27 | 28 | import com.monovore.decline.Opts 29 | 30 | import io.circe.Decoder 31 | 32 | import com.snowplowanalytics.snowplow.scalatracker.Tracking 33 | 34 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 35 | 36 | object Run { 37 | 38 | type MkSinks[F[_], SinkConfig] = Config.Streams[SinkConfig] => Resource[F, Sinks[F]] 39 | 40 | type TelemetryInfo[F[_], SinkConfig] = Config.Streams[SinkConfig] => F[Telemetry.TelemetryInfo] 41 | 42 | implicit private def logger[F[_]: Sync]: Logger[F] = Slf4jLogger.getLogger[F] 43 | 44 | def fromCli[F[_]: Async: Tracking, SinkConfig: Decoder]( 45 | appInfo: AppInfo, 46 | mkSinks: MkSinks[F, SinkConfig], 47 | telemetryInfo: TelemetryInfo[F, SinkConfig] 48 | ): Opts[F[ExitCode]] = { 49 | val configPath = Opts.option[Path]("config", "Path to HOCON configuration (optional)", "c", "config.hocon").orNone 50 | configPath.map(fromPath[F, SinkConfig](appInfo, mkSinks, telemetryInfo, _)) 51 | } 52 | 53 | private def fromPath[F[_]: Async: Tracking, SinkConfig: Decoder]( 54 | appInfo: AppInfo, 55 | mkSinks: MkSinks[F, SinkConfig], 56 | telemetryInfo: TelemetryInfo[F, SinkConfig], 57 | path: Option[Path] 58 | ): F[ExitCode] = { 59 | val eitherT = for { 60 | config <- ConfigParser.fromPath[F, SinkConfig](path) 61 | _ <- checkLicense(config.license.accept) 62 | _ <- EitherT.right[ExitCode](fromConfig(appInfo, mkSinks, telemetryInfo, config)) 63 | } yield ExitCode.Success 64 | 65 | eitherT.merge.handleErrorWith { e => 66 | Logger[F].error(e)("Exiting") >> 67 | prettyLogException(e).as(ExitCode.Error) 68 | } 69 | } 70 | 71 | private def checkLicense[F[_]: Sync](acceptLicense: Boolean): EitherT[F, ExitCode, _] = 72 | EitherT.liftF { 73 | if (acceptLicense) 74 | Sync[F].unit 75 | else 76 | Sync[F].raiseError( 77 | new IllegalStateException( 78 | "Please accept the terms of the Snowplow Limited Use License Agreement to proceed. See https://docs.snowplow.io/docs/pipeline-components-and-applications/stream-collector/configure/#license for more information on the license and how to configure this." 
79 | ) 80 | ) 81 | } 82 | 83 | private def fromConfig[F[_]: Async: Tracking, SinkConfig]( 84 | appInfo: AppInfo, 85 | mkSinks: MkSinks[F, SinkConfig], 86 | telemetryInfo: TelemetryInfo[F, SinkConfig], 87 | config: Config[SinkConfig] 88 | ): F[ExitCode] = { 89 | val resources = for { 90 | sinks <- mkSinks(config.streams) 91 | collectorService = new Service[F]( 92 | config, 93 | Sinks(sinks.good, sinks.bad), 94 | appInfo 95 | ) 96 | routes = new Routes[F]( 97 | config.enableDefaultRedirect, 98 | config.rootResponse.enabled, 99 | config.crossDomain.enabled, 100 | collectorService 101 | ) 102 | httpServer = HttpServer.build[F]( 103 | routes.value, 104 | routes.health, 105 | if (config.ssl.enable) config.ssl.port else config.port, 106 | config.ssl.enable, 107 | config.hsts, 108 | config.networking, 109 | config.monitoring.metrics 110 | )(HttpServer.buildBlazeServer) 111 | _ <- withGracefulShutdown(config.preTerminationPeriod)(httpServer) 112 | httpClient <- BlazeClientBuilder[F].resource 113 | } yield httpClient 114 | 115 | resources.use { httpClient => 116 | val appId = java.util.UUID.randomUUID.toString 117 | Telemetry 118 | .run(config.telemetry, httpClient, appInfo, appId, telemetryInfo(config.streams)) 119 | .compile 120 | .drain 121 | .flatMap(_ => Async[F].never[ExitCode]) 122 | } 123 | } 124 | 125 | private def prettyLogException[F[_]: Sync](e: Throwable): F[Unit] = { 126 | 127 | def logCause(e: Throwable): F[Unit] = 128 | Option(e.getCause) match { 129 | case Some(e) => Logger[F].error(s"caused by: ${e.getMessage}") >> logCause(e) 130 | case None => Sync[F].unit 131 | } 132 | 133 | Logger[F].error(e.getMessage) >> logCause(e) 134 | } 135 | 136 | private def withGracefulShutdown[F[_]: Async, A](delay: FiniteDuration)(resource: Resource[F, A]): Resource[F, A] = 137 | for { 138 | a <- resource 139 | _ <- Resource.onFinalizeCase { 140 | case Resource.ExitCase.Canceled => 141 | Logger[F].warn(s"Shutdown interrupted. Will continue to serve requests for $delay") >> 142 | Async[F].sleep(delay) 143 | case _ => 144 | Async[F].unit 145 | } 146 | } yield a 147 | } 148 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/Sink.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | trait Sink[F[_]] { 14 | 15 | // Maximum number of bytes that a single record can contain. 16 | // If a record is bigger, a size violation bad row is emitted instead 17 | val maxBytes: Int 18 | 19 | def isHealthy: F[Boolean] 20 | def storeRawEvents(events: List[Array[Byte]], key: String): F[Unit] 21 | } 22 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/Telemetry.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | import org.typelevel.log4cats.Logger 14 | import org.typelevel.log4cats.slf4j.Slf4jLogger 15 | 16 | import org.apache.commons.codec.digest.DigestUtils 17 | 18 | import cats.data.NonEmptyList 19 | import cats.implicits._ 20 | 21 | import cats.effect.{Async, Resource, Sync} 22 | import cats.effect.std.Random 23 | 24 | import fs2.Stream 25 | 26 | import org.http4s.client.{Client => HttpClient} 27 | 28 | import _root_.io.circe.Json 29 | import _root_.io.circe.syntax._ 30 | 31 | import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer, SelfDescribingData} 32 | 33 | import com.snowplowanalytics.snowplow.scalatracker.{Tracker, Tracking} 34 | import com.snowplowanalytics.snowplow.scalatracker.Emitter._ 35 | import com.snowplowanalytics.snowplow.scalatracker.Emitter.{Result => TrackerResult} 36 | import com.snowplowanalytics.snowplow.scalatracker.emitters.http4s.Http4sEmitter 37 | 38 | object Telemetry { 39 | 40 | implicit private def unsafeLogger[F[_]: Sync]: Logger[F] = 41 | Slf4jLogger.getLogger[F] 42 | 43 | def run[F[_]: Async: Tracking]( 44 | telemetryConfig: Config.Telemetry, 45 | httpClient: HttpClient[F], 46 | appInfo: AppInfo, 47 | appId: String, 48 | telemetryInfoF: F[TelemetryInfo] 49 | ): Stream[F, Unit] = 50 | if (telemetryConfig.disable) 51 | Stream.empty.covary[F] 52 | else 53 | for { 54 | telemetryInfo <- Stream.eval(telemetryInfoF) 55 | sdj = makeHeartbeatEvent(telemetryConfig, appInfo, appId, telemetryInfo) 56 | tracker <- Stream.resource(initTracker(telemetryConfig, appInfo.moduleName, httpClient)) 57 | _ <- Stream.fixedDelay[F](telemetryConfig.interval).evalMap { _ => 58 | tracker.trackSelfDescribingEvent(unstructEvent = sdj) >> tracker.flushEmitters() 59 | } 60 | } yield () 61 | 62 | private def initTracker[F[_]: Async: Tracking]( 63 | config: Config.Telemetry, 64 | appName: String, 65 | client: HttpClient[F] 66 | ): Resource[F, Tracker[F]] = 67 | for { 68 | random <- Resource.eval(Random.scalaUtilRandom[F]) 69 | emitter <- { 70 | implicit val r: Random[F] = random 71 | Http4sEmitter.build( 72 | EndpointParams(config.url, port = Some(config.port), https = config.secure), 73 | client, 74 | retryPolicy = RetryPolicy.MaxAttempts(10), 75 | callback = Some(emitterCallback[F] _) 76 | ) 77 | } 78 | } yield new Tracker(NonEmptyList.of(emitter), "tracker-telemetry", appName) 79 | 80 | private def emitterCallback[F[_]: Sync]( 81 | params: EndpointParams, 82 | req: Request, 83 | res: TrackerResult 84 | ): F[Unit] = 85 | res match { 86 | case TrackerResult.Success(_) => 87 | Logger[F].debug(s"Telemetry heartbeat successfully sent to ${params.getGetUri}") 88 | case TrackerResult.Failure(code) => 89 | Logger[F].warn(s"Sending telemetry heartbeat got an unexpected HTTP code $code from ${params.getUri}") 90 | case TrackerResult.TrackerFailure(exception) => 91 | Logger[F].warn( 92 | s"Telemetry heartbeat failed to reach ${params.getUri} with the following exception $exception after ${req.attempt} attempts" 93 | ) 94 | case TrackerResult.RetriesExceeded(failure) => 95 | Logger[F].error(s"Stopped trying to send telemetry heartbeat after the following failure:
$failure") 96 | } 97 | 98 | private def makeHeartbeatEvent( 99 | teleCfg: Config.Telemetry, 100 | appInfo: AppInfo, 101 | appId: String, 102 | telemetryInfo: TelemetryInfo 103 | ): SelfDescribingData[Json] = 104 | SelfDescribingData( 105 | SchemaKey("com.snowplowanalytics.oss", "oss_context", "jsonschema", SchemaVer.Full(1, 0, 2)), 106 | Json.obj( 107 | "userProvidedId" -> teleCfg.userProvidedId.asJson, 108 | "autoGeneratedId" -> teleCfg.autoGeneratedId.asJson, 109 | "moduleName" -> teleCfg.moduleName.asJson, 110 | "moduleVersion" -> teleCfg.moduleVersion.asJson, 111 | "instanceId" -> teleCfg.instanceId.asJson, 112 | "appGeneratedId" -> appId.asJson, 113 | "cloud" -> telemetryInfo.cloud.asJson, 114 | "region" -> telemetryInfo.region.asJson, 115 | "installationId" -> telemetryInfo.hashedInstallationId.asJson, 116 | "applicationName" -> appInfo.moduleName.asJson, 117 | "applicationVersion" -> appInfo.version.asJson 118 | ) 119 | ) 120 | 121 | /** 122 | * Stores destination specific telemetry data 123 | * @param region Cloud region application is deployed 124 | * @param cloud Cloud application is deployed 125 | * @param unhashedInstallationId Unhashed version of id that is used identify pipeline. 126 | * It should be something unique to that pipeline such as account id, project id etc. 127 | */ 128 | case class TelemetryInfo( 129 | region: Option[String], 130 | cloud: Option[String], 131 | unhashedInstallationId: Option[String] 132 | ) { 133 | def hashedInstallationId: Option[String] = unhashedInstallationId.map(DigestUtils.sha256Hex) 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /core/src/main/scala/com.snowplowanalytics.snowplow.collector.core/model.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collector.core 12 | 13 | import io.circe.Json 14 | 15 | object model { 16 | 17 | /** 18 | * Case class for holding both good and 19 | * bad sinks for the Stream Collector. 20 | */ 21 | final case class Sinks[F[_]](good: Sink[F], bad: Sink[F]) 22 | 23 | /** 24 | * Case class for holding the results of 25 | * splitAndSerializePayload. 
26 | * 27 | * @param good All good results 28 | * @param bad All bad results 29 | */ 30 | final case class EventSerializeResult(good: List[Array[Byte]], bad: List[Array[Byte]]) 31 | 32 | /** 33 | * Class for the result of splitting a too-large array of events in the body of a POST request 34 | * 35 | * @param goodBatches List of batches of events 36 | * @param failedBigEvents List of events that were too large 37 | */ 38 | final case class SplitBatchResult(goodBatches: List[List[Json]], failedBigEvents: List[Json]) 39 | } 40 | -------------------------------------------------------------------------------- /core/src/test/resources/test-config-new-style.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | interface = "0.0.0.0" 3 | port = 8080 4 | 5 | streams { 6 | good { 7 | name = "good" 8 | 9 | foo = "hello" 10 | bar = "world" 11 | 12 | buffer { 13 | byteLimit = 3145728 14 | recordLimit = 500 15 | timeLimit = 5000 16 | } 17 | } 18 | 19 | bad { 20 | name = "bad" 21 | 22 | foo = "hello" 23 | bar = "world" 24 | 25 | buffer { 26 | byteLimit = 3145728 27 | recordLimit = 500 28 | timeLimit = 5000 29 | } 30 | } 31 | } 32 | 33 | ssl { 34 | enable = true 35 | } 36 | 37 | hsts { 38 | enable = true 39 | maxAge = 180 days 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /core/src/test/resources/test-config-old-style.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | interface = "0.0.0.0" 3 | port = 8080 4 | 5 | streams { 6 | good = "good" 7 | bad = "bad" 8 | 9 | sink { 10 | foo = "hello" 11 | bar = "world" 12 | } 13 | 14 | buffer { 15 | byteLimit = 3145728 16 | recordLimit = 500 17 | timeLimit = 5000 18 | } 19 | } 20 | 21 | ssl { 22 | enable = true 23 | } 24 | 25 | hsts { 26 | enable = true 27 | maxAge = 180 days 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/test/scala/com.snowplowanalytics.snowplow.collector.core/ConfigParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.core 2 | 3 | import java.nio.file.Paths 4 | import org.specs2.mutable.Specification 5 | import cats.effect.IO 6 | import cats.effect.testing.specs2.CatsEffect 7 | import com.snowplowanalytics.snowplow.collector.core.Config.Buffer 8 | import io.circe.generic.semiauto._ 9 | 10 | import scala.concurrent.duration.DurationInt 11 | 12 | class ConfigParserSpec extends Specification with CatsEffect { 13 | 14 | "Loading the configuration" should { 15 | "use reference.conf and the hocon specified in the path" >> { 16 | "for new-style config" in { 17 | assert(resource = "/test-config-new-style.hocon") 18 | } 19 | "for old-style config" in { 20 | assert(resource = "/test-config-old-style.hocon") 21 | } 22 | } 23 | } 24 | 25 | private def assert(resource: String) = { 26 | case class SinkConfig(foo: String, bar: String) 27 | implicit val decoder = deriveDecoder[SinkConfig] 28 | 29 | val path = Paths.get(getClass.getResource(resource).toURI) 30 | 31 | val expectedStreams = Config.Streams[SinkConfig]( 32 | good = Config.Sink( 33 | name = "good", 34 | buffer = Buffer( 35 | 3145728, 36 | 500, 37 | 5000 38 | ), 39 | SinkConfig("hello", "world") 40 | ), 41 | bad = Config.Sink( 42 | name = "bad", 43 | buffer = Buffer( 44 | 3145728, 45 | 500, 46 | 5000 47 | ), 48 | SinkConfig("hello", "world") 49 | ), 50 | 
TestUtils.testConfig.streams.useIpAddressAsPartitionKey 51 | ) 52 | val expected = TestUtils 53 | .testConfig 54 | .copy[SinkConfig]( 55 | paths = Map.empty[String, String], 56 | streams = expectedStreams, 57 | ssl = TestUtils.testConfig.ssl.copy(enable = true), 58 | hsts = TestUtils.testConfig.hsts.copy(enable = true, 180.days), 59 | license = Config.License(false) 60 | ) 61 | 62 | ConfigParser.fromPath[IO, SinkConfig](Some(path)).value.map(_ should beRight(expected)) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /core/src/test/scala/com.snowplowanalytics.snowplow.collector.core/HttpServerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.core 2 | 3 | import org.specs2.mutable.Specification 4 | import cats.effect.IO 5 | 6 | import org.http4s.client.Client 7 | import org.http4s._ 8 | import org.http4s.dsl.io._ 9 | import cats.implicits._ 10 | import org.http4s.implicits._ 11 | import scala.concurrent.duration._ 12 | import cats.effect.testing.specs2._ 13 | 14 | class HttpServerSpec extends Specification with CatsEffect { 15 | val routes = HttpRoutes.of[IO] { 16 | case r if r.pathInfo == path"/large" => 17 | r.decode[String](Response[IO](Ok).withEntity(_).pure[IO]) 18 | case _ -> Root / "fast" => 19 | Ok("Fast") 20 | case _ -> Root / "never" => 21 | IO.never[Response[IO]] 22 | } 23 | val healthRoutes = HttpRoutes.of[IO] { 24 | case _ -> Root / "health" => 25 | Ok("ok") 26 | } 27 | 28 | "HttpServer" should { 29 | "manage request timeout" should { 30 | "timeout threshold is configured" in { 31 | val config = 32 | TestUtils 33 | .testConfig 34 | .copy(networking = TestUtils.testConfig.networking.copy(responseHeaderTimeout = 100.millis)) 35 | 36 | val request: Request[IO] = Request(method = Method.GET, uri = uri"/never") 37 | 38 | check(config, request)( 39 | _ must beLeft[Throwable].which { 40 | case org.http4s.client.UnexpectedStatus(Status.RequestTimeout, _, _) => true 41 | case _ => false 42 | } 43 | ) 44 | } 45 | } 46 | "manage request size" should { 47 | "drop requests larger than `networking.dropPayloadSize`" in { 48 | val config = 49 | TestUtils 50 | .testConfig 51 | .copy(networking = TestUtils.testConfig.networking.copy(maxPayloadSize = 5L, dropPayloadSize = 10L)) 52 | val request: Request[IO] = Request( 53 | Method.POST, 54 | uri"/large" 55 | ).withEntity("s" * 1000) 56 | 57 | check(config, request)( 58 | _ must beLeft[Throwable].which { 59 | case org.http4s.client.UnexpectedStatus(Status.PayloadTooLarge, _, _) => true 60 | case _ => false 61 | } 62 | ) 63 | } 64 | "allow request that's smaller than `networking.dropPayloadSize`" in { 65 | val config = 66 | TestUtils.testConfig.copy(networking = TestUtils.testConfig.networking.copy(dropPayloadSize = 1002L)) 67 | val body = "s" * 1000 68 | val request: Request[IO] = Request( 69 | Method.POST, 70 | uri"/large" 71 | ).withEntity(body) 72 | 73 | check(config, request)(_ must beRight(body)) 74 | } 75 | } 76 | } 77 | 78 | private[this] def check(config: Config[Any], request: Request[IO])(assert: Either[Throwable, _] => Boolean) = { 79 | val httpApp = HttpServer.httpApp( 80 | routes, 81 | healthRoutes, 82 | config.hsts, 83 | config.networking 84 | ) 85 | 86 | Client.fromHttpApp(httpApp).expect[String](request).attempt.map(assert) 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- 
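Aside — how the core pieces above fit together (illustrative sketch only, not a file in this repository): the core module exposes `Sink`, `model.Sinks`, `Telemetry.TelemetryInfo` and the `App`/`Run` wiring as the extension points that each sink module implements (the Kafka module later in this listing is a real example). The sketch below shows a hypothetical minimal collector with a no-op sink; the package name, `NoopSinkConfig` and the inline `AppInfo` values are invented for illustration, and the exact `App` constructor should be checked against App.scala.

package com.example.collector // hypothetical package, not part of this repository

import cats.effect.{IO, Resource}
import io.circe.Decoder
import io.circe.generic.semiauto._

import com.snowplowanalytics.snowplow.collector.core.{App, AppInfo, Config, Sink, Telemetry}
import com.snowplowanalytics.snowplow.collector.core.model.Sinks

// Assumed sink settings, decoded from the `streams.good` / `streams.bad` HOCON blocks
final case class NoopSinkConfig(maxBytes: Int)
object NoopSinkConfig {
  implicit val decoder: Decoder[NoopSinkConfig] = deriveDecoder[NoopSinkConfig]
}

object NoopCollector
    extends App[NoopSinkConfig](new AppInfo {
      def name        = "noop-collector"
      def moduleName  = "noop-collector"
      def version     = "0.0.0"
      def dockerAlias = "example/noop-collector:0.0.0"
      def shortName   = "noop"
    }) {

  // A sink that reports itself healthy and silently drops every record
  private def noopSink(max: Int): Sink[IO] = new Sink[IO] {
    val maxBytes: Int          = max
    def isHealthy: IO[Boolean] = IO.pure(true)
    def storeRawEvents(events: List[Array[Byte]], key: String): IO[Unit] = IO.unit
  }

  override def mkSinks(config: Config.Streams[NoopSinkConfig]): Resource[IO, Sinks[IO]] =
    Resource.pure(Sinks(noopSink(config.good.config.maxBytes), noopSink(config.bad.config.maxBytes)))

  override def telemetryInfo(config: Config.Streams[NoopSinkConfig]): IO[Telemetry.TelemetryInfo] =
    IO.pure(Telemetry.TelemetryInfo(region = None, cloud = None, unhashedInstallationId = None))
}

Running such an object with `--config path/to/config.hocon` would go through the same `Run.fromCli` path shown in Run.scala above: parse the HOCON, check the license flag, build the sinks and the blaze server, and start the telemetry heartbeat.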
/core/src/test/scala/com.snowplowanalytics.snowplow.collector.core/Rfc6265CookieSpec.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.core 2 | 3 | import org.specs2.mutable.Specification 4 | 5 | class Rfc6265CookieSpec extends Specification { 6 | val valid1 = "name=value" 7 | val valid2 = "name1=value2" 8 | val bothValid = s"$valid1;$valid2" 9 | val invalid = "{\"key\": \"value\"}" 10 | 11 | "Rfc6265Cookie.parse" should { 12 | "leave a valid cookie as is" in { 13 | Rfc6265Cookie.parse(valid1) must beSome(valid1) 14 | Rfc6265Cookie.parse(bothValid) must beSome(bothValid) 15 | } 16 | 17 | "remove whitespaces" in { 18 | Rfc6265Cookie.parse(s" $valid1 ") must beSome(valid1) 19 | Rfc6265Cookie.parse("name = value") must beSome(valid1) 20 | } 21 | 22 | "remove invalid parts" in { 23 | Rfc6265Cookie.parse(s"$invalid;$valid1;$valid2") must beSome(bothValid) 24 | Rfc6265Cookie.parse(s"$valid1;$invalid;$valid2") must beSome(bothValid) 25 | Rfc6265Cookie.parse(s"$valid1;$valid2;$invalid") must beSome(bothValid) 26 | } 27 | 28 | "return None if no valid part is left" in { 29 | Rfc6265Cookie.parse(invalid) must beNone 30 | Rfc6265Cookie.parse(s";$invalid;") must beNone 31 | Rfc6265Cookie.parse(";") must beNone 32 | Rfc6265Cookie.parse(";;") must beNone 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/test/scala/com.snowplowanalytics.snowplow.collector.core/TelemetrySpec.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.core 2 | 3 | import scala.concurrent.duration._ 4 | import scala.collection.mutable.ListBuffer 5 | 6 | import org.apache.commons.codec.binary.Base64 7 | import org.apache.commons.codec.digest.DigestUtils 8 | 9 | import java.nio.charset.StandardCharsets 10 | 11 | import cats.effect._ 12 | import cats.effect.unsafe.implicits.global 13 | import cats.effect.testkit.TestControl 14 | 15 | import org.http4s._ 16 | import org.http4s.client.{Client => HttpClient} 17 | 18 | import io.circe._ 19 | import io.circe.parser._ 20 | import io.circe.syntax._ 21 | 22 | import fs2.Stream 23 | 24 | import com.snowplowanalytics.snowplow.scalatracker.emitters.http4s.ceTracking 25 | 26 | import org.specs2.mutable.Specification 27 | 28 | class TelemetrySpec extends Specification { 29 | 30 | case class ProbeTelemetry( 31 | telemetryStream: Stream[IO, Unit], 32 | telemetryEvents: ListBuffer[Json] 33 | ) 34 | 35 | val appId = "testAppId" 36 | val region = Some("testRegion") 37 | val cloud = Some("testCloud") 38 | val unhashedInstallationId = Some("testInstallationId") 39 | val interval = 5.minutes 40 | val telemetryConfig = Config.Telemetry( 41 | disable = false, 42 | interval = interval, 43 | method = "POST", 44 | url = "127.0.0.1", 45 | port = 443, 46 | secure = true, 47 | userProvidedId = None, 48 | moduleName = None, 49 | moduleVersion = None, 50 | instanceId = None, 51 | autoGeneratedId = None 52 | ) 53 | 54 | def probeTelemetry(telemetryConfig: Config.Telemetry): ProbeTelemetry = { 55 | val telemetryEvents = ListBuffer[Json]() 56 | val mockHttpApp = HttpRoutes 57 | .of[IO] { 58 | case req => 59 | IO { 60 | telemetryEvents += extractTelemetryEvent(req) 61 | Response[IO](status = Status.Ok) 62 | } 63 | } 64 | .orNotFound 65 | val mockClient = HttpClient.fromHttpApp[IO](mockHttpApp) 66 | val telemetryInfoF = IO(Telemetry.TelemetryInfo(region, cloud, 
unhashedInstallationId)) 67 | val telemetryStream = Telemetry.run[IO]( 68 | telemetryConfig, 69 | mockClient, 70 | TestUtils.appInfo, 71 | appId, 72 | telemetryInfoF 73 | ) 74 | ProbeTelemetry(telemetryStream, telemetryEvents) 75 | } 76 | 77 | def extractTelemetryEvent(req: Request[IO]): Json = { 78 | val body = req.bodyText.compile.string.unsafeRunSync() 79 | val jsonBody = parse(body).toOption.get 80 | val uepxEncoded = jsonBody.hcursor.downField("data").downN(0).downField("ue_px").as[String].toOption.get 81 | val uePxDecoded = new String(Base64.decodeBase64(uepxEncoded), StandardCharsets.UTF_8) 82 | parse(uePxDecoded).toOption.get.hcursor.downField("data").as[Json].toOption.get 83 | } 84 | 85 | def expectedEvent(config: Config.Telemetry): Json = { 86 | val installationId = unhashedInstallationId.map(DigestUtils.sha256Hex) 87 | Json.obj( 88 | "schema" -> "iglu:com.snowplowanalytics.oss/oss_context/jsonschema/1-0-2".asJson, 89 | "data" -> Json.obj( 90 | "userProvidedId" -> config.userProvidedId.asJson, 91 | "autoGeneratedId" -> config.autoGeneratedId.asJson, 92 | "moduleName" -> config.moduleName.asJson, 93 | "moduleVersion" -> config.moduleVersion.asJson, 94 | "instanceId" -> config.instanceId.asJson, 95 | "appGeneratedId" -> appId.asJson, 96 | "cloud" -> cloud.asJson, 97 | "region" -> region.asJson, 98 | "installationId" -> installationId.asJson, 99 | "applicationName" -> TestUtils.appInfo.name.asJson, 100 | "applicationVersion" -> TestUtils.appInfo.version.asJson 101 | ) 102 | ) 103 | } 104 | 105 | "Telemetry" should { 106 | "send correct number of events" in { 107 | val eventCount = 10 108 | val timeout = (interval * eventCount.toLong) + 1.minutes 109 | val probe = probeTelemetry(telemetryConfig) 110 | TestControl.executeEmbed(probe.telemetryStream.timeout(timeout).compile.drain.voidError).unsafeRunSync() 111 | val events = probe.telemetryEvents 112 | val expected = (1 to eventCount).map(_ => expectedEvent(telemetryConfig)).toList 113 | events must beEqualTo(expected) 114 | } 115 | 116 | "not send any events if telemetry is disabled" in { 117 | val probe = probeTelemetry(telemetryConfig.copy(disable = true)) 118 | TestControl 119 | .executeEmbed( 120 | probe.telemetryStream.timeout(interval * 10).compile.drain.voidError 121 | ) 122 | .unsafeRunSync() 123 | probe.telemetryEvents must beEmpty 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /core/src/test/scala/com.snowplowanalytics.snowplow.collector.core/TestSink.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.core 2 | 3 | import cats.effect.IO 4 | 5 | import scala.collection.mutable.ListBuffer 6 | 7 | class TestSink extends Sink[IO] { 8 | 9 | private val buf: ListBuffer[Array[Byte]] = ListBuffer() 10 | 11 | override val maxBytes: Int = Int.MaxValue 12 | 13 | override def isHealthy: IO[Boolean] = IO.pure(true) 14 | 15 | override def storeRawEvents(events: List[Array[Byte]], key: String): IO[Unit] = 16 | IO.delay(buf ++= events) 17 | 18 | def storedRawEvents: List[Array[Byte]] = buf.toList 19 | 20 | } 21 | -------------------------------------------------------------------------------- /core/src/test/scala/com.snowplowanalytics.snowplow.collector.core/TestUtils.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.core 2 | 3 | import scala.concurrent.duration._ 4 | 5 | import cats.Applicative 6 | 7 
| import org.http4s.SameSite 8 | 9 | import com.snowplowanalytics.snowplow.collector.core.Config.{Sink => SinkConfig, _} 10 | 11 | object TestUtils { 12 | val appName = "collector-test" 13 | val appVersion = "testVersion" 14 | 15 | val appInfo = new AppInfo { 16 | def name = appName 17 | def moduleName = appName 18 | def version = appVersion 19 | def dockerAlias = "docker run collector" 20 | def shortName = "ssc" 21 | } 22 | 23 | def noopSink[F[_]: Applicative]: Sink[F] = new Sink[F] { 24 | val maxBytes: Int = Int.MaxValue 25 | def isHealthy: F[Boolean] = Applicative[F].pure(true) 26 | def storeRawEvents(events: List[Array[Byte]], key: String): F[Unit] = Applicative[F].unit 27 | } 28 | 29 | val testConfig = Config[Any]( 30 | interface = "0.0.0.0", 31 | port = 8080, 32 | paths = Map( 33 | "/com.acme/track" -> "/com.snowplowanalytics.snowplow/tp2", 34 | "/com.acme/redirect" -> "/r/tp2", 35 | "/com.acme/iglu" -> "/com.snowplowanalytics.iglu/v1" 36 | ), 37 | p3p = P3P( 38 | "/w3c/p3p.xml", 39 | "NOI DSP COR NID PSA OUR IND COM NAV STA" 40 | ), 41 | crossDomain = CrossDomain( 42 | false, 43 | List("*"), 44 | true 45 | ), 46 | cookie = Cookie( 47 | enabled = true, 48 | name = "sp", 49 | expiration = 365.days, 50 | domains = Nil, 51 | fallbackDomain = None, 52 | secure = true, 53 | httpOnly = true, 54 | sameSite = Some(SameSite.None), 55 | clientCookieName = None 56 | ), 57 | doNotTrackCookie = DoNotTrackCookie( 58 | false, 59 | "", 60 | "" 61 | ), 62 | cookieBounce = CookieBounce( 63 | false, 64 | "n3pc", 65 | "00000000-0000-4000-A000-000000000000", 66 | None 67 | ), 68 | redirectMacro = RedirectMacro( 69 | false, 70 | None 71 | ), 72 | rootResponse = RootResponse( 73 | false, 74 | 302, 75 | Map.empty[String, String], 76 | "" 77 | ), 78 | cors = CORS(60.minutes), 79 | streams = Streams( 80 | good = SinkConfig( 81 | name = "raw", 82 | Buffer( 83 | 3145728, 84 | 500, 85 | 5000 86 | ), 87 | AnyRef 88 | ), 89 | bad = SinkConfig( 90 | name = "bad-1", 91 | Buffer( 92 | 3145728, 93 | 500, 94 | 5000 95 | ), 96 | AnyRef 97 | ), 98 | useIpAddressAsPartitionKey = false 99 | ), 100 | monitoring = Monitoring( 101 | Metrics( 102 | Statsd( 103 | false, 104 | "localhost", 105 | 8125, 106 | 10.seconds, 107 | "snowplow.collector", 108 | Map.empty 109 | ) 110 | ) 111 | ), 112 | ssl = SSL( 113 | false, 114 | false, 115 | 443 116 | ), 117 | hsts = HSTS( 118 | false, 119 | 365.days 120 | ), 121 | networking = Networking( 122 | 1024, 123 | 610.seconds, 124 | 30.seconds, 125 | 20480, 126 | 40960, 127 | 1048576, 128 | 2097152 129 | ), 130 | enableDefaultRedirect = false, 131 | redirectDomains = Set.empty[String], 132 | preTerminationPeriod = 10.seconds, 133 | telemetry = Config.Telemetry( 134 | disable = false, 135 | interval = 60.minutes, 136 | method = "POST", 137 | url = "telemetry-g.snowplowanalytics.com", 138 | port = 443, 139 | secure = true, 140 | userProvidedId = None, 141 | moduleName = None, 142 | moduleVersion = None, 143 | instanceId = None, 144 | autoGeneratedId = None 145 | ), 146 | license = License(accept = true) 147 | ) 148 | } 149 | -------------------------------------------------------------------------------- /examples/config.kafka.minimal.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { 3 | accept = true 4 | } 5 | 6 | interface = "0.0.0.0" 7 | port = 8080 8 | 9 | streams { 10 | good { 11 | name = "good" 12 | brokers = "localhost:9092,another.host:9092" 13 | } 14 | bad { 15 | name = "bad" 16 | brokers = 
"localhost:9092,another.host:9092" 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /examples/config.kinesis.minimal.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { 3 | accept = true 4 | } 5 | 6 | interface = "0.0.0.0" 7 | port = 8080 8 | 9 | streams { 10 | good { 11 | name = "good" 12 | region = eu-central-1 13 | } 14 | bad { 15 | name = "bad" 16 | region = eu-central-1 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/config.nsq.minimal.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { 3 | accept = true 4 | } 5 | interface = "0.0.0.0" 6 | port = 8080 7 | 8 | streams { 9 | good { 10 | name = "good" 11 | host = "nsqHost" 12 | } 13 | 14 | bad { 15 | name = "bad" 16 | host = "nsqHost" 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/config.pubsub.minimal.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { 3 | accept = true 4 | } 5 | 6 | interface = "0.0.0.0" 7 | port = 8080 8 | 9 | streams { 10 | good { 11 | name = "good" 12 | googleProjectId = "google-project-id" 13 | } 14 | bad { 15 | name = "bad" 16 | googleProjectId = "google-project-id" 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/config.sqs.minimal.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { 3 | accept = true 4 | } 5 | interface = "0.0.0.0" 6 | port = 8080 7 | 8 | streams { 9 | good { 10 | name = "good" 11 | region = eu-central-1 12 | } 13 | bad { 14 | name = "bad" 15 | region = eu-central-1 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /examples/config.stdout.minimal.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { 3 | accept = true 4 | } 5 | interface = "0.0.0.0" 6 | port = 8080 7 | 8 | streams { 9 | good = "good" 10 | bad = "bad" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "applications for recovering snowplow bad rows"; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable"; 6 | flake-utils.url = "github:numtide/flake-utils"; 7 | flake-utils.inputs.nixpkgs.follows = "nixpkgs"; 8 | devenv.url = "github:cachix/devenv"; 9 | devenv.inputs.nixpkgs.follows = "nixpkgs"; 10 | }; 11 | 12 | outputs = { 13 | nixpkgs, 14 | flake-utils, 15 | devenv, 16 | ... 
17 | } @ inputs: 18 | flake-utils.lib.eachDefaultSystem ( 19 | system: let 20 | pkgs = import nixpkgs { 21 | inherit system; 22 | config.allowUnfree = true; 23 | config.allowUnsupportedSystem = true; 24 | }; 25 | jre = pkgs.openjdk11; 26 | sbt = pkgs.sbt.override {inherit jre;}; 27 | coursier = pkgs.coursier.override {inherit jre;}; 28 | metals = pkgs.metals.override {inherit coursier jre;}; 29 | in { 30 | devShell = devenv.lib.mkShell { 31 | inherit inputs pkgs; 32 | modules = [ 33 | { 34 | packages = [ 35 | jre 36 | metals 37 | sbt 38 | pkgs.kubernetes-helm 39 | # (pkgs.wrapHelm pkgs.kubernetes-helm {plugins = [pkgs.kubernetes-helmPlugins.helm-diff];}) 40 | # pkgs.google-cloud-sdk.withExtraComponents( with pkgs.google-cloud-sdk.components [ gke-gcloud-auth-plugin ]); 41 | (pkgs.google-cloud-sdk.withExtraComponents [pkgs.google-cloud-sdk.components.gke-gcloud-auth-plugin]) 42 | # pkgs.google-cloud-sdk-gce 43 | pkgs.snyk 44 | pkgs.gitleaks 45 | ]; 46 | scripts = { 47 | snyk-check.exec = '' 48 | for p in kinesis pubsub kafka nsq; do sbt "project ''${p}Distroless; set version := \"latest\"; Docker / publishLocal"; snyk container test --platform=linux/arm64 --app-vulns snowplow/scala-stream-collector-''${p}:latest-distroless; done 49 | ''; 50 | }; 51 | languages.nix.enable = true; 52 | pre-commit.hooks = { 53 | alejandra.enable = true; 54 | deadnix.enable = true; 55 | gitleaks = { 56 | enable = true; 57 | name = "gitleaks"; 58 | entry = "${pkgs.gitleaks}/bin/gitleaks detect --source . -v"; 59 | }; 60 | }; 61 | } 62 | ]; 63 | }; 64 | } 65 | ); 66 | } 67 | -------------------------------------------------------------------------------- /http4s/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/CollectorContainer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it 12 | 13 | import org.testcontainers.containers.GenericContainer 14 | 15 | case class CollectorContainer( 16 | container: GenericContainer[_], 17 | host: String, 18 | port: Int 19 | ) 20 | -------------------------------------------------------------------------------- /kafka/src/it/resources/collector.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${TOPIC_GOOD} 9 | brokers = ${BROKER} 10 | maxBytes = ${MAX_BYTES} 11 | producerConf = { 12 | "security.protocol" = "PLAINTEXT" 13 | "sasl.mechanism" = "GSSAPI" 14 | } 15 | } 16 | bad { 17 | name = ${TOPIC_BAD} 18 | brokers = ${BROKER} 19 | maxBytes = ${MAX_BYTES} 20 | producerConf = { 21 | "security.protocol" = "PLAINTEXT" 22 | "sasl.mechanism" = "GSSAPI" 23 | } 24 | } 25 | } 26 | 27 | networking { 28 | responseHeaderTimeout = 10 seconds 29 | } 30 | } -------------------------------------------------------------------------------- /kafka/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/kafka/Containers.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.kafka 12 | 13 | import cats.effect._ 14 | import com.dimafeng.testcontainers.{FixedHostPortGenericContainer, GenericContainer} 15 | import com.snowplowanalytics.snowplow.collectors.scalastream.BuildInfo 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.CollectorContainer 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 18 | import org.testcontainers.containers.wait.strategy.Wait 19 | import org.testcontainers.containers.{BindMode, Network, GenericContainer => JGenericContainer} 20 | 21 | object Containers { 22 | 23 | val zookeeperContainerName = "zookeeper" 24 | val zookeeperPort = 2181 25 | val brokerContainerName = "broker" 26 | val brokerExternalPort = 9092 27 | val brokerInternalPort = 29092 28 | 29 | def createContainers( 30 | goodTopic: String, 31 | badTopic: String, 32 | maxBytes: Int 33 | ): Resource[IO, CollectorContainer] = 34 | for { 35 | network <- network() 36 | _ <- zookeeper(network) 37 | _ <- kafka(network) 38 | c <- collectorKafka(network, goodTopic, badTopic, maxBytes) 39 | } yield c 40 | 41 | private def network(): Resource[IO, Network] = 42 | Resource.make(IO(Network.newNetwork()))(n => IO(n.close())) 43 | 44 | private def kafka( 45 | network: Network 46 | ): Resource[IO, JGenericContainer[_]] = 47 | Resource.make( 48 | IO { 49 | val container = FixedHostPortGenericContainer( 50 | imageName = "confluentinc/cp-kafka:7.0.1", 51 | env = Map( 52 | "KAFKA_BROKER_ID" -> "1", 53 | "KAFKA_ZOOKEEPER_CONNECT" -> s"$zookeeperContainerName:$zookeeperPort", 54 | "KAFKA_LISTENER_SECURITY_PROTOCOL_MAP" -> "PLAINTEXT:PLAINTEXT,PLAINTEXT_INTERNAL:PLAINTEXT", 55 | "KAFKA_ADVERTISED_LISTENERS" -> s"PLAINTEXT://localhost:$brokerExternalPort,PLAINTEXT_INTERNAL://$brokerContainerName:$brokerInternalPort", 56 | "KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR" -> "1", 57 | "KAFKA_TRANSACTION_STATE_LOG_MIN_ISR" -> "1", 58 | "KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR" -> "1" 59 | ), 60 | exposedPorts = List(brokerExternalPort, brokerInternalPort), 61 | exposedHostPort = brokerExternalPort, 62 | exposedContainerPort = brokerExternalPort 63 | ) 64 | container.container.withNetwork(network) 65 | container.container.withNetworkAliases(brokerContainerName) 66 | container.start() 67 | container.container 68 | } 69 | )(e => IO(e.stop())) 70 | 71 | private def zookeeper( 72 | network: Network, 73 | ): Resource[IO, JGenericContainer[_]] = 74 | Resource.make( 75 | IO { 76 | val container = GenericContainer( 77 | dockerImage = "confluentinc/cp-zookeeper:7.0.1", 78 | env = Map( 79 | "ZOOKEEPER_CLIENT_PORT" -> zookeeperPort.toString, 80 | "ZOOKEEPER_TICK_TIME" -> "2000" 81 | ), 82 | exposedPorts = List(zookeeperPort) 83 | ) 84 | container.container.withNetwork(network) 85 | container.container.withNetworkAliases(zookeeperContainerName) 86 | container.start() 87 | container.container 88 | } 89 | )(e => IO(e.stop())) 90 | 91 | def collectorKafka( 92 | network: Network, 93 | goodTopic: String, 94 | badTopic: String, 95 | maxBytes: Int 96 | ): Resource[IO, CollectorContainer] = { 97 | Resource.make( 98 | IO { 99 | val collectorPort = 8080 100 | val container = GenericContainer( 101 | dockerImage = BuildInfo.dockerAlias, 102 | env = Map( 103 | "PORT" -> collectorPort.toString, 104 | "BROKER" -> s"$brokerContainerName:$brokerInternalPort", 105 | "TOPIC_GOOD" -> goodTopic, 106 | "TOPIC_BAD" -> badTopic, 107 | "MAX_BYTES" -> maxBytes.toString 108 | ), 109 | exposedPorts = Seq(collectorPort), 110 
| fileSystemBind = Seq( 111 | GenericContainer.FileSystemBind( 112 | "kafka/src/it/resources/collector.hocon", 113 | "/snowplow/config/collector.hocon", 114 | BindMode.READ_ONLY 115 | ) 116 | ), 117 | command = Seq( 118 | "--config", 119 | "/snowplow/config/collector.hocon" 120 | ), 121 | waitStrategy = Wait.forLogMessage(s".*Service bound to address.*", 1) 122 | ) 123 | container.container.withNetwork(network) 124 | val c = startContainerWithLogs(container.container, "collector") 125 | CollectorContainer(c, c.getHost, c.getMappedPort(collectorPort)) 126 | } 127 | )(c => IO(c.container.stop())) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /kafka/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/kafka/KafkaCollectorSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.kafka 12 | 13 | import scala.concurrent.duration._ 14 | 15 | import cats.effect.IO 16 | import cats.effect.testing.specs2.CatsEffect 17 | 18 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.EventGenerator 19 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 20 | 21 | import org.specs2.mutable.Specification 22 | 23 | class KafkaCollectorSpec extends Specification with CatsEffect { 24 | 25 | override protected val Timeout = 5.minutes 26 | 27 | val maxBytes = 10000 28 | 29 | "emit the correct number of collector payloads and bad rows" in { 30 | val testName = "count" 31 | val nbGood = 1000 32 | val nbBad = 10 33 | val goodTopic = "test-raw" 34 | val badTopic = "test-bad" 35 | 36 | Containers.createContainers( 37 | goodTopic = goodTopic, 38 | badTopic = badTopic, 39 | maxBytes = maxBytes 40 | ).use { collector => 41 | for { 42 | _ <- log(testName, "Sending data") 43 | _ <- EventGenerator.sendEvents( 44 | collector.host, 45 | collector.port, 46 | nbGood, 47 | nbBad, 48 | maxBytes 49 | ) 50 | _ <- log(testName, "Data sent. Waiting for collector to work") 51 | _ <- IO.sleep(30.second) 52 | _ <- log(testName, "Consuming collector's output") 53 | collectorOutput <- KafkaUtils.readOutput( 54 | brokerAddr = s"localhost:${Containers.brokerExternalPort}", 55 | goodTopic = goodTopic, 56 | badTopic = badTopic 57 | ) 58 | } yield { 59 | collectorOutput.good.size must beEqualTo(nbGood) 60 | collectorOutput.bad.size must beEqualTo(nbBad) 61 | } 62 | } 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /kafka/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/kafka/KafkaUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.kafka 12 | 13 | import cats.effect._ 14 | import org.apache.kafka.clients.consumer._ 15 | import java.util.Properties 16 | import java.time.Duration 17 | import scala.jdk.CollectionConverters._ 18 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 19 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.CollectorOutput 20 | 21 | object KafkaUtils { 22 | 23 | def readOutput( 24 | brokerAddr: String, 25 | goodTopic: String, 26 | badTopic: String 27 | ): IO[CollectorOutput] = { 28 | createConsumer(brokerAddr).use { kafkaConsumer => 29 | IO { 30 | kafkaConsumer.subscribe(List(goodTopic, badTopic).asJava) 31 | val records = kafkaConsumer.poll(Duration.ofSeconds(20)) 32 | val extract = (r: ConsumerRecords[String, Array[Byte]], topicName: String) => 33 | r.records(topicName).asScala.toList.map(_.value()) 34 | val goodCount = extract(records, goodTopic).map(parseCollectorPayload) 35 | val badCount = extract(records, badTopic).map(parseBadRow) 36 | CollectorOutput(goodCount, badCount) 37 | } 38 | } 39 | } 40 | 41 | private def createConsumer(brokerAddr: String): Resource[IO, KafkaConsumer[String, Array[Byte]]] = { 42 | val acquire = IO { 43 | val props = new Properties() 44 | props.setProperty("bootstrap.servers", brokerAddr) 45 | props.setProperty("group.id", "it-collector") 46 | props.setProperty("auto.offset.reset", "earliest") 47 | props.setProperty("max.poll.records", "2000") 48 | props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") 49 | props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer") 50 | new KafkaConsumer[String, Array[Byte]](props) 51 | } 52 | val release = (p: KafkaConsumer[String, Array[Byte]]) => IO(p.close()) 53 | Resource.make(acquire)(release) 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /kafka/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | collector { 2 | streams { 3 | 4 | //New object-like style 5 | good = ${collector.streams.sink} 6 | bad = ${collector.streams.sink} 7 | 8 | //Legacy style 9 | sink { 10 | threadPoolSize = 10 11 | retries = 10 12 | maxBytes = 1000000 13 | buffer = ${collector.streams.buffer} 14 | producerConf = { 15 | "security.protocol" = "SASL_SSL" 16 | "sasl.mechanism" = "OAUTHBEARER" 17 | "sasl.jaas.config": "org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required;" 18 | } 19 | } 20 | 21 | //Legacy style 22 | buffer { 23 | byteLimit = 3145728 24 | recordLimit = 500 25 | timeLimit = 5000 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /kafka/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/KafkaCollector.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | 13 | import cats.effect.{IO, Resource} 14 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 15 | import com.snowplowanalytics.snowplow.collector.core.{App, Config, Telemetry} 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks._ 17 | 18 | object KafkaCollector extends App[KafkaSinkConfig](BuildInfo) { 19 | 20 | override def mkSinks(config: Config.Streams[KafkaSinkConfig]): Resource[IO, Sinks[IO]] = 21 | for { 22 | good <- KafkaSink.create[IO]( 23 | config.good, 24 | classOf[GoodAzureAuthenticationCallbackHandler].getName 25 | ) 26 | bad <- KafkaSink.create[IO]( 27 | config.bad, 28 | classOf[BadAzureAuthenticationCallbackHandler].getName 29 | ) 30 | } yield Sinks(good, bad) 31 | 32 | override def telemetryInfo(config: Config.Streams[KafkaSinkConfig]): IO[Telemetry.TelemetryInfo] = 33 | TelemetryUtils.getAzureSubscriptionId.map { 34 | case None => Telemetry.TelemetryInfo(None, None, None) 35 | case Some(id) => Telemetry.TelemetryInfo(None, Some("Azure"), Some(id)) 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /kafka/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/TelemetryUtils.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream 2 | 3 | import cats.effect.IO 4 | import org.http4s._ 5 | import org.http4s.blaze.client.BlazeClientBuilder 6 | import org.typelevel.ci._ 7 | import io.circe.parser 8 | 9 | object TelemetryUtils { 10 | 11 | // Metadata service response will be used to get Azure subscription id 12 | // More information about the service can be found here: 13 | // https://learn.microsoft.com/en-us/azure/virtual-machines/instance-metadata-service 14 | val azureMetadataServiceUrl = "http://169.254.169.254/metadata/instance?api-version=2021-02-01" 15 | 16 | def getAzureSubscriptionId: IO[Option[String]] = { 17 | val response = for { 18 | client <- BlazeClientBuilder[IO].resource 19 | request = Request[IO]( 20 | method = Method.GET, 21 | uri = Uri.unsafeFromString(azureMetadataServiceUrl), 22 | headers = Headers(Header.Raw(ci"Metadata", "true")) 23 | ) 24 | response <- client.run(request) 25 | } yield response 26 | response.use(_.bodyText.compile.string.map(extractId)).handleError(_ => None) 27 | } 28 | 29 | private def extractId(metadata: String): Option[String] = 30 | for { 31 | json <- parser.parse(metadata).toOption 32 | id <- json.hcursor.downField("compute").downField("subscriptionId").as[String].toOption 33 | } yield id 34 | } 35 | -------------------------------------------------------------------------------- /kafka/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/AzureAuthenticationCallbackHandler.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | package sinks 13 | 14 | import java.net.URI 15 | import java.{lang, util} 16 | 17 | import javax.security.auth.callback.Callback 18 | import javax.security.auth.callback.UnsupportedCallbackException 19 | import javax.security.auth.login.AppConfigurationEntry 20 | 21 | import org.apache.kafka.clients.producer.ProducerConfig 22 | import org.apache.kafka.common.security.auth.AuthenticateCallbackHandler 23 | import org.apache.kafka.common.security.oauthbearer.OAuthBearerToken 24 | import org.apache.kafka.common.security.oauthbearer.OAuthBearerTokenCallback 25 | 26 | import com.azure.identity.DefaultAzureCredentialBuilder 27 | import com.azure.core.credential.TokenRequestContext 28 | 29 | import com.nimbusds.jwt.JWTParser 30 | 31 | // The good and bad sinks need separate callback handler instances because they 32 | // authenticate with different tokens. However, we only give Kafka a class name and it 33 | // instantiates the class itself, so if we passed the same class name for both sinks, 34 | // Kafka would initialize and use only one instance of the callback handler. To create two 35 | // separate instances, we define two different classes and pass their names in the respective 36 | // sink's properties. This way, each sink gets its own callback handler instance. 37 | class GoodAzureAuthenticationCallbackHandler extends AzureAuthenticationCallbackHandler 38 | 39 | class BadAzureAuthenticationCallbackHandler extends AzureAuthenticationCallbackHandler 40 | 41 | class AzureAuthenticationCallbackHandler extends AuthenticateCallbackHandler { 42 | 43 | val credentials = new DefaultAzureCredentialBuilder().build() 44 | 45 | var sbUri: String = "" 46 | 47 | override def configure( 48 | configs: util.Map[String, _], 49 | saslMechanism: String, 50 | jaasConfigEntries: util.List[AppConfigurationEntry] 51 | ): Unit = { 52 | val bootstrapServer = 53 | configs 54 | .get(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG) 55 | .toString 56 | .replaceAll("\\[|\\]", "") 57 | .split(",") 58 | .toList 59 | .headOption match { 60 | case Some(s) => s 61 | case None => throw new Exception("Empty bootstrap servers list") 62 | } 63 | val uri = URI.create("https://" + bootstrapServer) 64 | // Workload identity works with '.default' scope 65 | this.sbUri = s"${uri.getScheme}://${uri.getHost}/.default" 66 | } 67 | 68 | override def handle(callbacks: Array[Callback]): Unit = 69 | callbacks.foreach { 70 | case callback: OAuthBearerTokenCallback => 71 | val token = getOAuthBearerToken() 72 | callback.token(token) 73 | case callback => throw new UnsupportedCallbackException(callback) 74 | } 75 | 76 | def getOAuthBearerToken(): OAuthBearerToken = { 77 | val reqContext = new TokenRequestContext() 78 | reqContext.addScopes(sbUri) 79 | val accessToken = credentials.getTokenSync(reqContext).getToken 80 | val jwt = JWTParser.parse(accessToken) 81 | val claims = jwt.getJWTClaimsSet 82 | 83 | new OAuthBearerToken { 84 | override def value(): String = accessToken 85 | 86 | override def lifetimeMs(): Long = claims.getExpirationTime.getTime 87 | 88 | override def scope():
util.Set[String] = null 89 | 90 | override def principalName(): String = null 91 | 92 | override def startTimeMs(): lang.Long = null 93 | } 94 | } 95 | 96 | override def close(): Unit = () 97 | } 98 | -------------------------------------------------------------------------------- /kafka/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/KafkaSink.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | package sinks 13 | 14 | import cats.implicits._ 15 | import cats.effect.implicits._ 16 | import cats.effect._ 17 | import org.typelevel.log4cats.Logger 18 | import org.typelevel.log4cats.slf4j.Slf4jLogger 19 | import org.apache.kafka.clients.producer.{Callback, KafkaProducer, ProducerRecord, RecordMetadata} 20 | 21 | import com.snowplowanalytics.snowplow.collector.core.{Config, Sink} 22 | 23 | import scala.jdk.CollectionConverters._ 24 | import scala.concurrent.ExecutionContext 25 | import java.util.concurrent.Executors 26 | 27 | /** 28 | * Kafka Sink for the Scala Stream Collector 29 | */ 30 | class KafkaSink[F[_]: Async: Logger]( 31 | val maxBytes: Int, 32 | isHealthyState: Ref[F, Boolean], 33 | kafkaProducer: KafkaProducer[String, Array[Byte]], 34 | topicName: String, 35 | ec: ExecutionContext 36 | ) extends Sink[F] { 37 | 38 | override def isHealthy: F[Boolean] = isHealthyState.get 39 | 40 | /** 41 | * Store raw events to the topic 42 | * 43 | * @param events The list of events to send 44 | * @param key The partition key to use 45 | */ 46 | override def storeRawEvents(events: List[Array[Byte]], key: String): F[Unit] = 47 | storeRawEventsAndWait(events, key).start.void 48 | 49 | private def storeRawEventsAndWait(events: List[Array[Byte]], key: String): F[Unit] = 50 | Logger[F].debug(s"Writing ${events.size} Thrift records to Kafka topic $topicName at key $key") *> 51 | events.parTraverse_ { e => 52 | def go: F[Unit] = 53 | Async[F] 54 | .async[Unit] { cb => 55 | val blockingSend = Sync[F].delay { 56 | val record = new ProducerRecord(topicName, key, e) 57 | kafkaProducer.send(record, callback(cb)) 58 | Option.empty[F[Unit]] 59 | } 60 | Async[F].startOn(blockingSend, ec).map(f => Some(f.cancel)) 61 | } 62 | .handleErrorWith { e => 63 | handlePublishError(e) >> go 64 | } 65 | go 66 | } *> isHealthyState.set(true) 67 | 68 | private def callback(asyncCallback: Either[Throwable, Unit] => Unit): Callback = 69 | new Callback { 70 | def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = 71 | Option(exception) match { 72 | case Some(e) => asyncCallback(Left(e)) 73 | case None => asyncCallback(Right(())) 74 | } 75 | } 76 | 77 | private def handlePublishError(error: Throwable): F[Unit] = 78 | isHealthyState.set(false) *> Logger[F].error(s"Publishing to Kafka failed with message ${error.getMessage}") 79 | } 80 | 81 | object KafkaSink { 82 | 83 | implicit private def unsafeLogger[F[_]: Sync]: Logger[F] = 84 | Slf4jLogger.getLogger[F] 85 | 86 | def create[F[_]: Async]( 87 | sinkConfig: 
Config.Sink[KafkaSinkConfig], 88 | authCallbackClass: String 89 | ): Resource[F, KafkaSink[F]] = 90 | for { 91 | isHealthyState <- Resource.eval(Ref.of[F, Boolean](false)) 92 | kafkaProducer <- createProducer(sinkConfig.config, sinkConfig.buffer, authCallbackClass) 93 | ec <- createExecutionContext 94 | } yield new KafkaSink( 95 | sinkConfig.config.maxBytes, 96 | isHealthyState, 97 | kafkaProducer, 98 | sinkConfig.name, 99 | ec 100 | ) 101 | 102 | /** 103 | * Creates a new Kafka Producer with the given 104 | * configuration options 105 | * 106 | * @return a new Kafka Producer 107 | */ 108 | private def createProducer[F[_]: Async]( 109 | kafkaConfig: KafkaSinkConfig, 110 | bufferConfig: Config.Buffer, 111 | authCallbackClass: String 112 | ): Resource[F, KafkaProducer[String, Array[Byte]]] = { 113 | val props = Map( 114 | "bootstrap.servers" -> kafkaConfig.brokers, 115 | "acks" -> "all", 116 | "retries" -> kafkaConfig.retries.toString, 117 | "linger.ms" -> bufferConfig.timeLimit.toString, 118 | "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer", 119 | "value.serializer" -> "org.apache.kafka.common.serialization.ByteArraySerializer", 120 | "sasl.login.callback.handler.class" -> authCallbackClass 121 | ) ++ kafkaConfig.producerConf.getOrElse(Map.empty) + ("buffer.memory" -> Long.MaxValue.toString) 122 | 123 | val make = Sync[F].delay { 124 | new KafkaProducer[String, Array[Byte]]((props: Map[String, AnyRef]).asJava) 125 | } 126 | Resource.make(make)(p => Sync[F].blocking(p.close)) 127 | } 128 | 129 | def createExecutionContext[F[_]: Sync]: Resource[F, ExecutionContext] = { 130 | val make = Sync[F].delay { 131 | Executors.newSingleThreadExecutor 132 | } 133 | Resource.make(make)(e => Sync[F].blocking(e.shutdown)).map(ExecutionContext.fromExecutorService(_)) 134 | } 135 | 136 | } 137 | -------------------------------------------------------------------------------- /kafka/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/KafkaSinkConfig.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 2 | 3 | import io.circe.Decoder 4 | import io.circe.generic.semiauto._ 5 | 6 | final case class KafkaSinkConfig( 7 | maxBytes: Int, 8 | brokers: String, 9 | retries: Int, 10 | producerConf: Option[Map[String, String]] 11 | ) 12 | 13 | object KafkaSinkConfig { 14 | implicit val configDecoder: Decoder[KafkaSinkConfig] = deriveDecoder[KafkaSinkConfig] 15 | } 16 | -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-client-cookie.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "cookie": { 39 | "enabled": true, 40 | "name": ${SERVER_COOKIE_NAME}, 41 | "expiration": "365 days", 42 | "secure": false, 43 | "httpOnly": false, 44 | "sameSite": "None" 45 | 
"clientCookieName": ${CLIENT_COOKIE_NAME} 46 | } 47 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-cookie-anonymous.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "cookie": { 39 | "enabled": true, 40 | "name": "sp", 41 | "expiration": "365 days", 42 | "secure": false, 43 | "httpOnly": false, 44 | "sameSite": "None" 45 | } 46 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-cookie-attributes-1.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "cookie": { 39 | "enabled": true, 40 | "name": "greatName", 41 | "expiration": "42 days", 42 | "secure": true, 43 | "httpOnly": true, 44 | "sameSite": "Strict" 45 | } 46 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-cookie-attributes-2.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "cookie": { 39 | "enabled": true, 40 | "name": "sp", 41 | "expiration": "365 days", 42 | "secure": false, 43 | "httpOnly": false, 44 | "sameSite": "None" 45 | } 46 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-cookie-domain.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = 
env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "cookie": { 39 | "enabled": true, 40 | "name": "sp", 41 | "expiration": "365 days", 42 | "domains": ["foo.bar","sub.foo.bar"], 43 | "fallbackDomain": "fallback.domain", 44 | "secure": false, 45 | "httpOnly": false, 46 | "sameSite": "None" 47 | } 48 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-cookie-fallback.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "cookie": { 39 | "enabled": true, 40 | "name": "sp", 41 | "expiration": "365 days", 42 | "domains": ["foo.bar" ], 43 | "fallbackDomain": "fallback.domain", 44 | "secure": false, 45 | "httpOnly": false, 46 | "sameSite": "None" 47 | } 48 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-cookie-no-domain.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "cookie": { 39 | "enabled": true, 40 | "name": "sp", 41 | "expiration": "365 days", 42 | "secure": false, 43 | "httpOnly": false, 44 | "sameSite": "None" 45 | } 46 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-custom-paths.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | 39 | 
"paths": { 40 | "/acme/track": "/com.snowplowanalytics.snowplow/tp2", 41 | "/acme/redirect": "/r/tp2", 42 | "/acme/iglu": "/com.snowplowanalytics.iglu/v1" 43 | } 44 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-doNotTrackCookie-disabled.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "doNotTrackCookie": { 39 | "enabled": false, 40 | "name" : "foo", 41 | "value": "bar" 42 | } 43 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector-doNotTrackCookie-enabled.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | 38 | "doNotTrackCookie": { 39 | "enabled": true, 40 | "name" : "foo", 41 | "value": "bar" 42 | } 43 | } -------------------------------------------------------------------------------- /kinesis/src/it/resources/collector.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${STREAM_GOOD} 9 | region = ${REGION} 10 | customEndpoint = ${KINESIS_ENDPOINT} 11 | 12 | aws { 13 | accessKey = env 14 | secretKey = env 15 | } 16 | 17 | maxBytes = ${MAX_BYTES} 18 | } 19 | 20 | bad { 21 | name = ${STREAM_BAD} 22 | region = ${REGION} 23 | customEndpoint = ${KINESIS_ENDPOINT} 24 | 25 | aws { 26 | accessKey = env 27 | secretKey = env 28 | } 29 | 30 | maxBytes = ${MAX_BYTES} 31 | } 32 | } 33 | 34 | networking { 35 | responseHeaderTimeout = 10 seconds 36 | } 37 | } -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/core/CustomPathsSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.core 12 | 13 | import cats.effect.IO 14 | import cats.effect.testing.specs2.CatsEffect 15 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.Http 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.Kinesis 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers._ 18 | import org.http4s.{Method, Request, Uri} 19 | import org.specs2.mutable.Specification 20 | 21 | import scala.concurrent.duration._ 22 | 23 | class CustomPathsSpec extends Specification with Localstack with CatsEffect { 24 | 25 | override protected val Timeout = 5.minutes 26 | 27 | "collector" should { 28 | "map custom paths" in { 29 | val testName = "custom-paths" 30 | val streamGood = s"$testName-raw" 31 | val streamBad = s"$testName-bad-1" 32 | 33 | val originalPaths = List( 34 | "/acme/track", 35 | "/acme/redirect", 36 | "/acme/iglu" 37 | ) 38 | Collector.container( 39 | "kinesis/src/it/resources/collector-custom-paths.hocon", 40 | testName, 41 | streamGood, 42 | streamBad 43 | ).use { collector => 44 | val requests = originalPaths.map { p => 45 | val uri = Uri.unsafeFromString(s"http://${collector.host}:${collector.port}$p") 46 | Request[IO](Method.POST, uri).withEntity("foo") 47 | } 48 | 49 | for { 50 | _ <- Http.statuses(requests) 51 | _ <- IO.sleep(5.second) 52 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 53 | outputPaths = collectorOutput.good.map(cp => cp.getPath()) 54 | } yield { 55 | outputPaths must beEqualTo(List( 56 | "/com.snowplowanalytics.snowplow/tp2", 57 | "/r/tp2", 58 | "/com.snowplowanalytics.iglu/v1" 59 | )) 60 | } 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/core/DoNotTrackCookieSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.core 12 | 13 | import cats.effect.IO 14 | import cats.effect.testing.specs2.CatsEffect 15 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.{EventGenerator, Http} 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.Kinesis 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers._ 18 | import org.specs2.execute.PendingUntilFixed 19 | import org.specs2.mutable.Specification 20 | 21 | import scala.jdk.CollectionConverters._ 22 | import scala.concurrent.duration._ 23 | 24 | class DoNotTrackCookieSpec extends Specification with Localstack with CatsEffect with PendingUntilFixed { 25 | 26 | override protected val Timeout = 5.minutes 27 | 28 | "collector" should { 29 | val cookieName = "foo" 30 | val cookieValue = "bar" 31 | 32 | "ignore events that have a cookie whose name and value match doNotTrackCookie config if enabled" in { 33 | import cats.effect.unsafe.implicits.global 34 | 35 | val testName = "doNotTrackCookie-enabled" 36 | val streamGood = s"$testName-raw" 37 | val streamBad = s"$testName-bad-1" 38 | 39 | Collector.container( 40 | "kinesis/src/it/resources/collector-doNotTrackCookie-enabled.hocon", 41 | testName, 42 | streamGood, 43 | streamBad 44 | ).use { collector => 45 | val requests = List( 46 | EventGenerator.mkTp2Event(collector.host, collector.port).addCookie(cookieName, cookieName), 47 | EventGenerator.mkTp2Event(collector.host, collector.port).addCookie(cookieValue, cookieValue), 48 | EventGenerator.mkTp2Event(collector.host, collector.port).addCookie(cookieName, cookieValue) 49 | ) 50 | 51 | val expected = List(s"Cookie: $cookieName=$cookieName", s"Cookie: $cookieValue=$cookieValue") 52 | 53 | for { 54 | statuses <- Http.statuses(requests) 55 | _ <- IO.sleep(5.second) 56 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 57 | headers = collectorOutput.good.map(_.headers.asScala) 58 | } yield { 59 | statuses.map(_.code) must beEqualTo(List(200, 200, 200)) 60 | headers must haveSize(2) 61 | expected.forall(cookie => headers.exists(_.contains(cookie))) must beTrue 62 | } 63 | }.unsafeRunSync() 64 | } 65 | 66 | "track events that have a cookie whose name and value match doNotTrackCookie config if disabled" in { 67 | val testName = "doNotTrackCookie-disabled" 68 | val streamGood = s"$testName-raw" 69 | val streamBad = s"$testName-bad-1" 70 | 71 | Collector.container( 72 | "kinesis/src/it/resources/collector-doNotTrackCookie-disabled.hocon", 73 | testName, 74 | streamGood, 75 | streamBad 76 | ).use { collector => 77 | val request = EventGenerator.mkTp2Event(collector.host, collector.port).addCookie(cookieName, cookieValue) 78 | 79 | val expected = s"Cookie: $cookieName=$cookieValue" 80 | 81 | for { 82 | status <- Http.status(request) 83 | _ <- IO.sleep(5.second) 84 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 85 | headers = collectorOutput.good.map(_.headers.asScala) 86 | } yield { 87 | status.code must beEqualTo(200) 88 | headers match { 89 | case List(one) if one.contains(expected) => ok 90 | case other => 91 | ko(s"$other is not one list that contains [$expected]") 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/core/HealthEndpointSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 
(c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.core 12 | 13 | import cats.effect.IO 14 | import cats.effect.testing.specs2.CatsEffect 15 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.Http 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.Kinesis 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers._ 18 | import org.http4s.{Method, Request, Uri} 19 | import org.specs2.mutable.Specification 20 | 21 | import scala.concurrent.duration._ 22 | 23 | class HealthEndpointSpec extends Specification with Localstack with CatsEffect { 24 | 25 | override protected val Timeout = 5.minutes 26 | 27 | "collector" should { 28 | "respond with 200 to /health endpoint after it has started" in { 29 | val testName = "health-endpoint" 30 | val streamGood = s"$testName-raw" 31 | val streamBad = s"$testName-bad-1" 32 | Collector.container( 33 | "kinesis/src/it/resources/collector.hocon", 34 | testName, 35 | streamGood, 36 | streamBad 37 | ).use { collector => 38 | val uri = Uri.unsafeFromString(s"http://${collector.host}:${collector.port}/health") 39 | val request = Request[IO](Method.GET, uri) 40 | 41 | for { 42 | status <- Http.status(request) 43 | _ <- IO.sleep(5.second) 44 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 45 | } yield { 46 | status.code must beEqualTo(200) 47 | collectorOutput.good.size should beEqualTo(0) 48 | collectorOutput.bad.size should beEqualTo(0) 49 | } 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/core/RobotsSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.core 12 | 13 | import scala.concurrent.duration._ 14 | 15 | import org.specs2.mutable.Specification 16 | 17 | import cats.effect.IO 18 | 19 | import org.http4s.{Method, Request, Uri} 20 | 21 | import cats.effect.testing.specs2.CatsEffect 22 | 23 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.Kinesis 24 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers._ 25 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.Http 26 | 27 | class RobotsSpec extends Specification with Localstack with CatsEffect { 28 | 29 | override protected val Timeout = 5.minutes 30 | 31 | "collector" should { 32 | "respond to /robots.txt with 200 and not emit any event" in { 33 | val testName = "robots" 34 | val streamGood = s"$testName-raw" 35 | val streamBad = s"$testName-bad-1" 36 | 37 | Collector.container( 38 | "kinesis/src/it/resources/collector.hocon", 39 | testName, 40 | streamGood, 41 | streamBad 42 | ).use { collector => 43 | val uri = Uri.unsafeFromString(s"http://${collector.host}:${collector.port}/robots.txt") 44 | val request = Request[IO](Method.GET, uri) 45 | 46 | for { 47 | response <- Http.response(request) 48 | bodyBytes <- response.body.compile.toList 49 | body = new String(bodyBytes.toArray) 50 | _ <- IO.sleep(10.second) 51 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 52 | } yield { 53 | response.status.code must beEqualTo(200) 54 | body must beEqualTo("User-agent: *\nDisallow: /\n\nUser-agent: Googlebot\nDisallow: /\n\nUser-agent: AdsBot-Google\nDisallow: /") 55 | collectorOutput.good must beEmpty 56 | collectorOutput.bad must beEmpty 57 | } 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/core/XForwardedForSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.core 12 | 13 | import cats.data.NonEmptyList 14 | import cats.effect.IO 15 | import cats.effect.testing.specs2.CatsEffect 16 | import com.comcast.ip4s.IpAddress 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.{EventGenerator, Http} 18 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.Kinesis 19 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers._ 20 | import org.http4s.headers.`X-Forwarded-For` 21 | import org.specs2.mutable.Specification 22 | 23 | import scala.concurrent.duration._ 24 | 25 | class XForwardedForSpec extends Specification with Localstack with CatsEffect { 26 | 27 | override protected val Timeout = 5.minutes 28 | 29 | "collector" should { 30 | "put X-Forwarded-For header in the collector payload" in { 31 | val testName = "X-Forwarded-For" 32 | val streamGood = s"$testName-raw" 33 | val streamBad = s"$testName-bad-1" 34 | 35 | val ip = IpAddress.fromString("123.123.123.123") 36 | 37 | Collector.container( 38 | "kinesis/src/it/resources/collector.hocon", 39 | testName, 40 | streamGood, 41 | streamBad 42 | ).use { collector => 43 | val request = EventGenerator.mkTp2Event(collector.host, collector.port) 44 | .withHeaders(`X-Forwarded-For`(NonEmptyList.one(ip))) 45 | 46 | for { 47 | _ <- Http.status(request) 48 | _ <- IO.sleep(5.second) 49 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 50 | } yield { 51 | val expected = "X-Forwarded-For: 123.123.123.123" 52 | collectorOutput.good match { 53 | case List(one) if one.headers.contains(expected) => ok 54 | case List(one) => ko(s"${one.headers} doesn't contain $expected") 55 | case other => ko(s"${other.size} output collector payload instead of one") 56 | } 57 | } 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/kinesis/Kinesis.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis 12 | 13 | import scala.jdk.CollectionConverters._ 14 | import java.net.URI 15 | 16 | import cats.effect.{IO, Resource} 17 | 18 | import software.amazon.awssdk.regions.Region 19 | import software.amazon.awssdk.auth.credentials._ 20 | import software.amazon.awssdk.services.kinesis.KinesisClient 21 | import software.amazon.awssdk.services.kinesis.model._ 22 | 23 | import com.snowplowanalytics.snowplow.CollectorPayload.thrift.model1.CollectorPayload 24 | 25 | import com.snowplowanalytics.snowplow.badrows.BadRow 26 | 27 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.CollectorOutput 28 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 29 | 30 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers.Localstack 31 | 32 | object Kinesis { 33 | 34 | def readOutput(streamGood: String, streamBad: String): IO[CollectorOutput] = 35 | resourceClient.use { client => 36 | for { 37 | good <- consumeGood(client, streamGood) 38 | bad <- consumeBad(client, streamBad) 39 | } yield CollectorOutput(good, bad) 40 | } 41 | 42 | private def resourceClient: Resource[IO, KinesisClient] = 43 | Resource.make(IO( 44 | KinesisClient.builder() 45 | .credentialsProvider(StaticCredentialsProvider.create(AwsBasicCredentials.create("whatever", "whatever"))) 46 | .region(Region.of(Localstack.region)) 47 | .endpointOverride(URI.create(Localstack.publicEndpoint)) 48 | .build 49 | ))(client => IO(client.close())) 50 | 51 | private def consumeGood( 52 | kinesis: KinesisClient, 53 | streamName: String, 54 | ): IO[List[CollectorPayload]] = 55 | for { 56 | raw <- consumeStream(kinesis, streamName) 57 | good <- IO(raw.map(parseCollectorPayload)) 58 | } yield good 59 | 60 | private def consumeBad( 61 | kinesis: KinesisClient, 62 | streamName: String, 63 | ): IO[List[BadRow]] = 64 | for { 65 | raw <- consumeStream(kinesis, streamName) 66 | bad <- IO(raw.map(parseBadRow)) 67 | } yield bad 68 | 69 | private def consumeStream( 70 | kinesis: KinesisClient, 71 | streamName: String, 72 | ): IO[List[Array[Byte]]] = { 73 | val describeRequest = DescribeStreamRequest.builder().streamName(streamName).build() 74 | val shardId = kinesis.describeStream(describeRequest).streamDescription().shards().get(0).shardId() 75 | 76 | val getShardIteratorRequest = GetShardIteratorRequest.builder() 77 | .streamName(streamName) 78 | .shardId(shardId) 79 | .shardIteratorType("TRIM_HORIZON") 80 | .build() 81 | val iterator = kinesis.getShardIterator(getShardIteratorRequest).shardIterator() 82 | val getRecordsRequest = GetRecordsRequest.builder().shardIterator(iterator).build() 83 | 84 | IO(kinesis.getRecords(getRecordsRequest).records().asScala.toList.map(_.data().asByteArray())) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/kinesis/KinesisCollectorSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis 12 | 13 | import cats.effect.IO 14 | import cats.effect.testing.specs2.CatsEffect 15 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers._ 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.{EventGenerator, Http} 18 | import org.http4s.{Method, Request, Status, Uri} 19 | import org.specs2.mutable.Specification 20 | import org.testcontainers.containers.GenericContainer 21 | 22 | import scala.concurrent.duration._ 23 | 24 | class KinesisCollectorSpec extends Specification with Localstack with CatsEffect { 25 | 26 | override protected val Timeout = 5.minutes 27 | 28 | val stopTimeout = 20.second 29 | 30 | "collector-kinesis" should { 31 | "be able to parse the minimal config" in { 32 | val testName = "minimal" 33 | Collector.container( 34 | "examples/config.kinesis.minimal.hocon", 35 | testName, 36 | s"$testName-raw", 37 | s"$testName-bad-1" 38 | ).use { collector => 39 | IO(collector.container.getLogs() must contain(("Service bound to address"))) 40 | } 41 | } 42 | 43 | "emit the correct number of collector payloads and bad rows" in { 44 | val testName = "count" 45 | val nbGood = 1000 46 | val nbBad = 10 47 | val streamGood = s"$testName-raw" 48 | val streamBad = s"$testName-bad-1" 49 | 50 | Collector.container( 51 | "kinesis/src/it/resources/collector.hocon", 52 | testName, 53 | streamGood, 54 | streamBad 55 | ).use { collector => 56 | for { 57 | _ <- log(testName, "Sending data") 58 | _ <- EventGenerator.sendEvents( 59 | collector.host, 60 | collector.port, 61 | nbGood, 62 | nbBad, 63 | Collector.maxBytes 64 | ) 65 | _ <- log(testName, "Data sent. 
Waiting for collector to work") 66 | _ <- IO.sleep(5.second) 67 | _ <- log(testName, "Consuming collector's output") 68 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 69 | _ <- printBadRows(testName, collectorOutput.bad) 70 | } yield { 71 | collectorOutput.good.size should beEqualTo(nbGood) 72 | collectorOutput.bad.size should beEqualTo(nbBad) 73 | } 74 | } 75 | } 76 | 77 | s"shutdown within $stopTimeout when it receives a SIGTERM" in { 78 | val testName = "stop" 79 | Collector.container( 80 | "kinesis/src/it/resources/collector.hocon", 81 | testName, 82 | s"$testName-raw", 83 | s"$testName-bad-1" 84 | ).use { collector => 85 | val container = collector.container 86 | for { 87 | _ <- log(testName, "Sending signal") 88 | _ <- IO(container.getDockerClient().killContainerCmd(container.getContainerId()).withSignal("TERM").exec()) 89 | _ <- waitWhile[GenericContainer[_]](container, _.isRunning, stopTimeout) 90 | } yield { 91 | container.isRunning() must beFalse 92 | container.getLogs() must contain("Closing NIO1 channel") 93 | } 94 | } 95 | } 96 | 97 | "start with /sink-health unhealthy and insert pending events when streams become available" in { 98 | val testName = "sink-health" 99 | val nbGood = 10 100 | val nbBad = 10 101 | val streamGood = s"$testName-raw" 102 | val streamBad = s"$testName-bad-1" 103 | 104 | Collector.container( 105 | "kinesis/src/it/resources/collector.hocon", 106 | testName, 107 | streamGood, 108 | streamBad, 109 | createStreams = false 110 | ).use { collector => 111 | val uri = Uri.unsafeFromString(s"http://${collector.host}:${collector.port}/sink-health") 112 | val request = Request[IO](Method.GET, uri) 113 | 114 | for { 115 | statusBeforeCreate <- Http.status(request) 116 | _ <- EventGenerator.sendEvents( 117 | collector.host, 118 | collector.port, 119 | nbGood, 120 | nbBad, 121 | Collector.maxBytes 122 | ) 123 | _ <- Localstack.createStreams(List(streamGood, streamBad)) 124 | _ <- IO.sleep(10.second) 125 | statusAfterCreate <- Http.status(request) 126 | collectorOutput <- Kinesis.readOutput(streamGood, streamBad) 127 | _ <- printBadRows(testName, collectorOutput.bad) 128 | } yield { 129 | statusBeforeCreate should beEqualTo(Status.ServiceUnavailable) 130 | statusAfterCreate should beEqualTo(Status.Ok) 131 | collectorOutput.good.size should beEqualTo(nbGood) 132 | collectorOutput.bad.size should beEqualTo(nbBad) 133 | } 134 | } 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/kinesis/containers/Collector.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers 12 | 13 | import cats.effect.{IO, Resource} 14 | import com.dimafeng.testcontainers.GenericContainer 15 | import com.snowplowanalytics.snowplow.collectors.scalastream.BuildInfo 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.CollectorContainer 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 18 | import org.testcontainers.containers.BindMode 19 | import org.testcontainers.containers.wait.strategy.Wait 20 | 21 | object Collector { 22 | 23 | val port = 8080 24 | val maxBytes = 10000 25 | 26 | def container( 27 | configPath: String, 28 | testName: String, 29 | streamGood: String, 30 | streamBad: String, 31 | createStreams: Boolean = true, 32 | additionalConfig: Map[String, String] = Map.empty 33 | ): Resource[IO, CollectorContainer] = { 34 | val container = GenericContainer( 35 | dockerImage = BuildInfo.dockerAlias, 36 | env = Map( 37 | "AWS_ACCESS_KEY_ID" -> "whatever", 38 | "AWS_SECRET_ACCESS_KEY" -> "whatever", 39 | "PORT" -> port.toString, 40 | "STREAM_GOOD" -> streamGood, 41 | "STREAM_BAD" -> streamBad, 42 | "REGION" -> Localstack.region, 43 | "KINESIS_ENDPOINT" -> Localstack.privateEndpoint, 44 | "MAX_BYTES" -> maxBytes.toString, 45 | "JDK_JAVA_OPTIONS" -> "-Dorg.slf4j.simpleLogger.log.com.snowplowanalytics.snowplow.collectors.scalastream.sinks.KinesisSink=warn", 46 | "HTTP4S_BACKEND" -> "BLAZE" 47 | ) ++ additionalConfig, 48 | exposedPorts = Seq(port), 49 | fileSystemBind = Seq( 50 | GenericContainer.FileSystemBind( 51 | configPath, 52 | "/snowplow/config/collector.hocon", 53 | BindMode.READ_ONLY 54 | ) 55 | ), 56 | command = Seq( 57 | "--config", 58 | "/snowplow/config/collector.hocon" 59 | ), 60 | waitStrategy = Wait.forLogMessage(s".*Service bound to address.*", 1) 61 | ) 62 | container.container.withNetwork(Localstack.network) 63 | 64 | val create = if(createStreams) Localstack.createStreams(List(streamGood, streamBad)) else IO.unit 65 | 66 | Resource.make( 67 | create *> 68 | IO(startContainerWithLogs(container.container, testName)) 69 | .map(c => CollectorContainer(c, c.getHost, c.getMappedPort(Collector.port))) 70 | )( 71 | c => IO(c.container.stop()) 72 | ) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /kinesis/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/kinesis/containers/Localstack.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.kinesis.containers 12 | 13 | import java.util.concurrent.Semaphore 14 | 15 | import org.testcontainers.containers.Network 16 | import org.testcontainers.containers.wait.strategy.Wait 17 | 18 | import org.specs2.specification.BeforeAfterAll 19 | 20 | import cats.implicits._ 21 | 22 | import cats.effect.IO 23 | 24 | import com.dimafeng.testcontainers.GenericContainer 25 | 26 | trait Localstack extends BeforeAfterAll { 27 | def beforeAll() = Localstack.start() 28 | 29 | def afterAll() = Localstack.stop() 30 | } 31 | 32 | object Localstack { 33 | 34 | private val nbPermits = Int.MaxValue 35 | private val permits = new Semaphore(nbPermits) 36 | 37 | val region = "eu-central-1" 38 | val host = "localhost" 39 | val alias = "localstack" 40 | val privatePort = 4566 41 | 42 | val network = Network.newNetwork() 43 | 44 | val localstack = { 45 | val container = GenericContainer( 46 | dockerImage = "localstack/localstack-light:1.3.0", 47 | env = Map( 48 | "AWS_ACCESS_KEY_ID" -> "unused", 49 | "AWS_SECRET_ACCESS_KEY" -> "unused" 50 | ), 51 | waitStrategy = Wait.forLogMessage(".*Ready.*", 1), 52 | exposedPorts = Seq(privatePort) 53 | ) 54 | container.underlyingUnsafeContainer.withNetwork(network) 55 | container.underlyingUnsafeContainer.withNetworkAliases(alias) 56 | container.container 57 | } 58 | 59 | def start() = synchronized { 60 | permits.acquire() 61 | // Calling start on an already started container has no effect 62 | localstack.start() 63 | } 64 | 65 | def stop() = synchronized { 66 | permits.release() 67 | if(permits.availablePermits() == nbPermits) 68 | localstack.stop() 69 | } 70 | 71 | def publicPort = localstack.getMappedPort(privatePort) 72 | 73 | def privateEndpoint: String = 74 | s"http://$alias:$privatePort" 75 | 76 | def publicEndpoint: String = 77 | s"http://$host:$publicPort" 78 | 79 | def createStreams( 80 | streams: List[String] 81 | ): IO[Unit] = 82 | streams 83 | .traverse_ { s => 84 | IO( 85 | localstack.execInContainer( 86 | "aws", 87 | s"--endpoint-url=http://$host:$privatePort", 88 | "kinesis", 89 | "create-stream", 90 | "--stream-name", 91 | s, 92 | "--shard-count", 93 | "1", 94 | "--region", 95 | region 96 | ) 97 | ) 98 | .flatMap { 99 | case res if res.getExitCode() != 0 => 100 | IO.raiseError(new RuntimeException(s"Problem when creating stream $s [${res.getStderr()}] [${res.getStdout()}]")) 101 | case _ => IO(println(s"Stream $s created")) 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /kinesis/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | { 2 | streams { 3 | //New object-like style 4 | good = ${streams.sink} 5 | bad = ${streams.sink} 6 | 7 | //Legacy style 8 | sink { 9 | threadPoolSize = 10 10 | 11 | aws { 12 | accessKey = iam 13 | secretKey = iam 14 | } 15 | 16 | backoffPolicy { 17 | minBackoff = 500 18 | maxBackoff = 1500 19 | maxRetries = 3 20 | } 21 | 22 | maxBytes = 1000000 23 | sqsMaxBytes = 192000 24 | 25 | startupCheckInterval = 1 second 26 | buffer = ${streams.buffer} 27 | } 28 | 29 | //Legacy style 30 | buffer { 31 | byteLimit = 3145728 32 | recordLimit = 500 33 | timeLimit = 5000 34 | } 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /kinesis/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/KinesisCollector.scala: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | 13 | import cats.effect.{IO, Resource} 14 | 15 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 16 | import com.snowplowanalytics.snowplow.collector.core.{App, Config, Telemetry} 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.{KinesisSink, KinesisSinkConfig} 18 | 19 | import org.slf4j.LoggerFactory 20 | 21 | import java.util.concurrent.ScheduledThreadPoolExecutor 22 | 23 | object KinesisCollector extends App[KinesisSinkConfig](BuildInfo) { 24 | 25 | private lazy val log = LoggerFactory.getLogger(getClass) 26 | 27 | override def mkSinks(config: Config.Streams[KinesisSinkConfig]): Resource[IO, Sinks[IO]] = { 28 | val threadPoolExecutor = buildExecutorService(config.good.config) 29 | for { 30 | good <- KinesisSink.create[IO](config.good, config.good.config.sqsGoodBuffer, threadPoolExecutor) 31 | bad <- KinesisSink.create[IO](config.bad, config.bad.config.sqsBadBuffer, threadPoolExecutor) 32 | } yield Sinks(good, bad) 33 | } 34 | 35 | override def telemetryInfo(config: Config.Streams[KinesisSinkConfig]): IO[Telemetry.TelemetryInfo] = 36 | TelemetryUtils 37 | .getAccountId(config) 38 | .map(id => 39 | Telemetry.TelemetryInfo( 40 | region = Some(config.good.config.region), 41 | cloud = Some("AWS"), 42 | unhashedInstallationId = id 43 | ) 44 | ) 45 | 46 | def buildExecutorService(kc: KinesisSinkConfig): ScheduledThreadPoolExecutor = { 47 | log.info("Creating thread pool of size " + kc.threadPoolSize) 48 | new ScheduledThreadPoolExecutor(kc.threadPoolSize) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /kinesis/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/TelemetryUtils.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream 2 | 3 | import cats.effect.{IO, Resource} 4 | 5 | import com.snowplowanalytics.snowplow.collector.core.Config 6 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.{KinesisSink, KinesisSinkConfig} 7 | 8 | object TelemetryUtils { 9 | 10 | def getAccountId(config: Config.Streams[KinesisSinkConfig]): IO[Option[String]] = 11 | Resource 12 | .make( 13 | IO(KinesisSink.createKinesisClient(config.good.config.endpoint, config.good.config.region)).rethrow 14 | )(c => IO(c.close())) 15 | .use { kinesis => 16 | IO { 17 | val streamArn = KinesisSink.describeStream(kinesis, config.good.name).streamARN() 18 | Some(extractAccountId(streamArn)) 19 | } 20 | } 21 | .handleError(_ => None) 22 | 23 | def extractAccountId(kinesisStreamArn: String): String = 24 | kinesisStreamArn.split(":")(4) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /kinesis/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/KinesisSinkConfig.scala: 
-------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 2 | 3 | import io.circe.Decoder 4 | import io.circe.generic.semiauto._ 5 | import io.circe.config.syntax.durationDecoder 6 | 7 | import scala.concurrent.duration.FiniteDuration 8 | 9 | final case class KinesisSinkConfig( 10 | maxBytes: Int, 11 | region: String, 12 | threadPoolSize: Int, 13 | backoffPolicy: KinesisSinkConfig.BackoffPolicy, 14 | customEndpoint: Option[String], 15 | sqsGoodBuffer: Option[String], 16 | sqsBadBuffer: Option[String], 17 | sqsMaxBytes: Int, 18 | startupCheckInterval: FiniteDuration 19 | ) { 20 | val endpoint = customEndpoint.orElse(region match { 21 | case cn @ "cn-north-1" => Some(s"https://kinesis.$cn.amazonaws.com.cn") 22 | case cn @ "cn-northwest-1" => Some(s"https://kinesis.$cn.amazonaws.com.cn") 23 | case _ => None 24 | }) 25 | } 26 | 27 | object KinesisSinkConfig { 28 | final case class AWSConfig(accessKey: String, secretKey: String) 29 | 30 | final case class BackoffPolicy(minBackoff: Long, maxBackoff: Long, maxRetries: Int) 31 | implicit val configDecoder: Decoder[KinesisSinkConfig] = deriveDecoder[KinesisSinkConfig] 32 | implicit val awsConfigDecoder: Decoder[AWSConfig] = deriveDecoder[AWSConfig] 33 | implicit val backoffPolicyConfigDecoder: Decoder[BackoffPolicy] = 34 | deriveDecoder[BackoffPolicy] 35 | } 36 | -------------------------------------------------------------------------------- /kinesis/src/test/scala/com.snowplowanalytics.snowplow.collectors.scalastream/TelemetryUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream 2 | 3 | import org.specs2.mutable.Specification 4 | 5 | class TelemetryUtilsSpec extends Specification { 6 | 7 | "extractAccountId" should { 8 | "be able to extract account id from kinesis stream arn successfully" in { 9 | val streamArn = "arn:aws:kinesis:region:123456789:stream/name" 10 | TelemetryUtils.extractAccountId(streamArn) must beEqualTo("123456789") 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /kinesis/src/test/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/KinesisSinkSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | package sinks 13 | 14 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.KinesisSink._ 15 | 16 | import org.specs2.mutable.Specification 17 | 18 | class KinesisSinkSpec extends Specification { 19 | val event = Events("a".getBytes, "b") 20 | 21 | "KinesisSink.split" should { 22 | "return empty list if given an empty batch" in { 23 | val emptyBatch = List.empty[Events] 24 | 25 | split(emptyBatch, getByteSize, 1, 10) mustEqual List.empty 26 | split(emptyBatch, getByteSize, 10, 1) mustEqual List.empty 27 | // Edge case that we shouldn't hit. 
The test simply confirms the behaviour. 28 | split(emptyBatch, getByteSize, 0, 0) mustEqual List.empty 29 | } 30 | 31 | "correctly split batches, according to maxRecords setting" in { 32 | val batch1 = List.fill(10)(event) 33 | val batch2 = List.fill(1)(event) 34 | 35 | val res1 = split(batch1, getByteSize, 3, 1000) 36 | val res2 = split(batch2, getByteSize, 3, 1000) 37 | // Edge case that we shouldn't hit. The test simply confirms the behaviour. 38 | val res3 = split(batch1, getByteSize, 0, 1000) 39 | 40 | res1.length mustEqual 4 41 | res2.length mustEqual 1 42 | (res3.length mustEqual 10).and(res3.forall(_ must not be empty)) 43 | } 44 | 45 | "correctly split batches, according to maxBytes setting" in { 46 | val batch1 = List.fill(10)(event) 47 | val batch2 = List.fill(1)(event) 48 | 49 | val res1 = split(batch1, getByteSize, 1000, 3) 50 | val res2 = split(batch2, getByteSize, 1000, 3) 51 | // Edge case that we shouldn't hit. The test simply confirms the behaviour. 52 | val res3 = split(batch1, getByteSize, 1000, 0) 53 | 54 | res1.length mustEqual 4 55 | res2.length mustEqual 1 56 | (res3.length mustEqual 10).and(res3.forall(_ must not be empty)) 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /nsq/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | collector { 2 | streams { 3 | 4 | good = ${collector.streams.sink} 5 | bad = ${collector.streams.sink} 6 | 7 | sink { 8 | threadPoolSize = 10 9 | port = 4150 10 | maxBytes = 1000000 11 | buffer = ${collector.streams.buffer} 12 | } 13 | 14 | buffer { 15 | byteLimit = 3145728 16 | recordLimit = 500 17 | timeLimit = 5000 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /nsq/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/NsqCollector.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | 13 | import cats.effect.{IO, Resource} 14 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 15 | import com.snowplowanalytics.snowplow.collector.core.{App, Config, Telemetry} 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks._ 17 | 18 | object NsqCollector extends App[NsqSinkConfig](BuildInfo) { 19 | override def mkSinks(config: Config.Streams[NsqSinkConfig]): Resource[IO, Sinks[IO]] = 20 | for { 21 | good <- NsqSink.create[IO](config.good) 22 | bad <- NsqSink.create[IO](config.bad) 23 | } yield Sinks(good, bad) 24 | 25 | override def telemetryInfo(config: Config.Streams[NsqSinkConfig]): IO[Telemetry.TelemetryInfo] = 26 | IO(Telemetry.TelemetryInfo(None, None, None)) 27 | } 28 | -------------------------------------------------------------------------------- /nsq/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/NsqSink.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | package sinks 13 | 14 | import java.util.concurrent.TimeoutException 15 | import scala.jdk.CollectionConverters._ 16 | import cats.effect.{Resource, Sync} 17 | import cats.implicits._ 18 | import com.snowplowanalytics.client.nsq.NSQProducer 19 | import com.snowplowanalytics.snowplow.collector.core.{Config, Sink} 20 | import com.snowplowanalytics.client.nsq.exceptions.NSQException 21 | 22 | /** 23 | * NSQ Sink for the Scala Stream Collector 24 | * @param nsqConfig Configuration for Nsq 25 | * @param topicName Nsq topic name 26 | */ 27 | class NsqSink[F[_]: Sync] private ( 28 | val maxBytes: Int, 29 | nsqConfig: NsqSinkConfig, 30 | topicName: String 31 | ) extends Sink[F] { 32 | 33 | @volatile private var healthStatus = true 34 | 35 | override def isHealthy: F[Boolean] = Sync[F].pure(healthStatus) 36 | 37 | private val producer = new NSQProducer().addAddress(nsqConfig.host, nsqConfig.port).start() 38 | 39 | /** 40 | * Store raw events to the topic 41 | * @param events The list of events to send 42 | * @param key The partition key (unused) 43 | */ 44 | override def storeRawEvents(events: List[Array[Byte]], key: String): F[Unit] = 45 | Sync[F].blocking(producer.produceMulti(topicName, events.asJava)).onError { 46 | case _: NSQException | _: TimeoutException => 47 | setHealthStatus(false) 48 | } *> setHealthStatus(true) 49 | 50 | def shutdown(): Unit = 51 | producer.shutdown() 52 | 53 | private def setHealthStatus(status: Boolean): F[Unit] = Sync[F].delay { 54 | healthStatus = status 55 | } 56 | } 57 | 58 | object NsqSink { 59 | 60 | def create[F[_]: Sync]( 61 | nsqConfig: Config.Sink[NsqSinkConfig] 62 | ): Resource[F, NsqSink[F]] = 63 | Resource.make( 64 | Sync[F].delay( 65 | new NsqSink(nsqConfig.config.maxBytes, nsqConfig.config, nsqConfig.name) 66 | ) 67 | )(sink => Sync[F].delay(sink.shutdown())) 68 | } 69 | -------------------------------------------------------------------------------- 
/nsq/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/NsqSinkConfig.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 12 | 13 | import io.circe.Decoder 14 | import io.circe.generic.semiauto._ 15 | 16 | final case class NsqSinkConfig( 17 | maxBytes: Int, 18 | threadPoolSize: Int, 19 | host: String, 20 | port: Int 21 | ) 22 | 23 | object NsqSinkConfig { 24 | implicit val configDecoder: Decoder[NsqSinkConfig] = deriveDecoder[NsqSinkConfig] 25 | } 26 | -------------------------------------------------------------------------------- /nsq/src/test/scala/com.snowplowanalytics.snowplow.collectors.scalastream/NsqConfigSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | 13 | import cats.effect.testing.specs2.CatsEffect 14 | import cats.effect.{ExitCode, IO} 15 | import com.snowplowanalytics.snowplow.collector.core.{Config, ConfigParser} 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.NsqSinkConfig 17 | import org.http4s.SameSite 18 | import org.specs2.mutable.Specification 19 | 20 | import java.nio.file.Paths 21 | import scala.concurrent.duration.DurationInt 22 | 23 | class NsqConfigSpec extends Specification with CatsEffect { 24 | 25 | "Config parser" should { 26 | "be able to parse extended nsq config" in { 27 | assert( 28 | resource = "/config.nsq.extended.hocon", 29 | expectedResult = Right( 30 | NsqConfigSpec 31 | .expectedConfig 32 | .copy( 33 | monitoring = Config.Monitoring( 34 | Config.Metrics( 35 | NsqConfigSpec.expectedConfig.monitoring.metrics.statsd.copy(tags = Map("app" -> "collector")) 36 | ) 37 | ) 38 | ) 39 | ) 40 | ) 41 | } 42 | "be able to parse minimal nsq config" in { 43 | assert( 44 | resource = "/config.nsq.minimal.hocon", 45 | expectedResult = Right(NsqConfigSpec.expectedConfig) 46 | ) 47 | } 48 | } 49 | 50 | private def assert(resource: String, expectedResult: Either[ExitCode, Config[NsqSinkConfig]]) = { 51 | val path = Paths.get(getClass.getResource(resource).toURI) 52 | ConfigParser.fromPath[IO, NsqSinkConfig](Some(path)).value.map { result => 53 | result must beEqualTo(expectedResult) 54 | } 55 | } 56 | } 57 | 58 | object NsqConfigSpec { 59 | private val expectedConfig = Config[NsqSinkConfig]( 60 | interface = "0.0.0.0", 61 | port = 8080, 62 | paths = Map.empty[String, String], 63 | p3p = Config.P3P( 64 | policyRef = "/w3c/p3p.xml", 65 | CP = "NOI DSP COR NID PSA OUR IND COM NAV STA" 66 | ), 67 | crossDomain = Config.CrossDomain( 68 | enabled = false, 69 | domains = List("*"), 70 | secure = true 71 | ), 72 | cookie = Config.Cookie( 73 | enabled = true, 74 | expiration = 365.days, 75 | name = "sp", 76 | domains = List.empty, 77 | fallbackDomain = None, 78 | secure = true, 79 | httpOnly = true, 80 | sameSite = Some(SameSite.None), 81 | clientCookieName = None 82 | ), 83 | doNotTrackCookie = Config.DoNotTrackCookie( 84 | enabled = false, 85 | name = "", 86 | value = "" 87 | ), 88 | cookieBounce = Config.CookieBounce( 89 | enabled = false, 90 | name = "n3pc", 91 | fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000", 92 | forwardedProtocolHeader = None 93 | ), 94 | redirectMacro = Config.RedirectMacro( 95 | enabled = false, 96 | placeholder = None 97 | ), 98 | rootResponse = Config.RootResponse( 99 | enabled = false, 100 | statusCode = 302, 101 | headers = Map.empty[String, String], 102 | body = "" 103 | ), 104 | cors = Config.CORS(1.hour), 105 | monitoring = Config.Monitoring( 106 | Config.Metrics( 107 | Config.Statsd(false, "localhost", 8125, 10.seconds, "snowplow.collector", Map.empty) 108 | ) 109 | ), 110 | ssl = Config.SSL(enable = false, redirect = false, port = 443), 111 | hsts = Config.HSTS(enable = false, maxAge = 365.days), 112 | enableDefaultRedirect = false, 113 | redirectDomains = Set.empty, 114 | preTerminationPeriod = 10.seconds, 115 | streams = Config.Streams( 116 | useIpAddressAsPartitionKey = false, 117 | good = Config.Sink( 118 | name = "good", 119 | buffer = Config.Buffer( 120 | byteLimit = 3145728, 121 | recordLimit = 500, 122 | timeLimit = 5000 123 | ), 124 | config = NsqSinkConfig( 125 | maxBytes = 1000000, 126 | threadPoolSize = 10, 127 | host = "nsqHost", 128 | port = 4150 129 | ) 130 | ), 131 | 
bad = Config.Sink( 132 | name = "bad", 133 | buffer = Config.Buffer( 134 | byteLimit = 3145728, 135 | recordLimit = 500, 136 | timeLimit = 5000 137 | ), 138 | config = NsqSinkConfig( 139 | maxBytes = 1000000, 140 | threadPoolSize = 10, 141 | host = "nsqHost", 142 | port = 4150 143 | ) 144 | ) 145 | ), 146 | telemetry = Config.Telemetry( 147 | disable = false, 148 | interval = 60.minutes, 149 | method = "POST", 150 | url = "telemetry-g.snowplowanalytics.com", 151 | port = 443, 152 | secure = true, 153 | userProvidedId = None, 154 | moduleName = None, 155 | moduleVersion = None, 156 | instanceId = None, 157 | autoGeneratedId = None 158 | ), 159 | networking = Config.Networking( 160 | maxConnections = 1024, 161 | idleTimeout = 610.seconds, 162 | responseHeaderTimeout = 30.seconds, 163 | maxRequestLineLength = 20480, 164 | maxHeadersLength = 40960, 165 | maxPayloadSize = 1048576, 166 | dropPayloadSize = 2097152 167 | ), 168 | license = Config.License(accept = true) 169 | ) 170 | } 171 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.5.6 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") 2 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.10.0") 3 | addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17") 4 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") 5 | addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.1.1") 6 | addSbtPlugin("com.snowplowanalytics" % "sbt-snowplow-release" % "0.3.2") 7 | -------------------------------------------------------------------------------- /pubsub/src/it/resources/collector.hocon: -------------------------------------------------------------------------------- 1 | collector { 2 | license { accept = true } 3 | interface = "0.0.0.0" 4 | port = ${PORT} 5 | 6 | streams { 7 | good { 8 | name = ${TOPIC_GOOD} 9 | googleProjectId = ${GOOGLE_PROJECT_ID} 10 | maxBytes = ${MAX_BYTES} 11 | } 12 | bad { 13 | name = ${TOPIC_BAD} 14 | googleProjectId = ${GOOGLE_PROJECT_ID} 15 | maxBytes = ${MAX_BYTES} 16 | } 17 | } 18 | 19 | networking { 20 | responseHeaderTimeout = 10 seconds 21 | } 22 | } -------------------------------------------------------------------------------- /pubsub/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/pubsub/Containers.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.pubsub 12 | 13 | import org.testcontainers.containers.{BindMode, Network} 14 | import org.testcontainers.containers.wait.strategy.Wait 15 | import com.dimafeng.testcontainers.GenericContainer 16 | import cats.effect.{IO, Resource} 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.BuildInfo 18 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 19 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.CollectorContainer 20 | 21 | object Containers { 22 | 23 | val collectorPort = 8080 24 | val projectId = "google-project-id" 25 | val emulatorHost = "localhost" 26 | val emulatorPort = 8085 27 | lazy val emulatorHostPort = pubSubEmulator.getMappedPort(emulatorPort) 28 | val topicGood = "good" 29 | val topicBad = "bad" 30 | 31 | private val network = Network.newNetwork() 32 | 33 | private val pubSubEmulator = { 34 | val container = GenericContainer( 35 | dockerImage = "gcr.io/google.com/cloudsdktool/google-cloud-cli:emulators", 36 | waitStrategy = Wait.forLogMessage(".*Server started.*", 1), 37 | exposedPorts = Seq(emulatorPort), 38 | command = Seq( 39 | "gcloud", 40 | "beta", 41 | "emulators", 42 | "pubsub", 43 | "start", 44 | s"--project=$projectId", 45 | s"--host-port=0.0.0.0:$emulatorPort" 46 | ) 47 | ) 48 | 49 | container.underlyingUnsafeContainer.withNetwork(network) 50 | container.underlyingUnsafeContainer.withNetworkAliases("pubsub-emulator") 51 | container.container 52 | } 53 | 54 | def collector( 55 | configPath: String, 56 | testName: String, 57 | topicGood: String, 58 | topicBad: String, 59 | createTopics: Boolean = true, 60 | envs: Map[String, String] = Map.empty[String, String] 61 | ): Resource[IO, CollectorContainer] = { 62 | val container = GenericContainer( 63 | dockerImage = BuildInfo.dockerAlias, 64 | env = Map( 65 | "PUBSUB_EMULATOR_HOST" -> s"pubsub-emulator:$emulatorPort", 66 | "PORT" -> collectorPort.toString, 67 | "TOPIC_GOOD" -> topicGood, 68 | "TOPIC_BAD" -> topicBad, 69 | "GOOGLE_PROJECT_ID" -> projectId, 70 | "MAX_BYTES" -> Integer.MAX_VALUE.toString, 71 | "JDK_JAVA_OPTIONS" -> "-Dorg.slf4j.simpleLogger.log.com.snowplowanalytics.snowplow.collectors.scalastream.sinks.GooglePubSubSink=warn", 72 | "HTTP4S_BACKEND" -> "BLAZE" 73 | ) ++ envs, 74 | exposedPorts = Seq(collectorPort), 75 | fileSystemBind = Seq( 76 | GenericContainer.FileSystemBind( 77 | configPath, 78 | "/snowplow/config/collector.hocon", 79 | BindMode.READ_ONLY 80 | ) 81 | ), 82 | command = Seq( 83 | "--config", 84 | "/snowplow/config/collector.hocon" 85 | ) 86 | ,waitStrategy = Wait.forLogMessage(s".*Service bound to address.*", 1) 87 | ) 88 | container.container.withNetwork(network) 89 | 90 | val create = 91 | if(createTopics) 92 | PubSub.createTopicsAndSubscriptions( 93 | projectId, 94 | emulatorHost, 95 | emulatorHostPort, 96 | List(topicGood, topicBad) 97 | ) 98 | else 99 | IO.unit 100 | 101 | Resource.make ( 102 | create *> 103 | IO(startContainerWithLogs(container.container, testName)) 104 | .map(c => CollectorContainer(c, c.getHost, c.getMappedPort(collectorPort))) 105 | )( 106 | c => IO(c.container.stop()) 107 | ) 108 | } 109 | 110 | def startEmulator(): Unit = pubSubEmulator.start() 111 | 112 | def stopEmulator(): Unit = pubSubEmulator.stop() 113 | } 114 | -------------------------------------------------------------------------------- /pubsub/src/it/scala/com/snowplowanalytics/snowplow/collectors/scalastream/it/pubsub/GooglePubSubCollectorSpec.scala: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.it.pubsub 12 | 13 | import scala.concurrent.duration._ 14 | import cats.effect.IO 15 | import org.http4s.{Method, Request, Status, Uri} 16 | import cats.effect.testing.specs2.CatsEffect 17 | import org.specs2.mutable.Specification 18 | import org.specs2.specification.BeforeAfterAll 19 | import org.testcontainers.containers.GenericContainer 20 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.utils._ 21 | import com.snowplowanalytics.snowplow.collectors.scalastream.it.{EventGenerator, Http} 22 | 23 | class GooglePubSubCollectorSpec extends Specification with CatsEffect with BeforeAfterAll { 24 | 25 | override protected val Timeout = 5.minutes 26 | 27 | def beforeAll(): Unit = Containers.startEmulator() 28 | 29 | def afterAll(): Unit = Containers.stopEmulator() 30 | 31 | val stopTimeout = 20.second 32 | 33 | val maxBytes = 10000 34 | 35 | "collector-pubsub" should { 36 | "be able to parse the minimal config" in { 37 | val testName = "minimal" 38 | 39 | Containers.collector( 40 | "examples/config.pubsub.minimal.hocon", 41 | testName, 42 | "good", 43 | "bad" 44 | ).use { collector => 45 | IO(collector.container.getLogs() must contain("Service bound to address")) 46 | } 47 | } 48 | 49 | "emit the correct number of collector payloads and bad rows" in { 50 | val testName = "count" 51 | val nbGood = 1000 52 | val nbBad = 10 53 | val topicGood = s"${testName}-raw" 54 | val topicBad = s"${testName}-bad-1" 55 | 56 | Containers.collector( 57 | "pubsub/src/it/resources/collector.hocon", 58 | testName, 59 | topicGood, 60 | topicBad, 61 | envs = Map("MAX_BYTES" -> maxBytes.toString) 62 | ).use { collector => 63 | for { 64 | _ <- log(testName, "Sending data") 65 | _ <- EventGenerator.sendEvents( 66 | collector.host, 67 | collector.port, 68 | nbGood, 69 | nbBad, 70 | maxBytes 71 | ) 72 | _ <- log(testName, "Data sent. 
Waiting for collector to work") 73 | _ <- IO.sleep(5.second) 74 | _ <- log(testName, "Consuming collector's output") 75 | collectorOutput <- PubSub.consume( 76 | Containers.projectId, 77 | Containers.emulatorHost, 78 | Containers.emulatorHostPort, 79 | topicGood, 80 | topicBad 81 | ) 82 | _ <- printBadRows(testName, collectorOutput.bad) 83 | } yield { 84 | collectorOutput.good.size should beEqualTo(nbGood) 85 | collectorOutput.bad.size should beEqualTo(nbBad) 86 | } 87 | } 88 | } 89 | 90 | s"shutdown within $stopTimeout when it receives a SIGTERM" in { 91 | val testName = "stop" 92 | 93 | Containers.collector( 94 | "pubsub/src/it/resources/collector.hocon", 95 | testName, 96 | s"${testName}-raw", 97 | s"${testName}-bad-1" 98 | ).use { collector => 99 | val container = collector.container 100 | for { 101 | _ <- log(testName, "Sending signal") 102 | _ <- IO(container.getDockerClient().killContainerCmd(container.getContainerId()).withSignal("TERM").exec()) 103 | _ <- waitWhile[GenericContainer[_]](container, _.isRunning, stopTimeout) 104 | } yield { 105 | container.isRunning() must beFalse 106 | container.getLogs() must contain("Closing NIO1 channel") 107 | } 108 | } 109 | } 110 | 111 | "start with /sink-health unhealthy and insert pending events when topics become available" in { 112 | val testName = "sink-health" 113 | val nbGood = 10 114 | val nbBad = 10 115 | val topicGood = s"${testName}-raw" 116 | val topicBad = s"${testName}-bad-1" 117 | 118 | Containers.collector( 119 | "pubsub/src/it/resources/collector.hocon", 120 | testName, 121 | topicGood, 122 | topicBad, 123 | createTopics = false, 124 | envs = Map("MAX_BYTES" -> maxBytes.toString) 125 | ).use { collector => 126 | val uri = Uri.unsafeFromString(s"http://${collector.host}:${collector.port}/sink-health") 127 | val request = Request[IO](Method.GET, uri) 128 | 129 | for { 130 | _ <- log(testName, "Checking /sink-health before creating the topics") 131 | statusBeforeCreate <- Http.status(request) 132 | _ <- log(testName, "Sending events before creating the topics") 133 | _ <- EventGenerator.sendEvents( 134 | collector.host, 135 | collector.port, 136 | nbGood, 137 | nbBad, 138 | maxBytes 139 | ) 140 | _ <- log(testName, "Creating topics") 141 | _ <- PubSub.createTopicsAndSubscriptions( 142 | Containers.projectId, 143 | Containers.emulatorHost, 144 | Containers.emulatorHostPort, 145 | List(topicGood, topicBad) 146 | ) 147 | _ <- IO.sleep(10.second) 148 | _ <- log(testName, "Checking /sink-health after creating the topics") 149 | statusAfterCreate <- Http.status(request) 150 | collectorOutput <- PubSub.consume( 151 | Containers.projectId, 152 | Containers.emulatorHost, 153 | Containers.emulatorHostPort, 154 | topicGood, 155 | topicBad 156 | ) 157 | _ <- printBadRows(testName, collectorOutput.bad) 158 | } yield { 159 | statusBeforeCreate should beEqualTo(Status.ServiceUnavailable) 160 | statusAfterCreate should beEqualTo(Status.Ok) 161 | collectorOutput.good.size should beEqualTo(nbGood) 162 | collectorOutput.bad.size should beEqualTo(nbBad) 163 | } 164 | } 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /pubsub/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | { 2 | streams { 3 | //New object-like style 4 | good = ${streams.sink} 5 | bad = ${streams.sink} 6 | 7 | sink { 8 | threadPoolSize = 10 9 | 10 | backoffPolicy { 11 | minBackoff = 1000 12 | maxBackoff = 1000 13 | totalBackoff = 9223372036854 14 | 
multiplier = 2 15 | initialRpcTimeout = 10000 16 | maxRpcTimeout = 10000 17 | rpcTimeoutMultiplier = 2 18 | } 19 | 20 | maxBytes = 10000000 21 | 22 | startupCheckInterval = 1 second 23 | retryInterval = 10 seconds 24 | buffer = ${streams.buffer} 25 | gcpUserAgent { 26 | productName = "Snowplow OSS" 27 | } 28 | } 29 | 30 | buffer { 31 | byteLimit = 100000 32 | recordLimit = 40 33 | timeLimit = 1000 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /pubsub/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/PubSubCollector.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream 2 | 3 | import cats.effect._ 4 | import cats.effect.kernel.Resource 5 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 6 | import com.snowplowanalytics.snowplow.collector.core.{App, Config, Telemetry} 7 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.{PubSubSink, PubSubSinkConfig} 8 | 9 | object PubSubCollector extends App[PubSubSinkConfig](BuildInfo) { 10 | 11 | override def mkSinks(config: Config.Streams[PubSubSinkConfig]): Resource[IO, Sinks[IO]] = 12 | for { 13 | good <- PubSubSink.create[IO](config.good) 14 | bad <- PubSubSink.create[IO](config.bad) 15 | } yield Sinks(good, bad) 16 | 17 | override def telemetryInfo(config: Config.Streams[PubSubSinkConfig]): IO[Telemetry.TelemetryInfo] = 18 | IO( 19 | Telemetry.TelemetryInfo( 20 | region = None, 21 | cloud = Some("GCP"), 22 | unhashedInstallationId = Some(config.good.config.googleProjectId) 23 | ) 24 | ) 25 | } 26 | -------------------------------------------------------------------------------- /pubsub/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/BuilderOps.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 2 | 3 | import com.google.api.gax.core.NoCredentialsProvider 4 | import com.google.api.gax.grpc.GrpcTransportChannel 5 | import com.google.api.gax.rpc.FixedTransportChannelProvider 6 | import com.google.cloud.pubsub.v1.{Publisher, TopicAdminSettings} 7 | import io.grpc.ManagedChannelBuilder 8 | 9 | object BuilderOps { 10 | 11 | implicit class PublisherBuilderOps(val builder: Publisher.Builder) extends AnyVal { 12 | def setProvidersForEmulator(): Publisher.Builder = 13 | customEmulatorHost().fold(builder) { emulatorHost => 14 | builder 15 | .setChannelProvider(createCustomChannelProvider(emulatorHost)) 16 | .setCredentialsProvider(NoCredentialsProvider.create()) 17 | } 18 | } 19 | 20 | implicit class TopicAdminBuilderOps(val builder: TopicAdminSettings.Builder) extends AnyVal { 21 | def setProvidersForEmulator(): TopicAdminSettings.Builder = 22 | customEmulatorHost().fold(builder) { emulatorHost => 23 | builder 24 | .setTransportChannelProvider(createCustomChannelProvider(emulatorHost)) 25 | .setCredentialsProvider(NoCredentialsProvider.create()) 26 | } 27 | } 28 | 29 | private def customEmulatorHost(): Option[String] = 30 | sys.env.get("PUBSUB_EMULATOR_HOST") 31 | 32 | private def createCustomChannelProvider(emulatorHost: String): FixedTransportChannelProvider = { 33 | val channel = ManagedChannelBuilder.forTarget(emulatorHost).usePlaintext().build() 34 | FixedTransportChannelProvider.create(GrpcTransportChannel.create(channel)) 35 | } 36 | } 37 | 
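The emulator override above only takes effect when PUBSUB_EMULATOR_HOST is set, which is how the integration tests point the collector at the Pub/Sub emulator container. A hedged usage sketch of the PublisherBuilderOps extension; the project and topic names are illustrative:

import com.google.cloud.pubsub.v1.Publisher
import com.google.protobuf.ByteString
import com.google.pubsub.v1.{PubsubMessage, TopicName}
import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.BuilderOps._

object EmulatorPublishSketch {
  def main(args: Array[String]): Unit = {
    // With PUBSUB_EMULATOR_HOST set, the builder is rewired to a plaintext channel
    // with no credentials; otherwise it is left untouched.
    val publisher = Publisher
      .newBuilder(TopicName.of("my-project", "my-topic"))
      .setProvidersForEmulator()
      .build()

    val message = PubsubMessage.newBuilder().setData(ByteString.copyFromUtf8("hello")).build()
    publisher.publish(message).get() // blocks until the emulator acknowledges the message
    publisher.shutdown()
  }
}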
-------------------------------------------------------------------------------- /pubsub/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/PubSubHealthCheck.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 2 | 3 | import cats.effect.implicits.genSpawnOps 4 | import cats.effect.{Async, Ref, Resource, Sync} 5 | import cats.implicits._ 6 | import com.google.cloud.pubsub.v1.{TopicAdminClient, TopicAdminSettings} 7 | import com.google.pubsub.v1.{ProjectName, TopicName} 8 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.BuilderOps._ 9 | import org.typelevel.log4cats.Logger 10 | import org.typelevel.log4cats.slf4j.Slf4jLogger 11 | 12 | import scala.jdk.CollectionConverters._ 13 | import scala.util._ 14 | 15 | object PubSubHealthCheck { 16 | 17 | implicit private def unsafeLogger[F[_]: Sync]: Logger[F] = 18 | Slf4jLogger.getLogger[F] 19 | 20 | def run[F[_]: Async]( 21 | isHealthyState: Ref[F, Boolean], 22 | sinkConfig: PubSubSinkConfig, 23 | topicName: String 24 | ): Resource[F, Unit] = 25 | for { 26 | topicAdminClient <- createTopicAdminClient[F]() 27 | healthCheckTask = createHealthCheckTask[F](topicAdminClient, isHealthyState, sinkConfig, topicName) 28 | _ <- repeatInBackgroundUntilHealthy(isHealthyState, sinkConfig, healthCheckTask) 29 | } yield () 30 | 31 | private def repeatInBackgroundUntilHealthy[F[_]: Async]( 32 | isHealthyState: Ref[F, Boolean], 33 | sinkConfig: PubSubSinkConfig, 34 | healthCheckTask: F[Unit] 35 | ): Resource[F, Unit] = { 36 | val checkThenSleep = healthCheckTask *> Async[F].sleep(sinkConfig.startupCheckInterval) 37 | checkThenSleep.untilM_(isHealthyState.get).background.void 38 | } 39 | 40 | private def createHealthCheckTask[F[_]: Async]( 41 | topicAdminClient: TopicAdminClient, 42 | isHealthyState: Ref[F, Boolean], 43 | sinkConfig: PubSubSinkConfig, 44 | topicName: String 45 | ): F[Unit] = 46 | topicExists(topicAdminClient, sinkConfig.googleProjectId, topicName).flatMap { 47 | case Right(true) => 48 | Logger[F].info(s"Topic $topicName exists") *> isHealthyState.set(true) 49 | case Right(false) => 50 | Logger[F].error(s"Topic $topicName doesn't exist") 51 | case Left(err) => 52 | Logger[F].error(s"Error while checking if topic $topicName exists: ${err.getCause}") 53 | } 54 | 55 | private def createTopicAdminClient[F[_]: Sync](): Resource[F, TopicAdminClient] = { 56 | val builder = TopicAdminSettings.newBuilder().setProvidersForEmulator().build() 57 | Resource.make(Sync[F].delay(TopicAdminClient.create(builder)))(client => Sync[F].delay(client.close())) 58 | } 59 | 60 | private def topicExists[F[_]: Sync]( 61 | topicAdmin: TopicAdminClient, 62 | projectId: String, 63 | topicName: String 64 | ): F[Either[Throwable, Boolean]] = Sync[F].delay { 65 | Either 66 | .catchNonFatal(topicAdmin.listTopics(ProjectName.of(projectId))) 67 | .leftMap(new RuntimeException(s"Can't list topics", _)) 68 | .map(_.iterateAll.asScala.toList.map(_.getName())) 69 | .flatMap { topics => 70 | topics.contains(TopicName.of(projectId, topicName).toString).asRight 71 | } 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /pubsub/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/PubSubSink.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 
4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 12 | 13 | import cats.Parallel 14 | import cats.effect.implicits.genSpawnOps 15 | import cats.effect.{Async, Ref, Resource, Sync} 16 | import cats.implicits._ 17 | import com.google.api.gax.retrying.RetrySettings 18 | import com.google.api.gax.rpc.{ApiException, FixedHeaderProvider} 19 | import com.permutive.pubsub.producer.Model.{ProjectId, Topic} 20 | import com.permutive.pubsub.producer.encoder.MessageEncoder 21 | import com.permutive.pubsub.producer.grpc.{GooglePubsubProducer, PubsubProducerConfig} 22 | import com.permutive.pubsub.producer.{Model, PubsubProducer} 23 | import com.snowplowanalytics.snowplow.collector.core.{Config, Sink} 24 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.BuilderOps._ 25 | import org.threeten.bp.Duration 26 | import org.typelevel.log4cats.Logger 27 | import org.typelevel.log4cats.slf4j.Slf4jLogger 28 | import retry.RetryPolicies 29 | import retry.syntax.all._ 30 | 31 | import scala.concurrent.duration.{DurationLong, FiniteDuration} 32 | import scala.util._ 33 | 34 | class PubSubSink[F[_]: Async: Parallel: Logger] private ( 35 | override val maxBytes: Int, 36 | isHealthyState: Ref[F, Boolean], 37 | producer: PubsubProducer[F, Array[Byte]], 38 | retryInterval: FiniteDuration, 39 | topicName: String 40 | ) extends Sink[F] { 41 | 42 | override def storeRawEvents(events: List[Array[Byte]], key: String): F[Unit] = 43 | produceBatch(events).start.void 44 | 45 | override def isHealthy: F[Boolean] = isHealthyState.get 46 | 47 | private def produceBatch(events: List[Array[Byte]]): F[Unit] = 48 | events.parTraverse_ { event => 49 | produceSingleEvent(event) 50 | } *> isHealthyState.set(true) 51 | 52 | private def produceSingleEvent(event: Array[Byte]): F[Model.MessageId] = 53 | producer 54 | .produce(event) 55 | .retryingOnAllErrors( 56 | policy = RetryPolicies.constantDelay(retryInterval), 57 | onError = (error, _) => handlePublishError(error) 58 | ) 59 | 60 | private def handlePublishError(error: Throwable): F[Unit] = 61 | isHealthyState.set(false) *> Logger[F].error(createErrorMessage(error)) 62 | 63 | private def createErrorMessage(error: Throwable): String = 64 | error match { 65 | case apiEx: ApiException => 66 | val retryable = if (apiEx.isRetryable) "retryable" else "non-retryable" 67 | s"Publishing message to $topicName failed with code ${apiEx.getStatusCode} and $retryable error: ${apiEx.getMessage}" 68 | case throwable => s"Publishing message to $topicName failed with error: ${throwable.getMessage}" 69 | } 70 | } 71 | 72 | object PubSubSink { 73 | 74 | implicit private def unsafeLogger[F[_]: Sync]: Logger[F] = 75 | Slf4jLogger.getLogger[F] 76 | 77 | implicit val byteArrayEncoder: MessageEncoder[Array[Byte]] = 78 | new MessageEncoder[Array[Byte]] { 79 | def encode(a: Array[Byte]): Either[Throwable, Array[Byte]] = 80 | a.asRight 81 | } 82 | 83 | def create[F[_]: Async: Parallel]( 84 | sinkConfig: Config.Sink[PubSubSinkConfig] 85 | ): Resource[F, Sink[F]] = 86 | for { 87 | isHealthyState <- Resource.eval(Ref.of[F, Boolean](false)) 88 | producer <- createProducer[F](sinkConfig.config, 
sinkConfig.name, sinkConfig.buffer) 89 | _ <- PubSubHealthCheck.run(isHealthyState, sinkConfig.config, sinkConfig.name) 90 | } yield new PubSubSink( 91 | sinkConfig.config.maxBytes, 92 | isHealthyState, 93 | producer, 94 | sinkConfig.config.retryInterval, 95 | sinkConfig.name 96 | ) 97 | 98 | private def createProducer[F[_]: Async]( 99 | sinkConfig: PubSubSinkConfig, 100 | topicName: String, 101 | bufferConfig: Config.Buffer 102 | ): Resource[F, PubsubProducer[F, Array[Byte]]] = { 103 | val config = PubsubProducerConfig[F]( 104 | batchSize = bufferConfig.recordLimit, 105 | requestByteThreshold = Some(bufferConfig.byteLimit), 106 | delayThreshold = bufferConfig.timeLimit.millis, 107 | onFailedTerminate = err => Logger[F].error(err)("PubSub sink termination error"), 108 | customizePublisher = Some { 109 | _.setRetrySettings(retrySettings(sinkConfig.backoffPolicy)) 110 | .setHeaderProvider(FixedHeaderProvider.create("User-Agent", createUserAgent(sinkConfig.gcpUserAgent))) 111 | .setProvidersForEmulator() 112 | } 113 | ) 114 | 115 | GooglePubsubProducer.of[F, Array[Byte]](ProjectId(sinkConfig.googleProjectId), Topic(topicName), config) 116 | } 117 | 118 | private[sinks] def createUserAgent(gcpUserAgent: PubSubSinkConfig.GcpUserAgent): String = 119 | s"${gcpUserAgent.productName}/collector (GPN:Snowplow;)" 120 | 121 | private def retrySettings(backoffPolicy: PubSubSinkConfig.BackoffPolicy): RetrySettings = 122 | RetrySettings 123 | .newBuilder() 124 | .setInitialRetryDelay(Duration.ofMillis(backoffPolicy.minBackoff)) 125 | .setMaxRetryDelay(Duration.ofMillis(backoffPolicy.maxBackoff)) 126 | .setRetryDelayMultiplier(backoffPolicy.multiplier) 127 | .setTotalTimeout(Duration.ofMillis(backoffPolicy.totalBackoff)) 128 | .setInitialRpcTimeout(Duration.ofMillis(backoffPolicy.initialRpcTimeout)) 129 | .setRpcTimeoutMultiplier(backoffPolicy.rpcTimeoutMultiplier) 130 | .setMaxRpcTimeout(Duration.ofMillis(backoffPolicy.maxRpcTimeout)) 131 | .build() 132 | } 133 | -------------------------------------------------------------------------------- /pubsub/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/PubSubSinkConfig.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 2 | 3 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.PubSubSinkConfig._ 4 | import io.circe.Decoder 5 | import io.circe.config.syntax.durationDecoder 6 | import io.circe.generic.semiauto._ 7 | 8 | import scala.concurrent.duration.FiniteDuration 9 | 10 | final case class PubSubSinkConfig( 11 | maxBytes: Int, 12 | googleProjectId: String, 13 | backoffPolicy: BackoffPolicy, 14 | startupCheckInterval: FiniteDuration, 15 | retryInterval: FiniteDuration, 16 | gcpUserAgent: GcpUserAgent 17 | ) 18 | 19 | object PubSubSinkConfig { 20 | 21 | final case class BackoffPolicy( 22 | minBackoff: Long, 23 | maxBackoff: Long, 24 | totalBackoff: Long, 25 | multiplier: Double, 26 | initialRpcTimeout: Long, 27 | maxRpcTimeout: Long, 28 | rpcTimeoutMultiplier: Double 29 | ) 30 | 31 | final case class GcpUserAgent(productName: String) 32 | 33 | implicit val configDecoder: Decoder[PubSubSinkConfig] = deriveDecoder[PubSubSinkConfig] 34 | implicit val backoffPolicyConfigDecoder: Decoder[BackoffPolicy] = 35 | deriveDecoder[BackoffPolicy] 36 | implicit val gcpUserAgentDecoder: Decoder[GcpUserAgent] = deriveDecoder[GcpUserAgent] 37 | } 38 | 
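The backoff fields are plain Longs interpreted as milliseconds; PubSubSink wraps them into RetrySettings via Duration.ofMillis. A small sketch spelling out the defaults from the pubsub application.conf shown earlier (the very large totalBackoff effectively means "keep retrying"):

import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.PubSubSinkConfig.BackoffPolicy

// Values copied from pubsub/src/main/resources/application.conf; all durations are milliseconds.
val defaultBackoff = BackoffPolicy(
  minBackoff           = 1000L,
  maxBackoff           = 1000L,
  totalBackoff         = 9223372036854L,
  multiplier           = 2.0,
  initialRpcTimeout    = 10000L,
  maxRpcTimeout        = 10000L,
  rpcTimeoutMultiplier = 2.0
)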
-------------------------------------------------------------------------------- /pubsub/src/test/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/GcpUserAgentSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 12 | 13 | import java.util.regex.Pattern 14 | 15 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.PubSubSinkConfig._ 16 | 17 | import org.specs2.mutable.Specification 18 | 19 | class GcpUserAgentSpec extends Specification { 20 | 21 | "createUserAgent" should { 22 | "create user agent string correctly" in { 23 | val gcpUserAgent = GcpUserAgent(productName = "Snowplow OSS") 24 | val resultUserAgent = PubSubSink.createUserAgent(gcpUserAgent) 25 | val expectedUserAgent = s"Snowplow OSS/collector (GPN:Snowplow;)" 26 | 27 | val userAgentRegex = Pattern.compile( 28 | """(?iU)(?:[^\(\)\/]+\/[^\/]+\s+)*(?:[^\s][^\(\)\/]+\/[^\/]+\s?\([^\(\)]*)gpn:(.*)[;\)]""" 29 | ) 30 | val matcher = userAgentRegex.matcher(resultUserAgent) 31 | val matched = if (matcher.find()) Some(matcher.group(1)) else None 32 | val expectedMatched = "Snowplow;" 33 | 34 | resultUserAgent must beEqualTo(expectedUserAgent) 35 | matched must beSome(expectedMatched) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /sqs/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | collector { 2 | streams { 3 | good = ${collector.streams.sink} 4 | bad = ${collector.streams.sink} 5 | sink { 6 | enabled = sqs 7 | threadPoolSize = 10 8 | 9 | backoffPolicy { 10 | minBackoff = 500 11 | maxBackoff = 1500 12 | maxRetries = 3 13 | } 14 | 15 | maxBytes = 192000 16 | 17 | startupCheckInterval = 1 second 18 | buffer = ${collector.streams.buffer} 19 | } 20 | 21 | buffer { 22 | byteLimit = 3145728 23 | recordLimit = 500 24 | timeLimit = 5000 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /sqs/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/SqsCollector.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 
10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | 13 | import java.util.concurrent.ScheduledThreadPoolExecutor 14 | import cats.effect.{IO, Resource} 15 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 16 | import com.snowplowanalytics.snowplow.collector.core.{App, Config, Telemetry} 17 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks._ 18 | 19 | object SqsCollector extends App[SqsSinkConfig](BuildInfo) { 20 | 21 | override def mkSinks(config: Config.Streams[SqsSinkConfig]): Resource[IO, Sinks[IO]] = { 22 | val threadPoolExecutor = new ScheduledThreadPoolExecutor(config.good.config.threadPoolSize) 23 | for { 24 | good <- SqsSink.create[IO](config.good, threadPoolExecutor) 25 | bad <- SqsSink.create[IO](config.bad, threadPoolExecutor) 26 | } yield Sinks(good, bad) 27 | } 28 | 29 | override def telemetryInfo(config: Config.Streams[SqsSinkConfig]): IO[Telemetry.TelemetryInfo] = 30 | TelemetryUtils 31 | .getAccountId(config) 32 | .map(id => 33 | Telemetry.TelemetryInfo( 34 | region = Some(config.good.config.region), 35 | cloud = Some("AWS"), 36 | unhashedInstallationId = id 37 | ) 38 | ) 39 | } 40 | -------------------------------------------------------------------------------- /sqs/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/TelemetryUtils.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream 2 | 3 | import cats.effect.{IO, Resource} 4 | import software.amazon.awssdk.services.sqs.model._ 5 | import com.snowplowanalytics.snowplow.collector.core.Config 6 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks._ 7 | 8 | object TelemetryUtils { 9 | 10 | def getAccountId(config: Config.Streams[SqsSinkConfig]): IO[Option[String]] = 11 | Resource 12 | .make( 13 | IO(SqsSink.createSqsClient(config.good.config.region)).rethrow 14 | )(c => IO(c.close())) 15 | .use { client => 16 | IO { 17 | val req = GetQueueUrlRequest.builder().queueName(config.good.name).build() 18 | val sqsQueueUrl = client.getQueueUrl(req).queueUrl() 19 | Some(extractAccountId(sqsQueueUrl)) 20 | } 21 | } 22 | .handleError(_ => None) 23 | 24 | def extractAccountId(sqsQueueUrl: String): String = 25 | sqsQueueUrl.split("/")(3) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /sqs/src/main/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/SqsSinkConfig.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 2 | 3 | import io.circe.Decoder 4 | import io.circe.generic.semiauto._ 5 | 6 | final case class SqsSinkConfig( 7 | maxBytes: Int, 8 | region: String, 9 | backoffPolicy: SqsSinkConfig.BackoffPolicyConfig, 10 | threadPoolSize: Int 11 | ) 12 | 13 | object SqsSinkConfig { 14 | final case class BackoffPolicyConfig(minBackoff: Long, maxBackoff: Long, maxRetries: Int) 15 | 16 | implicit val configDecoder: Decoder[SqsSinkConfig] = deriveDecoder[SqsSinkConfig] 17 | implicit val backoffPolicyDecoder: Decoder[BackoffPolicyConfig] = deriveDecoder[BackoffPolicyConfig] 18 | } 19 | -------------------------------------------------------------------------------- /sqs/src/test/scala/com.snowplowanalytics.snowplow.collectors.scalastream/SqsConfigSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2013-present 
Snowplow Analytics Ltd. 3 | * All rights reserved. 4 | * 5 | * This software is made available by Snowplow Analytics, Ltd., 6 | * under the terms of the Snowplow Limited Use License Agreement, Version 1.1 7 | * located at https://docs.snowplow.io/limited-use-license-1.1 8 | * BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION 9 | * OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT. 10 | */ 11 | package com.snowplowanalytics.snowplow.collectors.scalastream 12 | 13 | import cats.effect.testing.specs2.CatsEffect 14 | import cats.effect.{ExitCode, IO} 15 | import com.snowplowanalytics.snowplow.collector.core.{Config, ConfigParser} 16 | import com.snowplowanalytics.snowplow.collectors.scalastream.sinks.SqsSinkConfig 17 | import org.http4s.SameSite 18 | import org.specs2.mutable.Specification 19 | 20 | import java.nio.file.Paths 21 | import scala.concurrent.duration.DurationInt 22 | 23 | class SqsConfigSpec extends Specification with CatsEffect { 24 | 25 | "Config parser" should { 26 | "be able to parse extended sqs config" in { 27 | assert( 28 | resource = "/config.sqs.extended.hocon", 29 | expectedResult = Right( 30 | SqsConfigSpec 31 | .expectedConfig 32 | .copy( 33 | monitoring = Config.Monitoring( 34 | Config.Metrics( 35 | SqsConfigSpec.expectedConfig.monitoring.metrics.statsd.copy(tags = Map("app" -> "collector")) 36 | ) 37 | ) 38 | ) 39 | ) 40 | ) 41 | } 42 | "be able to parse minimal sqs config" in { 43 | assert( 44 | resource = "/config.sqs.minimal.hocon", 45 | expectedResult = Right(SqsConfigSpec.expectedConfig) 46 | ) 47 | } 48 | } 49 | 50 | private def assert(resource: String, expectedResult: Either[ExitCode, Config[SqsSinkConfig]]) = { 51 | val path = Paths.get(getClass.getResource(resource).toURI) 52 | ConfigParser.fromPath[IO, SqsSinkConfig](Some(path)).value.map { result => 53 | result must beEqualTo(expectedResult) 54 | } 55 | } 56 | } 57 | 58 | object SqsConfigSpec { 59 | 60 | private val expectedConfig = Config[SqsSinkConfig]( 61 | interface = "0.0.0.0", 62 | port = 8080, 63 | paths = Map.empty[String, String], 64 | p3p = Config.P3P( 65 | policyRef = "/w3c/p3p.xml", 66 | CP = "NOI DSP COR NID PSA OUR IND COM NAV STA" 67 | ), 68 | crossDomain = Config.CrossDomain( 69 | enabled = false, 70 | domains = List("*"), 71 | secure = true 72 | ), 73 | cookie = Config.Cookie( 74 | enabled = true, 75 | expiration = 365.days, 76 | name = "sp", 77 | domains = List.empty, 78 | fallbackDomain = None, 79 | secure = true, 80 | httpOnly = true, 81 | sameSite = Some(SameSite.None), 82 | clientCookieName = None 83 | ), 84 | doNotTrackCookie = Config.DoNotTrackCookie( 85 | enabled = false, 86 | name = "", 87 | value = "" 88 | ), 89 | cookieBounce = Config.CookieBounce( 90 | enabled = false, 91 | name = "n3pc", 92 | fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000", 93 | forwardedProtocolHeader = None 94 | ), 95 | redirectMacro = Config.RedirectMacro( 96 | enabled = false, 97 | placeholder = None 98 | ), 99 | rootResponse = Config.RootResponse( 100 | enabled = false, 101 | statusCode = 302, 102 | headers = Map.empty[String, String], 103 | body = "" 104 | ), 105 | cors = Config.CORS(1.hour), 106 | monitoring = Config.Monitoring( 107 | Config.Metrics( 108 | Config.Statsd(false, "localhost", 8125, 10.seconds, "snowplow.collector", Map.empty) 109 | ) 110 | ), 111 | ssl = Config.SSL(enable = false, redirect = false, port = 443), 112 | hsts = Config.HSTS(enable = false, maxAge = 365.days), 113 | enableDefaultRedirect = false, 114 | redirectDomains
= Set.empty, 115 | preTerminationPeriod = 10.seconds, 116 | networking = Config.Networking( 117 | maxConnections = 1024, 118 | idleTimeout = 610.seconds, 119 | responseHeaderTimeout = 30.seconds, 120 | maxRequestLineLength = 20480, 121 | maxHeadersLength = 40960, 122 | maxPayloadSize = 1048576, 123 | dropPayloadSize = 2097152 124 | ), 125 | streams = Config.Streams( 126 | useIpAddressAsPartitionKey = false, 127 | good = Config.Sink( 128 | name = "good", 129 | buffer = Config.Buffer( 130 | byteLimit = 3145728, 131 | recordLimit = 500, 132 | timeLimit = 5000 133 | ), 134 | config = SqsSinkConfig( 135 | maxBytes = 192000, 136 | region = "eu-central-1", 137 | backoffPolicy = SqsSinkConfig.BackoffPolicyConfig( 138 | minBackoff = 500, 139 | maxBackoff = 1500, 140 | maxRetries = 3 141 | ), 142 | threadPoolSize = 10 143 | ) 144 | ), 145 | bad = Config.Sink( 146 | name = "bad", 147 | buffer = Config.Buffer( 148 | byteLimit = 3145728, 149 | recordLimit = 500, 150 | timeLimit = 5000 151 | ), 152 | config = SqsSinkConfig( 153 | maxBytes = 192000, 154 | region = "eu-central-1", 155 | backoffPolicy = SqsSinkConfig.BackoffPolicyConfig( 156 | minBackoff = 500, 157 | maxBackoff = 1500, 158 | maxRetries = 3 159 | ), 160 | threadPoolSize = 10 161 | ) 162 | ) 163 | ), 164 | telemetry = Config.Telemetry( 165 | disable = false, 166 | interval = 60.minutes, 167 | method = "POST", 168 | url = "telemetry-g.snowplowanalytics.com", 169 | port = 443, 170 | secure = true, 171 | userProvidedId = None, 172 | moduleName = None, 173 | moduleVersion = None, 174 | instanceId = None, 175 | autoGeneratedId = None 176 | ), 177 | license = Config.License(accept = true) 178 | ) 179 | 180 | } 181 | -------------------------------------------------------------------------------- /sqs/src/test/scala/com.snowplowanalytics.snowplow.collectors.scalastream/TelemetryUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream 2 | 3 | import org.specs2.mutable.Specification 4 | 5 | class TelemetryUtilsSpec extends Specification { 6 | 7 | "extractAccountId" should { 8 | "be able to extract account id from sqs queue url successfully" in { 9 | val queueUrl = "https://sqs.region.amazonaws.com/123456789/queue" 10 | TelemetryUtils.extractAccountId(queueUrl) must beEqualTo("123456789") 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /stdout/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | collector { 2 | streams { 3 | good = ${collector.streams.sink} 4 | bad = ${collector.streams.sink} 5 | sink { 6 | maxBytes = 1000000000 7 | buffer = ${collector.streams.buffer} 8 | } 9 | buffer { 10 | byteLimit = 3145728 11 | recordLimit = 500 12 | timeLimit = 5000 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /stdout/src/main/scala/com.snowplowanalytics.snowplow.collector.stdout/PrintingSink.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.stdout 2 | 3 | import java.io.PrintStream 4 | import java.util.Base64 5 | 6 | import cats.implicits._ 7 | 8 | import cats.effect.Sync 9 | 10 | import com.snowplowanalytics.snowplow.collector.core.Sink 11 | 12 | class PrintingSink[F[_]: Sync]( 13 | maxByteS: Int, 14 | stream: PrintStream 15 | ) extends Sink[F] { 16 | private val encoder: 
Base64.Encoder = Base64.getEncoder.withoutPadding() 17 | 18 | override val maxBytes: Int = maxByteS 19 | override def isHealthy: F[Boolean] = Sync[F].pure(true) 20 | 21 | override def storeRawEvents(events: List[Array[Byte]], key: String): F[Unit] = 22 | events.traverse_ { event => 23 | Sync[F].delay { 24 | stream.println(encoder.encodeToString(event)) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /stdout/src/main/scala/com.snowplowanalytics.snowplow.collector.stdout/SinkConfig.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.stdout 2 | 3 | import io.circe.Decoder 4 | import io.circe.generic.semiauto._ 5 | 6 | final case class SinkConfig( 7 | maxBytes: Int 8 | ) 9 | 10 | object SinkConfig { 11 | implicit val configDecoder: Decoder[SinkConfig] = deriveDecoder[SinkConfig] 12 | } 13 | -------------------------------------------------------------------------------- /stdout/src/main/scala/com.snowplowanalytics.snowplow.collector.stdout/StdoutCollector.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collector.stdout 2 | 3 | import cats.effect.IO 4 | import cats.effect.kernel.Resource 5 | import com.snowplowanalytics.snowplow.collector.core.model.Sinks 6 | import com.snowplowanalytics.snowplow.collector.core.{App, Config, Telemetry} 7 | 8 | object StdoutCollector extends App[SinkConfig](BuildInfo) { 9 | 10 | override def mkSinks(config: Config.Streams[SinkConfig]): Resource[IO, Sinks[IO]] = { 11 | val good = new PrintingSink[IO](config.good.config.maxBytes, System.out) 12 | val bad = new PrintingSink[IO](config.bad.config.maxBytes, System.err) 13 | Resource.pure(Sinks(good, bad)) 14 | } 15 | 16 | override def telemetryInfo(config: Config.Streams[SinkConfig]): IO[Telemetry.TelemetryInfo] = 17 | IO(Telemetry.TelemetryInfo(None, None, None)) 18 | } 19 | -------------------------------------------------------------------------------- /stdout/src/test/scala/com.snowplowanalytics.snowplow.collectors.scalastream/sinks/PrintingSinkSpec.scala: -------------------------------------------------------------------------------- 1 | package com.snowplowanalytics.snowplow.collectors.scalastream.sinks 2 | 3 | import java.io.{ByteArrayOutputStream, PrintStream} 4 | import java.nio.charset.StandardCharsets 5 | 6 | import org.specs2.mutable.Specification 7 | 8 | import cats.effect.IO 9 | import cats.effect.unsafe.implicits.global 10 | 11 | import com.snowplowanalytics.snowplow.collector.stdout.PrintingSink 12 | 13 | class PrintingSinkSpec extends Specification { 14 | 15 | "Printing sink" should { 16 | "print provided bytes encoded as BASE64 string" in { 17 | val baos = new ByteArrayOutputStream() 18 | val sink = new PrintingSink[IO](Integer.MAX_VALUE, new PrintStream(baos)) 19 | val input = "Something" 20 | 21 | sink.storeRawEvents(List(input.getBytes(StandardCharsets.UTF_8)), "key").unsafeRunSync() 22 | 23 | baos.toString(StandardCharsets.UTF_8) must beEqualTo("U29tZXRoaW5n\n") // base64 of 'Something' + newline 24 | } 25 | } 26 | } 27 | --------------------------------------------------------------------------------
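Since PrintingSink writes one unpadded Base64 line per event, downstream tooling can recover the raw payload with a plain JDK decoder. A minimal sketch mirroring PrintingSinkSpec above:

import java.nio.charset.StandardCharsets
import java.util.Base64

object DecodeStdoutLine {
  def main(args: Array[String]): Unit = {
    // "U29tZXRoaW5n" is the encoded form of "Something" used in PrintingSinkSpec.
    val line    = "U29tZXRoaW5n"
    val decoded = new String(Base64.getDecoder.decode(line), StandardCharsets.UTF_8)
    assert(decoded == "Something")
  }
}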