├── .dockerignore ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── batch ├── .gitignore ├── build.gradle ├── config │ └── application.yml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── cookpad │ │ │ └── prism │ │ │ └── batch │ │ │ ├── CatalogCmd.java │ │ │ ├── CmdRunner.java │ │ │ ├── DropTableCmd.java │ │ │ ├── JobTime.java │ │ │ ├── ListStagingObjectsCmd.java │ │ │ ├── Main.java │ │ │ ├── ManifestGenerator.java │ │ │ ├── PrismBatchConf.java │ │ │ ├── UnlinkTableCmd.java │ │ │ └── catalog │ │ │ ├── Catalog.java │ │ │ ├── CatalogTable.java │ │ │ ├── DatabaseNameModifier.java │ │ │ └── UpsertPartitionRequest.java │ ├── jib │ │ └── app │ │ │ └── config │ │ │ └── application.yml │ └── resources │ │ ├── logback.xml │ │ └── sentry.properties │ └── test │ └── java │ └── com │ └── cookpad │ └── prism │ └── batch │ └── catalog │ └── CatalogTableTest.java ├── build.gradle ├── db ├── Gemfile ├── Gemfile.lock ├── Schemafile ├── database.yml ├── prism_merge_jobs.schema ├── prism_merge_ranges.schema ├── prism_partitions.schema ├── prism_small_objects.schema ├── prism_staging_objects.schema ├── prism_tables.schema ├── prism_tables_strload_streams.schema └── prism_unknown_staging_objects.schema ├── gc ├── .ruby-version ├── Dockerfile ├── Gemfile ├── Gemfile.lock ├── README.md ├── bin │ └── garbage-collect ├── config │ ├── development │ │ └── datasource.yml │ └── production │ │ └── datasource.yml └── lib │ └── prism │ ├── garbage_collect_application.rb │ └── garbage_collector.rb ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── merge ├── .gitignore ├── build.gradle ├── config │ └── application.yml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── cookpad │ │ │ └── prism │ │ │ └── merge │ │ │ ├── CmdRunner.java │ │ │ ├── DaemonCmd.java │ │ │ ├── Main.java │ │ │ ├── MergeJobDispatcher.java │ │ │ ├── MergeJobHandler.java │ │ │ ├── MergeJobQueue.java │ │ │ ├── MergeJobWorker.java │ 
│ │ ├── OneshotCmd.java │ │ │ ├── ParallelParquetMerger.java │ │ │ ├── ParquetFileMerger.java │ │ │ ├── ParquetMerger.java │ │ │ ├── PrismMergeConf.java │ │ │ ├── RebuildCmd.java │ │ │ └── downloader │ │ │ ├── DownloadedObjectSupplier.java │ │ │ ├── MergedObjectSupplierFactory.java │ │ │ └── SmallObjectSupplierFactory.java │ ├── jib │ │ └── app │ │ │ └── config │ │ │ └── application.yml │ └── resources │ │ ├── logback.xml │ │ └── sentry.properties │ └── test │ └── java │ └── com │ └── cookpad │ └── prism │ └── merge │ ├── MergeJobWorkerTest.java │ ├── ParallelParquetMergerTest.java │ ├── ParquetMergerTest.java │ └── downloader │ └── SmallObjectSupplierFactoryTest.java ├── settings.gradle ├── shared ├── .gitignore ├── Dockerfile ├── build.gradle └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── cookpad │ │ │ └── prism │ │ │ ├── Banner.java │ │ │ ├── PrismConf.java │ │ │ ├── SchemaBuilder.java │ │ │ ├── StepHandler.java │ │ │ ├── TempFile.java │ │ │ ├── dao │ │ │ ├── OneToMany.java │ │ │ ├── OneToOne.java │ │ │ ├── PacketStream.java │ │ │ ├── PacketStreamMapper.java │ │ │ ├── PrismMergeJob.java │ │ │ ├── PrismMergeJobMapper.java │ │ │ ├── PrismMergeRange.java │ │ │ ├── PrismMergeRangeMapper.java │ │ │ ├── PrismPartition.java │ │ │ ├── PrismPartitionMapper.java │ │ │ ├── PrismSmallObject.java │ │ │ ├── PrismSmallObjectMapper.java │ │ │ ├── PrismStagingObject.java │ │ │ ├── PrismStagingObjectMapper.java │ │ │ ├── PrismTable.java │ │ │ ├── PrismTableMapper.java │ │ │ ├── PrismUnknownStagingObject.java │ │ │ ├── PrismUnknownStagingObjectMapper.java │ │ │ └── StreamColumn.java │ │ │ ├── jsonl │ │ │ ├── JsonlReader.java │ │ │ ├── JsonlRecordReader.java │ │ │ └── converters │ │ │ │ ├── Converter.java │ │ │ │ ├── DefaultConverter.java │ │ │ │ ├── NonNullConverter.java │ │ │ │ ├── NullableConverter.java │ │ │ │ ├── PrimitiveConverter.java │ │ │ │ └── UnexpectedValueType.java │ │ │ ├── objectstore │ │ │ ├── MergedObjectStore.java │ │ │ ├── MergedPartitionManifest.java │ │ │ 
├── PartitionManifest.java │ │ │ ├── PrismObjectStore.java │ │ │ ├── PrismObjectStoreFactory.java │ │ │ ├── PrismTableLocator.java │ │ │ ├── PrismTableLocatorFactory.java │ │ │ └── SmallObjectStore.java │ │ │ └── record │ │ │ ├── PrismRecordMaterializer.java │ │ │ ├── Record.java │ │ │ ├── RecordConverter.java │ │ │ ├── RecordReadSupport.java │ │ │ ├── RecordReaderFactory.java │ │ │ ├── RecordTimestampComparator.java │ │ │ ├── RecordWriteSupport.java │ │ │ ├── RecordWriterBuilder.java │ │ │ ├── RecordWriterFactory.java │ │ │ ├── Schema.java │ │ │ ├── SizedValueType.java │ │ │ ├── UnsizedValueType.java │ │ │ ├── ValueKind.java │ │ │ ├── ValueListRecord.java │ │ │ ├── ValueType.java │ │ │ ├── partitioned │ │ │ ├── DateAttachedRecord.java │ │ │ ├── PartitionCollector.java │ │ │ ├── PartitionedRecord.java │ │ │ ├── PartitionedRecordWriter.java │ │ │ ├── PartitionedWriter.java │ │ │ └── SortedPartitionedWriter.java │ │ │ └── values │ │ │ ├── NonNullValue.java │ │ │ ├── NullValue.java │ │ │ ├── PrimitiveValue.java │ │ │ └── Value.java │ └── resources │ │ ├── application.yml │ │ ├── com │ │ └── cookpad │ │ │ └── prism │ │ │ └── dao │ │ │ ├── PacketStreamMapper.xml │ │ │ ├── PrismMergeJobMapper.xml │ │ │ ├── PrismMergeRangeMapper.xml │ │ │ ├── PrismPartitionMapper.xml │ │ │ ├── PrismSmallObjectMapper.xml │ │ │ ├── PrismStagingObjectMapper.xml │ │ │ ├── PrismTableMapper.xml │ │ │ └── PrismUnknownStagingObjectMapper.xml │ │ └── mybatis-config.xml │ └── test │ └── java │ └── com │ └── cookpad │ └── prism │ ├── jsonl │ └── JsonlRecordReaderTest.java │ └── objectstore │ ├── PrismObjectStoreTest.java │ └── S3TableLocatorTest.java ├── spotbugs-exclude.xml └── stream ├── .gitignore ├── README.md ├── build.gradle ├── config └── application.yml └── src ├── main ├── java │ └── com │ │ └── cookpad │ │ └── prism │ │ ├── objectstore │ │ └── StagingObjectStore.java │ │ └── stream │ │ ├── FileQueueEventDispatcherFactory.java │ │ ├── Main.java │ │ ├── ParquetConverter.java │ │ ├── 
PrismStreamConf.java │ │ ├── StagingObjectAttributes.java │ │ ├── events │ │ ├── DateRange.java │ │ ├── EventHandler.java │ │ ├── FileQueueEventDispatcher.java │ │ ├── SnsEnvelope.java │ │ ├── SqsEventDispatcher.java │ │ ├── StagingObjectDispatcher.java │ │ ├── StagingObjectEvent.java │ │ └── StagingObjectHandler.java │ │ └── filequeue │ │ ├── FileQueue.java │ │ └── S3QueueDownloader.java ├── jib │ └── app │ │ └── config │ │ └── application.yml └── resources │ ├── logback.xml │ └── sentry.properties └── test ├── java └── com │ └── cookpad │ └── prism │ └── stream │ ├── ParquetConverterTest.java │ ├── StagingObjectAttributesTest.java │ └── events │ ├── DateRangeTest.java │ ├── SqsEventDispatcherTest.java │ └── StagingObjectDispatcherTest.java └── resources └── com └── cookpad └── prism └── stream ├── small_object_22.parquet ├── small_object_23.parquet └── staging_object.gz /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | **/*.class 3 | 4 | # Mobile Tools for Java (J2ME) 5 | **/.mtj.tmp/ 6 | 7 | # Package Files # 8 | **/*.jar 9 | **/*.war 10 | **/*.ear 11 | 12 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 13 | **/hs_err_pid* 14 | 15 | .gradle 16 | */build/ 17 | gradle-app.setting 18 | 19 | !gradle/wrapper/gradle-wrapper.jar 20 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - gh-actions-test 8 | pull_request: 9 | 10 | jobs: 11 | test: 12 | name: gradle test 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | java: 18 | - 11 19 | # TODO: Support Java 17 20 | # - 17 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up JDK ${{ matrix.java }} 24 | uses: actions/setup-java@v3 25 | with: 26 | java-version: ${{ matrix.java 
}} 27 | distribution: corretto 28 | cache: gradle 29 | - name: Create fake REVISION file 30 | run: echo fake > REVISION 31 | - run: ./gradlew test --no-daemon 32 | docker: 33 | name: Docker 34 | runs-on: ubuntu-latest 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | component: 39 | - stream 40 | - merge 41 | - batch 42 | steps: 43 | - uses: actions/checkout@v3 44 | - name: Set up JDK 11 45 | uses: actions/setup-java@v3 46 | with: 47 | java-version: 11 48 | distribution: corretto 49 | cache: gradle 50 | - name: Create fake REVISION file 51 | run: echo fake > REVISION 52 | - name: Build ${{ matrix.component }} image 53 | run: ./gradlew shared:docker ${{ matrix.component }}:jibDockerBuild -Djib.to.image=prism-${{ matrix.component }} 54 | - name: Check Hadoop native library 55 | run: docker run --entrypoint '' prism-${{ matrix.component}}:fake java -cp '/app/resources:/app/classes:/app/libs/*' org.apache.hadoop.util.NativeLibraryChecker -a 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.jar 3 | 4 | .gradle 5 | build/ 6 | gradle-app.setting 7 | !gradle-wrapper.jar 8 | 9 | /REVISION 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Hidekazu Kobayashi, Minero Aoki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright 
notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prism: Redshift Spectrum Streaming Loader 2 | 3 | Prism is a scalable, fast Redshift Spectrum Streaming Loader. 4 | This software is developed at Cookpad during working hours. 5 | 6 | ## Prerequisites 7 | - OpenJDK 11 8 | - Tested with Amazon Corretto only, but other distributions like Eclipse Temurin should also work. 9 | 10 | ## Build 11 | ``` 12 | % ./gradlew build 13 | ``` 14 | 15 | ## Components 16 | - stream/: Prism Stream converts JSONL S3 objects to Parquet objects. 17 | - merge/: Prism Merge merges small Parquet objects into a large one. 18 | - batch/: Prism Batch Jobs update the Glue Catalog to reflect the latest partition info. 19 | - gc/: Prism GC deletes unused merged objects. 20 | 21 | ## Setup 22 | TBD 23 | 24 | ## License 25 | MIT license. See LICENSE file for details. 
26 | 27 | ## Authors 28 | - Hidekazu Kobayashi @koba789 (original author) 29 | - Minero Aoki @aamine (current maintainer) 30 | -------------------------------------------------------------------------------- /batch/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ -------------------------------------------------------------------------------- /batch/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id "org.springframework.boot" version "2.1.3.RELEASE" 3 | id "io.spring.dependency-management" version "1.0.7.RELEASE" 4 | id "com.google.cloud.tools.jib" version "3.4.5" 5 | } 6 | 7 | dependencies { 8 | implementation project(':shared') 9 | implementation group: 'org.springframework.boot', name: 'spring-boot-starter', version: '2.1.3.RELEASE' 10 | implementation group: 'org.springframework.boot', name: 'spring-boot-starter-logging', version: '2.1.3.RELEASE' 11 | implementation group: 'io.sentry', name: 'sentry-logback', version: '1.7.30' 12 | implementation group: 'org.postgresql', name: 'postgresql', version: '42.2.5' 13 | implementation group: 'com.amazonaws', name: 'aws-java-sdk-glue', version: '1.11.438' 14 | } 15 | 16 | jib { 17 | from { 18 | image = 'docker://prism-base' 19 | } 20 | to { 21 | tags = ["latest", rootProject.file("REVISION").text.trim()] 22 | } 23 | container { 24 | jvmFlags = [] 25 | workingDirectory = '/app' 26 | } 27 | } 28 | tasks.jib.dependsOn ':shared:docker' 29 | -------------------------------------------------------------------------------- /batch/config/application.yml: -------------------------------------------------------------------------------- 1 | # This config file is for *DEVELOPMENT* 2 | # Use batch/src/main/jib/app/config/application.yml or environment variables instead 3 | # if you need to change the config in production environment. 
4 | 5 | spring: 6 | main: 7 | banner-mode: "off" 8 | datasource: 9 | url: jdbc:postgresql://localhost:5432/prism 10 | driver-class-name: org.postgresql.Driver 11 | username: prism 12 | password: prism 13 | hikari: 14 | maximum-pool-size: 2 15 | minimum-idle: 1 16 | 17 | prism: 18 | bucket-name: prism-example-bucket 19 | prefix: "" 20 | catalog: 21 | database-prefix: "prism_test_" 22 | database-suffix: "" 23 | 24 | logging: 25 | level: 26 | root: INFO 27 | com.cookpad.prism: DEBUG 28 | com.cookpad.prism.dao: INFO 29 | com.cookpad.prism.dao.PrismTableMapper.drop: TRACE 30 | org.apache.parquet: WARN 31 | org.apache.hadoop: WARN 32 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/CmdRunner.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import java.util.List; 4 | 5 | import com.amazonaws.services.s3.AmazonS3URI; 6 | 7 | import org.slf4j.MDC; 8 | import org.springframework.boot.ApplicationArguments; 9 | import org.springframework.boot.CommandLineRunner; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import lombok.RequiredArgsConstructor; 14 | 15 | @RequiredArgsConstructor 16 | @Component 17 | public class CmdRunner implements CommandLineRunner { 18 | final private ApplicationContext ctx; 19 | private final ApplicationArguments args; 20 | 21 | private String getSingleOptionValue(String name) { 22 | List values = this.args.getOptionValues(name); 23 | if (values.size() == 0) { 24 | throw new RuntimeException(String.format("No %s are given", name)); 25 | } 26 | if (values.size() > 1) { 27 | throw new RuntimeException(String.format("Only one %s is allowed", name)); 28 | } 29 | return values.get(0); 30 | } 31 | 32 | @Override 33 | public void run(String... 
rawArgs) throws Exception { 34 | List nonopts = this.args.getNonOptionArgs(); 35 | if (nonopts.size() < 1) { 36 | throw new RuntimeException("Command line args must be exactly one"); 37 | } 38 | String cmdName = nonopts.get(0); 39 | MDC.put("cmd_name", cmdName); 40 | switch (cmdName) { 41 | case "CatalogCmd": 42 | case "sync": 43 | CatalogCmd catalogCmd = ctx.getBean(CatalogCmd.class); 44 | catalogCmd.run(); 45 | break; 46 | case "ls-s3-objects": 47 | String destS3UriString = this.getSingleOptionValue("dest-s3-uri"); 48 | AmazonS3URI destS3Uri = new AmazonS3URI(destS3UriString); 49 | String bucketName = this.getSingleOptionValue("bucket"); 50 | String keyStartx = this.getSingleOptionValue("key-startx"); 51 | String keyEndx = this.getSingleOptionValue("key-endx"); 52 | ListStagingObjectsCmd listStagingObjectsCmd = ctx.getBean(ListStagingObjectsCmd.class); 53 | listStagingObjectsCmd.run(destS3Uri, bucketName, keyStartx, keyEndx); 54 | break; 55 | case "unlink-table": 56 | String tableIdString = this.getSingleOptionValue("table"); 57 | int tableId = Integer.parseInt(tableIdString); 58 | UnlinkTableCmd unlinkTableCmd = ctx.getBean(UnlinkTableCmd.class); 59 | unlinkTableCmd.run(tableId); 60 | break; 61 | case "drop-table": 62 | String tableIdString2 = this.getSingleOptionValue("table"); 63 | int tableId2 = Integer.parseInt(tableIdString2); 64 | DropTableCmd dropTableCmd = ctx.getBean(DropTableCmd.class); 65 | dropTableCmd.run(tableId2); 66 | break; 67 | default: 68 | throw new RuntimeException("No such cmd: " + cmdName); 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/DropTableCmd.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import java.io.IOException; 4 | 5 | import com.cookpad.prism.dao.PrismTable; 6 | import com.cookpad.prism.dao.PrismTableMapper; 7 | import 
org.springframework.stereotype.Component; 8 | 9 | import lombok.RequiredArgsConstructor; 10 | import lombok.extern.slf4j.Slf4j; 11 | 12 | @RequiredArgsConstructor 13 | @Component 14 | @Slf4j 15 | public class DropTableCmd { 16 | private final PrismTableMapper tableMapper; 17 | 18 | public void run(int tableId) throws IOException { 19 | PrismTable table = this.tableMapper.find(tableId); 20 | if (table == null) { 21 | throw new RuntimeException(String.format("No table found for id: %d", tableId)); 22 | } 23 | log.info("Dropping table: {}.{}", table.getLogicalSchemaName(), table.getLogicalTableName()); 24 | this.tableMapper.drop(tableId); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/JobTime.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import java.time.Clock; 4 | import java.time.Instant; 5 | import java.time.LocalDateTime; 6 | import java.time.ZoneOffset; 7 | 8 | import org.springframework.stereotype.Component; 9 | 10 | import lombok.NonNull; 11 | import lombok.RequiredArgsConstructor; 12 | 13 | @RequiredArgsConstructor 14 | @Component 15 | public class JobTime { 16 | final private Clock clock; 17 | private Instant time; 18 | 19 | @NonNull 20 | public Instant getTime() { 21 | if (this.time == null) { 22 | this.time = this.clock.instant(); 23 | } 24 | return this.time; 25 | } 26 | 27 | public LocalDateTime getTimeInUTC() { 28 | return LocalDateTime.ofInstant(this.getTime(), ZoneOffset.UTC); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/ListStagingObjectsCmd.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | import java.io.PrintWriter; 6 | import 
java.nio.charset.StandardCharsets; 7 | import java.nio.file.Files; 8 | import java.nio.file.Path; 9 | 10 | import com.amazonaws.services.s3.AmazonS3; 11 | import com.amazonaws.services.s3.AmazonS3URI; 12 | import com.amazonaws.services.s3.model.ListObjectsV2Request; 13 | import com.amazonaws.services.s3.model.ListObjectsV2Result; 14 | import com.amazonaws.services.s3.model.PutObjectRequest; 15 | import com.amazonaws.services.s3.model.S3ObjectSummary; 16 | 17 | import org.springframework.stereotype.Component; 18 | 19 | import lombok.RequiredArgsConstructor; 20 | import lombok.extern.slf4j.Slf4j; 21 | 22 | @RequiredArgsConstructor 23 | @Component 24 | @Slf4j 25 | public class ListStagingObjectsCmd { 26 | private final AmazonS3 s3; 27 | 28 | public void run(AmazonS3URI destS3Uri, String bucketName, String keyStartx, String keyEndx) throws IOException { 29 | Path tmpPath = Files.createTempFile("prism-list-staging-objects-", ".txt").toAbsolutePath(); 30 | log.info("tmp file: {}", tmpPath.toString()); 31 | try (PrintWriter outputWriter = new PrintWriter(new FileWriter(tmpPath.toFile(), StandardCharsets.UTF_8))) { 32 | String continuationToken = null; 33 | pagination: do { 34 | ListObjectsV2Request req = new ListObjectsV2Request() 35 | .withBucketName(bucketName) 36 | .withContinuationToken(continuationToken) 37 | .withStartAfter(keyStartx); 38 | ListObjectsV2Result res = this.s3.listObjectsV2(req); 39 | for (S3ObjectSummary objectSummary : res.getObjectSummaries()) { 40 | String key = objectSummary.getKey(); 41 | if (key.compareTo(keyEndx) >= 0) { 42 | break pagination; 43 | } 44 | outputWriter.printf("s3://%s/%s%n", objectSummary.getBucketName(), key); 45 | } 46 | continuationToken = res.getNextContinuationToken(); 47 | } while(continuationToken != null); 48 | } 49 | PutObjectRequest putObjectRequest = new PutObjectRequest(destS3Uri.getBucket(), destS3Uri.getKey(), tmpPath.toFile()); 50 | this.s3.putObject(putObjectRequest); 51 | } 52 | } 53 | 
-------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/Main.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import java.time.Clock; 4 | 5 | import com.amazonaws.services.glue.AWSGlue; 6 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 7 | import com.amazonaws.services.s3.AmazonS3; 8 | import com.amazonaws.services.s3.AmazonS3ClientBuilder; 9 | 10 | import com.cookpad.prism.batch.catalog.DatabaseNameModifier; 11 | import com.cookpad.prism.objectstore.PrismObjectStoreFactory; 12 | import com.cookpad.prism.objectstore.PrismTableLocatorFactory; 13 | import org.mybatis.spring.annotation.MapperScan; 14 | import org.springframework.beans.factory.annotation.Autowired; 15 | import org.springframework.boot.WebApplicationType; 16 | import org.springframework.boot.autoconfigure.SpringBootApplication; 17 | import org.springframework.boot.builder.SpringApplicationBuilder; 18 | import org.springframework.context.annotation.Bean; 19 | 20 | import lombok.extern.slf4j.Slf4j; 21 | 22 | @SpringBootApplication 23 | @MapperScan(basePackages = "com.cookpad.prism.dao") 24 | @Slf4j 25 | public class Main { 26 | public static void main(String[] args) { 27 | try { 28 | new SpringApplicationBuilder(Main.class).web(WebApplicationType.NONE).run(args); 29 | } catch(Exception e) { 30 | log.error("Unhandled exception", e); 31 | throw e; 32 | } 33 | } 34 | 35 | @Bean 36 | public AmazonS3 s3() { 37 | return AmazonS3ClientBuilder.defaultClient(); 38 | } 39 | 40 | @Bean 41 | public AWSGlue glue() { 42 | return AWSGlueClientBuilder.defaultClient(); 43 | } 44 | 45 | @Bean 46 | public PrismTableLocatorFactory tableLocatorFactory(@Autowired PrismBatchConf prismConf) { 47 | return new PrismTableLocatorFactory(prismConf.getBucketName(), prismConf.getPrefix()); 48 | } 49 | 50 | @Bean 51 | public PrismObjectStoreFactory 
prismObjectStoreFactory(@Autowired PrismTableLocatorFactory tableLocatorFactory) { 52 | return new PrismObjectStoreFactory(this.s3(), tableLocatorFactory); 53 | } 54 | 55 | @Bean 56 | public DatabaseNameModifier databaseNameModifier(@Autowired PrismBatchConf prismConf) { 57 | String prefix = prismConf.getCatalog().getDatabasePrefix(); 58 | String suffix = prismConf.getCatalog().getDatabaseSuffix(); 59 | return new DatabaseNameModifier(prefix, suffix); 60 | } 61 | 62 | @Bean 63 | public Clock clock() { 64 | return Clock.systemDefaultZone(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/ManifestGenerator.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import java.time.LocalDate; 4 | import java.util.List; 5 | 6 | import com.cookpad.prism.objectstore.PartitionManifest; 7 | import com.cookpad.prism.objectstore.PrismTableLocator; 8 | import com.cookpad.prism.dao.PrismMergeRange; 9 | import org.springframework.stereotype.Component; 10 | 11 | import lombok.RequiredArgsConstructor; 12 | 13 | @RequiredArgsConstructor 14 | @Component 15 | public class ManifestGenerator { 16 | public PartitionManifest generate(PrismTableLocator tableLocator, LocalDate partitionDate, List mergeRanges) { 17 | PartitionManifest manifest = new PartitionManifest(); 18 | for (PrismMergeRange mergeRange : mergeRanges) { 19 | String key = tableLocator.getMergedObjectKey(partitionDate, mergeRange.getLowerBound(), mergeRange.getUpperBound()); 20 | String url = tableLocator.toFullUrl(key).toString(); 21 | manifest.add(url, mergeRange.getContentLength()); 22 | } 23 | return manifest; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/PrismBatchConf.java: 
-------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import com.cookpad.prism.PrismConf; 4 | 5 | import org.springframework.stereotype.Component; 6 | 7 | import lombok.NoArgsConstructor; 8 | import lombok.Getter; 9 | import lombok.Setter; 10 | import lombok.ToString; 11 | 12 | @Component 13 | @NoArgsConstructor 14 | @Getter 15 | @Setter 16 | @ToString(callSuper = true) 17 | public class PrismBatchConf extends PrismConf { 18 | Catalog catalog = new Catalog(); 19 | 20 | @NoArgsConstructor 21 | @Getter 22 | @Setter 23 | @ToString 24 | public static class Catalog { 25 | String databasePrefix = ""; 26 | String databaseSuffix = ""; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/UnlinkTableCmd.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch; 2 | 3 | import java.io.IOException; 4 | 5 | import com.amazonaws.services.glue.AWSGlue; 6 | import com.amazonaws.services.glue.model.DeleteTableRequest; 7 | 8 | import com.cookpad.prism.batch.catalog.DatabaseNameModifier; 9 | import com.cookpad.prism.dao.PrismTable; 10 | import com.cookpad.prism.dao.PrismTableMapper; 11 | import org.springframework.stereotype.Component; 12 | 13 | import lombok.RequiredArgsConstructor; 14 | import lombok.extern.slf4j.Slf4j; 15 | 16 | @RequiredArgsConstructor 17 | @Component 18 | @Slf4j 19 | public class UnlinkTableCmd { 20 | private final AWSGlue glue; 21 | private final PrismTableMapper tableMapper; 22 | private final DatabaseNameModifier databaseNameModifier; 23 | 24 | public void run(int tableId) throws IOException { 25 | PrismTable table = this.tableMapper.find(tableId); 26 | if (table == null) { 27 | throw new RuntimeException(String.format("No table found for id: %d", tableId)); 28 | } 29 | 30 | String databaseName = 
this.databaseNameModifier.getDatabaseName(table.getLogicalSchemaName()); 31 | String tableName = table.getLogicalTableName(); 32 | DeleteTableRequest deleteTableRequest = new DeleteTableRequest() 33 | .withDatabaseName(databaseName) 34 | .withName(tableName); 35 | log.info("Deleting table from Glue: {}.{}", databaseName, tableName); 36 | this.glue.deleteTable(deleteTableRequest); 37 | 38 | log.info("Unlinking table whose id is {}", tableId); 39 | this.tableMapper.unlink(tableId); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/catalog/DatabaseNameModifier.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch.catalog; 2 | 3 | import lombok.RequiredArgsConstructor; 4 | 5 | @RequiredArgsConstructor 6 | public class DatabaseNameModifier { 7 | final private String prefix; 8 | final private String suffix; 9 | 10 | public String getDatabaseName(String schemaName) { 11 | return String.format("%s%s%s", this.prefix, schemaName, this.suffix); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /batch/src/main/java/com/cookpad/prism/batch/catalog/UpsertPartitionRequest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch.catalog; 2 | 3 | import com.amazonaws.services.glue.model.CreatePartitionRequest; 4 | import com.amazonaws.services.glue.model.GetPartitionRequest; 5 | import com.amazonaws.services.glue.model.PartitionInput; 6 | import com.amazonaws.services.glue.model.UpdatePartitionRequest; 7 | 8 | import lombok.Getter; 9 | import lombok.RequiredArgsConstructor; 10 | 11 | @RequiredArgsConstructor 12 | public class UpsertPartitionRequest { 13 | @Getter 14 | private final String databaseName; 15 | @Getter 16 | private final String tableName; 17 | @Getter 18 | private final PartitionInput 
partitionInput; 19 | 20 | public GetPartitionRequest buildGetPartitionRequest() { 21 | return new GetPartitionRequest() 22 | .withDatabaseName(this.getDatabaseName()) 23 | .withTableName(this.getTableName()) 24 | .withPartitionValues(this.getPartitionInput().getValues()) 25 | ; 26 | } 27 | 28 | public CreatePartitionRequest buildCreatePartitionRequest() { 29 | return new CreatePartitionRequest() 30 | .withDatabaseName(this.getDatabaseName()) 31 | .withTableName(this.getTableName()) 32 | .withPartitionInput(this.getPartitionInput()) 33 | ; 34 | } 35 | 36 | public UpdatePartitionRequest buildUpdatePartitionRequest() { 37 | return new UpdatePartitionRequest() 38 | .withDatabaseName(this.getDatabaseName()) 39 | .withTableName(this.getTableName()) 40 | .withPartitionValueList(this.getPartitionInput().getValues()) 41 | .withPartitionInput(this.getPartitionInput()) 42 | ; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /batch/src/main/jib/app/config/application.yml: -------------------------------------------------------------------------------- 1 | # This config file is for *PRODUCTION* 2 | 3 | spring: 4 | main: 5 | banner-mode: "off" 6 | datasource: 7 | # database endpoint will be injected by environment variables 8 | driver-class-name: org.postgresql.Driver 9 | hikari: 10 | maximum-pool-size: 2 11 | minimum-idle: 1 12 | 13 | prism: 14 | bucket-name: prism-example-bucket 15 | prefix: "" 16 | catalog: 17 | database-prefix: "" 18 | database-suffix: "" 19 | 20 | logging: 21 | level: 22 | root: INFO 23 | com.cookpad.prism: INFO 24 | com.cookpad.prism.dao: INFO 25 | com.cookpad.prism.dao.PrismTableMapper.drop: TRACE 26 | org.apache.parquet: WARN 27 | org.apache.hadoop: WARN 28 | -------------------------------------------------------------------------------- /batch/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | System.err 5 | 6 | %d [%t] 
%5level: %logger: %msg%n 7 | 8 | 9 | 10 | 11 | 12 | WARN 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /batch/src/main/resources/sentry.properties: -------------------------------------------------------------------------------- 1 | mdctags=cmd_name 2 | tags=subsystem:batch 3 | stacktrace.app.packages=com.cookpad.prism 4 | -------------------------------------------------------------------------------- /batch/src/test/java/com/cookpad/prism/batch/catalog/CatalogTableTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.batch.catalog; 2 | 3 | import static org.junit.jupiter.api.Assertions.assertEquals; 4 | 5 | import java.time.LocalDateTime; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import com.cookpad.prism.SchemaBuilder; 10 | import com.cookpad.prism.SchemaBuilder.BadColumnsError; 11 | import com.cookpad.prism.objectstore.PrismTableLocatorFactory; 12 | import com.cookpad.prism.dao.PrismTable; 13 | import com.cookpad.prism.dao.StreamColumn; 14 | import org.junit.jupiter.api.Test; 15 | 16 | import lombok.val; 17 | 18 | public class CatalogTableTest { 19 | @Test 20 | void getTablePrefix() throws BadColumnsError { 21 | val databaseModifier = new DatabaseNameModifier("prefix_", "_suffix"); 22 | val prismTable = new PrismTable(200, null, null, "test_schema", "nanika_log", LocalDateTime.now(), 43200); 23 | val columns = List.of(new StreamColumn(0, "utc_event_time", null, "timestamp", null, "+00:00", "+09:00", null, true)); 24 | val schema = new SchemaBuilder().build(prismTable, columns); 25 | val locatorFactory = new PrismTableLocatorFactory("prism-sandbox", "global-prefix/"); 26 | val catalogTable = new CatalogTable.Factory(databaseModifier).build(schema, locatorFactory.build(prismTable)); 27 | assertEquals("nanika_log", catalogTable.buildTableInput().getName()); 28 | 
assertEquals("prefix_test_schema_suffix", catalogTable.buildDatabaseInput().getName()); 29 | } 30 | 31 | @Test 32 | void getTablePrefixWhenPhysicalNameIsSpecified() throws BadColumnsError { 33 | val databaseModifier = new DatabaseNameModifier("prefix_", "_suffix"); 34 | val prismTable = new PrismTable(200, "phy_test_schema", "phy_nanika_log", "test_schema", "nanika_log", LocalDateTime.now(), 43200); 35 | val columns = List.of(new StreamColumn(0, "utc_event_time", null, "timestamp", null, "+00:00", "+09:00", null, true)); 36 | val schema = new SchemaBuilder().build(prismTable, columns); 37 | val locatorFactory = new PrismTableLocatorFactory("prism-sandbox", "global-prefix/"); 38 | val catalogTable = new CatalogTable.Factory(databaseModifier).build(schema, locatorFactory.build(prismTable)); 39 | assertEquals("nanika_log", catalogTable.buildTableInput().getName()); 40 | assertEquals("prefix_test_schema_suffix", catalogTable.buildDatabaseInput().getName()); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id "com.github.spotbugs" version "2.0.0" apply false 3 | } 4 | 5 | subprojects { 6 | apply plugin: 'java' 7 | apply plugin: 'com.github.spotbugs' 8 | 9 | sourceCompatibility = 11 10 | targetCompatibility = 11 11 | 12 | repositories { 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | annotationProcessor group: 'org.projectlombok', name: 'lombok', version: '1.18.6' 18 | compileOnly group: 'org.projectlombok', name: 'lombok', version: '1.18.6' 19 | testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: '5.4.0' 20 | testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: '5.4.0' 21 | testImplementation group: 'org.junit.platform', name: 'junit-platform-launcher', version: '1.4.0' 22 | testImplementation group: 'org.junit.platform', name: 
'junit-platform-commons', version: '1.4.0' 23 | testImplementation group: "org.mockito", name: "mockito-core", version: "2.24.5" 24 | testAnnotationProcessor group: 'org.projectlombok', name: 'lombok', version: '1.18.6' 25 | testCompileOnly group: 'org.projectlombok', name: 'lombok', version: '1.18.6' 26 | } 27 | 28 | test { 29 | useJUnitPlatform() 30 | 31 | testLogging { 32 | events "passed", "skipped", "failed" 33 | } 34 | } 35 | 36 | tasks.withType(com.github.spotbugs.SpotBugsTask) { 37 | reports { 38 | xml.enabled = false 39 | html.enabled = true 40 | } 41 | } 42 | spotbugs { 43 | excludeFilter = rootProject.file("spotbugs-exclude.xml") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /db/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gem 'ridgepole' 3 | gem 'pg' 4 | -------------------------------------------------------------------------------- /db/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activemodel (5.2.3) 5 | activesupport (= 5.2.3) 6 | activerecord (5.2.3) 7 | activemodel (= 5.2.3) 8 | activesupport (= 5.2.3) 9 | arel (>= 9.0) 10 | activesupport (5.2.3) 11 | concurrent-ruby (~> 1.0, >= 1.0.2) 12 | i18n (>= 0.7, < 2) 13 | minitest (~> 5.1) 14 | tzinfo (~> 1.1) 15 | arel (9.0.0) 16 | concurrent-ruby (1.1.5) 17 | diffy (3.3.0) 18 | i18n (1.6.0) 19 | concurrent-ruby (~> 1.0) 20 | minitest (5.11.3) 21 | pg (1.1.4) 22 | ridgepole (0.7.7) 23 | activerecord (>= 5.0.1, < 6) 24 | diffy 25 | thread_safe (0.3.6) 26 | tzinfo (1.2.5) 27 | thread_safe (~> 0.1) 28 | 29 | PLATFORMS 30 | ruby 31 | 32 | DEPENDENCIES 33 | pg 34 | ridgepole 35 | 36 | BUNDLED WITH 37 | 2.0.1 38 | -------------------------------------------------------------------------------- /db/Schemafile: 
-------------------------------------------------------------------------------- 1 | Dir.glob("*.schema").each do |path| 2 | require path 3 | end 4 | -------------------------------------------------------------------------------- /db/database.yml: -------------------------------------------------------------------------------- 1 | development: 2 | adapter: postgresql 3 | host: localhost 4 | port: 5432 5 | database: prism 6 | username: prism 7 | password: prism 8 | encoding: unicode 9 | pool: 1 10 | -------------------------------------------------------------------------------- /db/prism_merge_jobs.schema: -------------------------------------------------------------------------------- 1 | create_table "prism_merge_jobs", primary_key: "prism_merge_job_id", id: :bigint, force: :cascade do |t| 2 | t.bigint "prism_partition_id", null: false 3 | t.datetime "schedule_time", null: false 4 | t.bigint "ongoing_mark", null: false 5 | t.datetime "heartbeat_time", null: true 6 | end 7 | 8 | add_index "prism_merge_jobs", ["prism_partition_id", "ongoing_mark"], 9 | name: "prism_merge_jobs_prism_partition_id_pending_idx", 10 | using: :btree, 11 | unique: true 12 | 13 | add_index "prism_merge_jobs", ["schedule_time"], 14 | name: "prism_merge_jobs_pending_idx", 15 | using: :btree, 16 | where: '(ongoing_mark = 0)' 17 | 18 | add_index "prism_merge_jobs", ["heartbeat_time"], 19 | name: "prism_merge_jobs_ongoing_idx", 20 | using: :btree, 21 | where: '(ongoing_mark > 0)' 22 | -------------------------------------------------------------------------------- /db/prism_merge_ranges.schema: -------------------------------------------------------------------------------- 1 | create_table "prism_merge_ranges", primary_key: "prism_merge_range_id", id: :bigint, force: :cascade do |t| 2 | t.bigint "prism_partition_id", null: false 3 | t.bigint "lower_bound", null: true 4 | t.bigint "upper_bound", null: false 5 | t.bigint "content_length", null: false 6 | t.datetime "create_time", null: false 7 
| t.datetime "update_time", null: false 8 | end 9 | 10 | add_index "prism_merge_ranges", ["lower_bound", "prism_partition_id"], 11 | name: "prism_merge_ranges_lower_bound_prism_partition_id_idx", 12 | unique: true, 13 | using: :btree 14 | 15 | add_index "prism_merge_ranges", ["upper_bound", "prism_partition_id"], 16 | name: "prism_merge_ranges_upper_bound_prism_partition_id_idx", 17 | unique: true, 18 | using: :btree 19 | 20 | add_index "prism_merge_ranges", ["prism_partition_id"], 21 | name: "prism_merge_ranges_prism_partition_id_idx", 22 | using: :btree 23 | 24 | def if_constr_not_exists(c, table_name, name, type) 25 | sql = <<~SQL 26 | select 27 | * 28 | from 29 | information_schema.table_constraints 30 | where 31 | table_schema = 'public' 32 | and table_name = '#{table_name}' 33 | and constraint_type = '#{type}' 34 | and constraint_name = '#{name}' 35 | ; 36 | SQL 37 | 38 | c.raw_connection.query(sql).to_a.length.zero? 39 | end 40 | 41 | execute(<<~SQL) do |c| 42 | alter table prism_merge_ranges 43 | add constraint prism_merge_ranges_prism_partition_id_fk 44 | foreign key (prism_partition_id, lower_bound) 45 | references prism_merge_ranges (prism_partition_id, upper_bound) 46 | SQL 47 | if_constr_not_exists(c, "prism_merge_ranges", "prism_merge_ranges_prism_partition_id_fk", "FOREIGN KEY") 48 | end 49 | 50 | execute(<<~SQL) do |c| 51 | alter table prism_merge_ranges 52 | add constraint prism_merge_ranges_upper_gt_lower_ck 53 | check (upper_bound > lower_bound) 54 | SQL 55 | if_constr_not_exists(c, "prism_merge_ranges", "prism_merge_ranges_upper_gt_lower_ck", "CHECK") 56 | end 57 | 58 | execute(<<~SQL) do |c| 59 | alter table prism_merge_ranges 60 | add constraint prism_merge_ranges_lower_links_upper_ck 61 | check (lower_bound is not null or (lower_bound is null and upper_bound = 0)) 62 | SQL 63 | if_constr_not_exists(c, "prism_merge_ranges", "prism_merge_ranges_lower_links_upper_ck", "CHECK") 64 | end 65 | 
-------------------------------------------------------------------------------- /db/prism_partitions.schema: -------------------------------------------------------------------------------- 1 | create_table "prism_partitions", primary_key: "prism_partition_id", id: :bigint, force: :cascade do |t| 2 | t.integer "prism_table_id", null: false 3 | t.date "partition_date", null: false 4 | t.bigint "current_manifest_version", null: false, default: -1 5 | t.bigint "ongoing_manifest_version", null: false, default: -1 # deprecated 6 | t.bigint "desired_manifest_version", null: false, default: 0 7 | t.bigint "last_live_object_id", null: true, default: nil 8 | t.boolean "switched", null: false, default: false 9 | end 10 | 11 | add_index "prism_partitions", ["prism_table_id", "partition_date"], 12 | name: "prism_partitions_prism_table_id_partition_date_idx", 13 | unique: true, 14 | using: :btree 15 | -------------------------------------------------------------------------------- /db/prism_small_objects.schema: -------------------------------------------------------------------------------- 1 | create_table "prism_small_objects", primary_key: "prism_small_object_id", id: :bigint, force: :cascade do |t| 2 | t.bigint "prism_staging_object_id", null: false 3 | t.bigint "prism_partition_id", null: false 4 | t.boolean "delayed", null: false 5 | t.bigint "content_length", null: false 6 | t.datetime "upload_start_time", null: false 7 | end 8 | 9 | add_index "prism_small_objects", ["prism_staging_object_id", "prism_partition_id"], 10 | name: "prism_small_objects_prism_staging_object_id_partition_id_idx", 11 | unique: true, 12 | using: :btree 13 | 14 | add_index "prism_small_objects", ["prism_partition_id"], 15 | name: "prism_small_objects_prism_partition_id_idx", 16 | using: :btree 17 | -------------------------------------------------------------------------------- /db/prism_staging_objects.schema: -------------------------------------------------------------------------------- 1 | 
create_table "prism_staging_objects", primary_key: "prism_staging_object_id", id: :bigint, force: :cascade do |t| 2 | t.string "bucket_name", limit: 63, null: false 3 | t.string "object_key", limit: 512, null: false 4 | t.datetime "send_time", null: false 5 | t.datetime "first_receive_time", null: false 6 | end 7 | 8 | add_index "prism_staging_objects", ["object_key", "bucket_name"], name: "prism_staging_objects_object_unique_idx", unique: true, using: :btree 9 | -------------------------------------------------------------------------------- /db/prism_tables.schema: -------------------------------------------------------------------------------- 1 | create_table "prism_tables", primary_key: "prism_table_id", force: :cascade do |t| 2 | t.string "schema_name", limit: 128, null: false 3 | t.string "table_name", limit: 128, null: false 4 | t.integer "merge_interval", null: false, default: 43200 5 | t.datetime "create_time", null: false 6 | t.string "physical_schema_name", limit: 128, null: true 7 | t.string "physical_table_name", limit: 128, null: true 8 | end 9 | 10 | add_index "prism_tables", ["schema_name", "table_name"], name: "prism_tables_schema_name_table_name_idx", unique: true, using: :btree 11 | -------------------------------------------------------------------------------- /db/prism_tables_strload_streams.schema: -------------------------------------------------------------------------------- 1 | create_table "prism_tables_strload_streams", id: false, force: :cascade do |t| 2 | t.integer "prism_table_id", null: false 3 | t.integer "stream_id", null: false 4 | end 5 | 6 | add_index "prism_tables_strload_streams", ["stream_id"], name: "prism_table_strload_streams_stream_id", unique: true, using: :btree 7 | -------------------------------------------------------------------------------- /db/prism_unknown_staging_objects.schema: -------------------------------------------------------------------------------- 1 | create_table "prism_unknown_staging_objects", 
primary_key: "prism_unknown_staging_object_id", id: :bigint, force: :cascade do |t| 2 | t.string "bucket_name", limit: 63, null: false 3 | t.string "object_key", limit: 512, null: false 4 | t.datetime "send_time", null: false 5 | t.datetime "first_receive_time", null: false 6 | t.string "message", limit: 256, null: true 7 | end 8 | 9 | add_index "prism_unknown_staging_objects", ["object_key", "bucket_name"], name: "prism_unknown_staging_objects_object_unique_idx", unique: true, using: :btree 10 | -------------------------------------------------------------------------------- /gc/.ruby-version: -------------------------------------------------------------------------------- 1 | 3.0 2 | -------------------------------------------------------------------------------- /gc/Dockerfile: -------------------------------------------------------------------------------- 1 | ################################################################################### 2 | FROM public.ecr.aws/sorah/ruby:3.0-dev AS build 3 | 4 | RUN apt-get update && apt-get install -y libpq-dev 5 | RUN gem install bundler:2.2.27 6 | 7 | WORKDIR /app 8 | 9 | COPY ./Gemfile . 10 | COPY ./Gemfile.lock . 11 | RUN bundle config set --local deployment true && \ 12 | bundle config set --local without 'development test' && \ 13 | bundle install -j4 14 | 15 | ################################################################################### 16 | FROM public.ecr.aws/sorah/ruby:3.0 17 | 18 | RUN apt-get update && apt-get install -y libpq5 19 | RUN gem install bundler:2.2.27 20 | 21 | WORKDIR /app 22 | 23 | COPY --from=build /app/ . 24 | RUN bundle check 25 | 26 | COPY . . 
27 | -------------------------------------------------------------------------------- /gc/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'bricolage', '6.0.0beta6' 4 | gem 'aws-sdk-glue' 5 | gem 'aws-sdk-s3' 6 | -------------------------------------------------------------------------------- /gc/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | aws-eventstream (1.2.0) 5 | aws-partitions (1.545.0) 6 | aws-sdk-core (3.125.1) 7 | aws-eventstream (~> 1, >= 1.0.2) 8 | aws-partitions (~> 1, >= 1.525.0) 9 | aws-sigv4 (~> 1.1) 10 | jmespath (~> 1.0) 11 | aws-sdk-glue (1.102.0) 12 | aws-sdk-core (~> 3, >= 3.125.0) 13 | aws-sigv4 (~> 1.1) 14 | aws-sdk-kms (1.53.0) 15 | aws-sdk-core (~> 3, >= 3.125.0) 16 | aws-sigv4 (~> 1.1) 17 | aws-sdk-s3 (1.111.0) 18 | aws-sdk-core (~> 3, >= 3.125.0) 19 | aws-sdk-kms (~> 1) 20 | aws-sigv4 (~> 1.4) 21 | aws-sdk-sns (1.50.0) 22 | aws-sdk-core (~> 3, >= 3.125.0) 23 | aws-sigv4 (~> 1.1) 24 | aws-sigv4 (1.4.0) 25 | aws-eventstream (~> 1, >= 1.0.2) 26 | bricolage (6.0.0beta6) 27 | aws-sdk-s3 (~> 1.64) 28 | aws-sdk-sns (~> 1.23) 29 | nokogiri 30 | pg (~> 1.2.3) 31 | jmespath (1.4.0) 32 | mini_portile2 (2.6.1) 33 | nokogiri (1.12.5) 34 | mini_portile2 (~> 2.6.1) 35 | racc (~> 1.4) 36 | pg (1.2.3) 37 | racc (1.6.0) 38 | 39 | PLATFORMS 40 | ruby 41 | 42 | DEPENDENCIES 43 | aws-sdk-glue 44 | aws-sdk-s3 45 | bricolage (= 6.0.0beta6) 46 | 47 | BUNDLED WITH 48 | 2.2.27 49 | -------------------------------------------------------------------------------- /gc/README.md: -------------------------------------------------------------------------------- 1 | # Prism GC 2 | 3 | Prism GC is a Ruby application to delete unused S3 objects on the Prism bucket. 
4 | 5 | Prism continuously rebuilds merged objects again and again whenever a new small object arrives, 6 | to keep partition data up to date. But when Prism updates a merged partition, Prism does not 7 | delete old merged objects -- so many old, unused merged objects accumulate on the Prism bucket. 8 | The Prism GC application collects and deletes such unused objects. 9 | 10 | Prism does not update partitions older than 14 days, so applying GC to partitions that are 11 | 15 or 16 days old is safe and recommended. 12 | 13 | ## Usage 14 | 15 | ``` 16 | % PRISM_BUCKET_NAME=prism-example-bucket ./bin/garbage-collect 17 | ``` 18 | 19 | Options: 20 | ``` 21 | Usage: garbage-collect [options] 22 | --prism-ds=NAME Prism meta data data source (default: prism) 23 | --list-tables Shows target prism tables and quit. 24 | --table=SCHEMA_TABLE Processes only this table. 25 | --list-objects Shows garbage object list and quit. 26 | --partition-expr=EXPR AWS Glue Catalog partition expression to filter target partitions. e.g. "dt = '2021-03-15'" 27 | -e, --environment=ENV Bricolage execution environment. (default: development) 28 | -C, --home=PATH Bricolage home directory. (default: /Users/minero-aoki/c/prism.ghe/gc) 29 | --help Prints this message and quit. 30 | ``` 31 | 32 | Environment: 33 | 34 | - PRISM_BUCKET_NAME: Prism S3 bucket name. 35 | - PRISM_DB_HOST: Prism metadata DB host. 36 | - PRISM_DB_PORT: Prism metadata DB port. 37 | - PRISM_DB_DATABASE: Prism metadata DB database. 38 | - PRISM_DB_USERNAME: Prism metadata DB username. 39 | - PRISM_DB_PASSWORD: Prism metadata DB password. 40 | - AWS_DEFAULT_REGION: S3 bucket region. 41 | - BRICOLAGE_ENV: Active configuration profile. Default value is `development`. 42 | - TZ: Timezone to be used to decide "today". 
#!/usr/bin/env ruby
#
# Launcher for the Prism garbage collector: activates Bundler when a Gemfile
# is available, puts ../lib on the load path, and hands control to
# Prism::GarbageCollectApplication.main.

# Default to the Gemfile one directory above this script unless the caller
# has already exported BUNDLE_GEMFILE.
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __dir__)

# File.exists? is the deprecated alias that was removed in Ruby 3.2;
# File.exist? behaves the same on every supported Ruby version.
require 'bundler/setup' if File.exist?(ENV['BUNDLE_GEMFILE'])

libdir = File.expand_path('../lib', __dir__)
$LOAD_PATH.unshift libdir

require 'prism/garbage_collect_application'

Prism::GarbageCollectApplication.main
def foreach_garbage_object(database_name, table_name, expression: nil) 14 | foreach_merged_partition(database_name, table_name, expression: expression) do |merged_partition| 15 | foreach_garbage_object_in_partition(merged_partition) do |m_obj| 16 | bucket = merged_partition.bucket 17 | key = "#{merged_partition.prefix}#{m_obj.to_basename}" 18 | object = S3Object.new(bucket: bucket, key: key) 19 | yield object 20 | end 21 | end 22 | end 23 | 24 | S3Object = Struct.new(:bucket, :key, keyword_init: true) 25 | class S3Object 26 | def to_s 27 | "s3://#{bucket}/#{key}" 28 | end 29 | end 30 | 31 | def foreach_merged_partition(database_name, table_name, expression: nil) 32 | foreach_glue_partitions(database_name, table_name, expression: expression) do |partition| 33 | # part is nil if not merged yet 34 | part = MergedPartition.try_from_location(partition.storage_descriptor.location) 35 | yield part if part 36 | end 37 | end 38 | 39 | def foreach_glue_partitions(database_name, table_name, expression: nil, &block) 40 | res = @glue.get_partitions( 41 | database_name: database_name, 42 | table_name: table_name, 43 | expression: expression 44 | ) 45 | res.each do |page| 46 | page.partitions.each(&block) 47 | end 48 | end 49 | 50 | def foreach_garbage_object_in_partition(merged_partition) 51 | in_use = get_manifest_entries(merged_partition) 52 | greatest_endx_by_start_map = in_use.map {|m_obj| [m_obj.start, m_obj.endx] }.to_h 53 | foreach_merged_object(merged_partition) do |m_obj| 54 | greatest_endx = greatest_endx_by_start_map[m_obj.start] 55 | if greatest_endx && m_obj.endx < greatest_endx 56 | yield m_obj 57 | end 58 | end 59 | end 60 | 61 | def get_manifest_entries(merged_partition) 62 | manifest_body = @s3.get_object(bucket: merged_partition.bucket, key: "#{merged_partition.prefix}#{merged_partition.manifest_basename}").body.read 63 | manifest = JSON.parse(manifest_body, symbolize_names: true) 64 | manifest[:entries].map do |entry| 65 | entry_uri = URI.parse(entry[:url]) 66 
| entry_key = entry_uri.path[1..] 67 | entry_basename = File.basename(entry_key) 68 | MergedObject.parse_basename(entry_basename) 69 | end 70 | end 71 | 72 | def foreach_merged_object(merged_partition) 73 | res = @s3.list_objects_v2( 74 | bucket: merged_partition.bucket, 75 | delimiter: "/", 76 | prefix: "#{merged_partition.prefix}part-", 77 | max_keys: 1000, 78 | ) 79 | res.each do |page| 80 | page.contents.each do |s3_obj| 81 | s3_obj_basename = File.basename(s3_obj.key) 82 | yield MergedObject.parse_basename(s3_obj_basename) 83 | end 84 | end 85 | end 86 | end 87 | 88 | class MergedPartition 89 | def MergedPartition.try_from_location(location) 90 | location_uri = URI.parse(location) 91 | unless location_uri.path.include?("/merged/") 92 | return nil 93 | end 94 | bucket = location_uri.host 95 | manifest_key = location_uri.path[1..] 96 | manifest_basename = File.basename(manifest_key) 97 | prefix = "#{File.dirname(manifest_key)}/" 98 | new(bucket, prefix, manifest_basename) 99 | end 100 | 101 | def initialize(bucket, prefix, manifest_basename) 102 | @bucket = bucket 103 | @prefix = prefix 104 | @manifest_basename = manifest_basename 105 | end 106 | 107 | attr_reader :prefix, :bucket, :manifest_basename 108 | end 109 | 110 | class MergedObject 111 | PART_BASENAME_RE = /part-(\d{19})-(\d{19})\.parquet/ 112 | 113 | def MergedObject.parse_basename(basename) 114 | match_data = PART_BASENAME_RE.match(basename) 115 | if match_data.nil? 
116 | raise "failed to parse merged object basename: #{basename}" 117 | end 118 | start = match_data[1].to_i 119 | endx = match_data[2].to_i 120 | new(start, endx) 121 | end 122 | 123 | # start: Integer 124 | # endx: Integer 125 | def initialize(start, endx) 126 | @start = start 127 | @endx = endx 128 | end 129 | 130 | attr_reader :start, :endx 131 | 132 | def to_basename 133 | "part-#{@start.to_s.rjust(19, '0')}-#{@endx.to_s.rjust(19, '0')}.parquet" 134 | end 135 | end 136 | 137 | end 138 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cookpad/prism/774f6e2c0b6665c5f11c9012f94a42333dc4454c/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.4-bin.zip 4 | networkTimeout=10000 5 | zipStoreBase=GRADLE_USER_HOME 6 | zipStorePath=wrapper/dists 7 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%"=="" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%"=="" set DIRNAME=. 29 | @rem This is normally unused 30 | set APP_BASE_NAME=%~n0 31 | set APP_HOME=%DIRNAME% 32 | 33 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 34 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 35 | 36 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 37 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 38 | 39 | @rem Find java.exe 40 | if defined JAVA_HOME goto findJavaFromJavaHome 41 | 42 | set JAVA_EXE=java.exe 43 | %JAVA_EXE% -version >NUL 2>&1 44 | if %ERRORLEVEL% equ 0 goto execute 45 | 46 | echo. 47 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 48 | echo. 49 | echo Please set the JAVA_HOME variable in your environment to match the 50 | echo location of your Java installation. 51 | 52 | goto fail 53 | 54 | :findJavaFromJavaHome 55 | set JAVA_HOME=%JAVA_HOME:"=% 56 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 57 | 58 | if exist "%JAVA_EXE%" goto execute 59 | 60 | echo. 
61 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 62 | echo. 63 | echo Please set the JAVA_HOME variable in your environment to match the 64 | echo location of your Java installation. 65 | 66 | goto fail 67 | 68 | :execute 69 | @rem Setup the command line 70 | 71 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 72 | 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if %ERRORLEVEL% equ 0 goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | set EXIT_CODE=%ERRORLEVEL% 85 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 86 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 87 | exit /b %EXIT_CODE% 88 | 89 | :mainEnd 90 | if "%OS%"=="Windows_NT" endlocal 91 | 92 | :omega 93 | -------------------------------------------------------------------------------- /merge/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ -------------------------------------------------------------------------------- /merge/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id "org.springframework.boot" version "2.1.3.RELEASE" 3 | id "io.spring.dependency-management" version "1.0.7.RELEASE" 4 | id "com.google.cloud.tools.jib" version "3.4.5" 5 | } 6 | 7 | dependencies { 8 | implementation project(':shared') 9 | implementation group: 'org.springframework.boot', name: 'spring-boot-starter', version: '2.1.3.RELEASE' 10 | implementation group: 'org.springframework.boot', name: 'spring-boot-starter-logging', version: '2.1.3.RELEASE' 11 | implementation group: 'io.sentry', name: 'sentry-logback', version: '1.7.30' 12 | 
implementation group: 'org.postgresql', name: 'postgresql', version: '42.2.5' 13 | } 14 | 15 | jib { 16 | from { 17 | image = 'docker://prism-base' 18 | } 19 | to { 20 | tags = ["latest", rootProject.file("REVISION").text.trim()] 21 | } 22 | container { 23 | jvmFlags = [] 24 | workingDirectory = '/app' 25 | } 26 | } 27 | tasks.jib.dependsOn ':shared:docker' 28 | -------------------------------------------------------------------------------- /merge/config/application.yml: -------------------------------------------------------------------------------- 1 | # This config file is for *DEVELOPMENT* 2 | # Use merge/src/main/jib/app/config/application.yml or environment variables instead 3 | # if you need to change the config in production environment. 4 | 5 | spring: 6 | main: 7 | banner-mode: "off" 8 | datasource: 9 | url: jdbc:postgresql://localhost:5432/prism 10 | driver-class-name: org.postgresql.Driver 11 | username: prism 12 | password: prism 13 | hikari: 14 | maximum-pool-size: 2 15 | minimum-idle: 1 16 | 17 | prism: 18 | bucket-name: prism-example-bucket 19 | prefix: "" 20 | merge-job-timeout: 600 21 | merged-object-size: 134217728 # 128MiB 22 | merge-batch-size: 2000 23 | downloader-threads: 4 24 | merger-threads: 2 25 | 26 | logging: 27 | level: 28 | root: INFO 29 | com.cookpad.prism: DEBUG 30 | com.cookpad.prism.dao: INFO 31 | org.apache.parquet: WARN 32 | org.apache.hadoop: WARN 33 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/CmdRunner.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.time.LocalDate; 4 | import java.util.List; 5 | import java.util.stream.Collectors; 6 | 7 | import org.slf4j.MDC; 8 | import org.springframework.boot.ApplicationArguments; 9 | import org.springframework.boot.CommandLineRunner; 10 | import org.springframework.context.ApplicationContext; 11 | import 
org.springframework.stereotype.Component; 12 | 13 | import lombok.RequiredArgsConstructor; 14 | 15 | @RequiredArgsConstructor 16 | @Component 17 | public class CmdRunner implements CommandLineRunner { 18 | private final ApplicationContext ctx; 19 | private final ApplicationArguments args; 20 | 21 | @Override 22 | public void run(String... rawArgs) throws Exception { 23 | List nonopts = this.args.getNonOptionArgs(); 24 | String cmdName = "daemon"; 25 | if (nonopts.size() >= 1) { 26 | cmdName = nonopts.get(0); 27 | } 28 | MDC.put("cmd_name", cmdName); 29 | switch (cmdName) { 30 | case "daemon": 31 | DaemonCmd daemonCmd = this.ctx.getBean(DaemonCmd.class); 32 | daemonCmd.run(); 33 | break; 34 | case "oneshot": 35 | OneshotCmd oneshotCmd = this.ctx.getBean(OneshotCmd.class); 36 | oneshotCmd.run(); 37 | break; 38 | case "rebuild": 39 | List tableIds = this.args.getOptionValues("table"); 40 | if (tableIds.size() != 1) { 41 | throw new RuntimeException("just one table id must be given"); 42 | } 43 | int tableId = Integer.parseInt(tableIds.get(0)); 44 | List partitions = this.args.getOptionValues("partition"); 45 | List partitionDates = partitions.stream().map(LocalDate::parse).collect(Collectors.toList()); 46 | RebuildCmd rebuildCmd = this.ctx.getBean(RebuildCmd.class); 47 | rebuildCmd.run(tableId, partitionDates); 48 | break; 49 | default: 50 | throw new RuntimeException("No such cmd: " + cmdName); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/DaemonCmd.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import com.cookpad.prism.StepHandler; 4 | 5 | import org.springframework.context.annotation.Lazy; 6 | import org.springframework.stereotype.Component; 7 | 8 | import lombok.RequiredArgsConstructor; 9 | import lombok.extern.slf4j.Slf4j; 10 | 11 | @Component 12 | @Lazy 13 | 
@RequiredArgsConstructor 14 | @Slf4j 15 | public class DaemonCmd { 16 | private final StepHandler stepHandler; 17 | 18 | public void run() { 19 | try { 20 | while (this.stepHandler.handleStep()) { 21 | ; 22 | } 23 | } 24 | finally { 25 | log.info("shutting down job dispatcher..."); 26 | this.stepHandler.shutdown(); 27 | log.info("shut down."); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/Main.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.time.Clock; 4 | import java.util.concurrent.ExecutorService; 5 | import java.util.concurrent.Executors; 6 | import java.util.concurrent.ThreadFactory; 7 | 8 | import com.amazonaws.services.s3.AmazonS3; 9 | import com.amazonaws.services.s3.AmazonS3ClientBuilder; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.fs.FileSystem; 13 | import org.apache.hadoop.fs.RawLocalFileSystem; 14 | import com.cookpad.prism.StepHandler; 15 | import com.cookpad.prism.objectstore.PrismObjectStoreFactory; 16 | import com.cookpad.prism.objectstore.PrismTableLocatorFactory; 17 | import com.cookpad.prism.record.RecordReaderFactory; 18 | import com.cookpad.prism.record.RecordWriterFactory; 19 | import org.mybatis.spring.annotation.MapperScan; 20 | import org.springframework.beans.factory.annotation.Autowired; 21 | import org.springframework.boot.WebApplicationType; 22 | import org.springframework.boot.autoconfigure.SpringBootApplication; 23 | import org.springframework.boot.builder.SpringApplicationBuilder; 24 | import org.springframework.context.annotation.Bean; 25 | 26 | import lombok.extern.slf4j.Slf4j; 27 | 28 | @SpringBootApplication 29 | @MapperScan(basePackages = "com.cookpad.prism.dao") 30 | @Slf4j 31 | public class Main { 32 | public static void main(String[] args) { 33 | try { 34 | new 
SpringApplicationBuilder(Main.class).web(WebApplicationType.NONE).run(args); 35 | } catch(Exception e) { 36 | log.error("Unhandled exception", e); 37 | throw e; 38 | } 39 | } 40 | 41 | @Bean 42 | public AmazonS3 s3() { 43 | return AmazonS3ClientBuilder.defaultClient(); 44 | } 45 | 46 | @Bean 47 | public Configuration hadoopConf() { 48 | Configuration conf = new Configuration(); 49 | conf.setClass("fs.file.impl", RawLocalFileSystem.class, FileSystem.class); 50 | return conf; 51 | } 52 | 53 | @Bean 54 | public RecordWriterFactory recordWriterFactory(@Autowired Configuration hadoopConf) { 55 | return new RecordWriterFactory(hadoopConf); 56 | } 57 | 58 | @Bean 59 | public RecordReaderFactory recordReaderFactory(@Autowired Configuration hadoopConf) { 60 | return new RecordReaderFactory(hadoopConf); 61 | } 62 | 63 | @Bean 64 | public PrismTableLocatorFactory tableLocatorFactory(@Autowired PrismMergeConf prismConf) { 65 | return new PrismTableLocatorFactory(prismConf.getBucketName(), prismConf.getPrefix()); 66 | } 67 | 68 | @Bean 69 | public PrismObjectStoreFactory prismObjectStoreFactory(@Autowired PrismTableLocatorFactory tableLocatorFactory) { 70 | return new PrismObjectStoreFactory(this.s3(), tableLocatorFactory); 71 | } 72 | 73 | @Bean 74 | public Clock clock() { 75 | return Clock.systemDefaultZone(); 76 | } 77 | 78 | @Bean 79 | public ParallelParquetMerger parallelParquetMerger(@Autowired PrismMergeConf prismMergeConf, @Autowired ParquetFileMerger parquetFileMerger, @Autowired PrismMergeConf prismConf) { 80 | ExecutorService downloadExecutor = Executors.newFixedThreadPool(prismConf.getDownloaderThreads(), DaemonThreadFactory.instance); 81 | ExecutorService mergeExecutor = Executors.newFixedThreadPool(prismConf.getMergerThreads(), DaemonThreadFactory.instance); 82 | return new ParallelParquetMerger(downloadExecutor, mergeExecutor, parquetFileMerger); 83 | } 84 | 85 | static class DaemonThreadFactory implements ThreadFactory { 86 | static final DaemonThreadFactory 
instance = new DaemonThreadFactory(); 87 | 88 | final ThreadFactory original = Executors.defaultThreadFactory(); 89 | 90 | @Override 91 | public Thread newThread(Runnable r) { 92 | var th = original.newThread(r); 93 | th.setDaemon(true); 94 | return th; 95 | } 96 | } 97 | 98 | @Bean 99 | public StepHandler stepHandler(@Autowired MergeJobDispatcher mergeJobDispatcher) { 100 | return mergeJobDispatcher; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/MergeJobDispatcher.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import com.cookpad.prism.StepHandler; 4 | import com.cookpad.prism.merge.MergeJobHandler.JobStatus; 5 | import com.cookpad.prism.dao.PrismMergeJob; 6 | import org.springframework.stereotype.Component; 7 | 8 | import io.sentry.Sentry; 9 | import lombok.RequiredArgsConstructor; 10 | import lombok.extern.slf4j.Slf4j; 11 | 12 | @RequiredArgsConstructor 13 | @Slf4j 14 | @Component 15 | public class MergeJobDispatcher implements StepHandler { 16 | private final MergeJobQueue mergeJobQueue; 17 | private final MergeJobHandler mergeJobHandler; 18 | 19 | @Override 20 | public boolean handleStep() { 21 | PrismMergeJob job = this.mergeJobQueue.dequeue(); 22 | if (job == null) { 23 | try { 24 | Thread.sleep(3000); 25 | } catch (InterruptedException e) { 26 | return true; 27 | } 28 | return true; 29 | } 30 | try { 31 | log.info("Handling job: {}", job); 32 | Sentry.getContext().addTag("merge_job", Long.toString(job.getId())); 33 | JobStatus status = this.mergeJobHandler.handleJob(job); 34 | if (status == JobStatus.CONTINUING) { 35 | this.mergeJobQueue.retry(job); 36 | } else { 37 | this.mergeJobQueue.delete(job); 38 | } 39 | log.info("Handled job: {}", job); 40 | } catch (Exception ex) { 41 | throw new RuntimeException(ex); 42 | } finally { 43 | Sentry.getContext().removeTag("merge_job"); 
44 | } 45 | return true; 46 | } 47 | 48 | public void shutdown() { 49 | mergeJobHandler.shutdown(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/MergeJobHandler.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import com.cookpad.prism.dao.PrismMergeJob; 4 | 5 | public interface MergeJobHandler { 6 | public JobStatus handleJob(PrismMergeJob job) throws Exception; 7 | 8 | public static enum JobStatus { 9 | FINISHED, 10 | CONTINUING, 11 | } 12 | 13 | public void shutdown(); 14 | } 15 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/MergeJobQueue.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.time.Clock; 4 | import java.time.LocalDateTime; 5 | import java.time.ZoneOffset; 6 | import java.util.List; 7 | 8 | import com.cookpad.prism.dao.PrismMergeJob; 9 | import com.cookpad.prism.dao.PrismMergeJobMapper; 10 | import org.springframework.stereotype.Component; 11 | 12 | import lombok.RequiredArgsConstructor; 13 | 14 | @RequiredArgsConstructor 15 | @Component 16 | public class MergeJobQueue { 17 | private final PrismMergeConf prismConf; 18 | private final PrismMergeJobMapper mergeJobMapper; 19 | private final Clock clock; 20 | 21 | private LocalDateTime now() { 22 | return LocalDateTime.ofInstant(this.clock.instant(), ZoneOffset.UTC); 23 | } 24 | 25 | private LocalDateTime getTimedoutPeriod() { 26 | return this.now().minusSeconds(this.prismConf.getMergeJobTimeout()); 27 | } 28 | 29 | public PrismMergeJob dequeue() { 30 | this.retryTimedoutJobs(); 31 | return this.mergeJobMapper.dequeue(this.now()); 32 | } 33 | 34 | public void retry(PrismMergeJob job) { 35 | // MEMO: transactions are not needed here 36 | // because 
miss-deleted records will be deleted in next check 37 | this.mergeJobMapper.retry(job.getPartitionId(), job.getScheduleTime()); 38 | this.mergeJobMapper.delete(job.getId()); 39 | } 40 | 41 | public void retryTimedoutJobs() { 42 | List timedoutJobs = this.mergeJobMapper.findTimedoutJobs(this.getTimedoutPeriod(), 100); 43 | for (PrismMergeJob job : timedoutJobs) { 44 | this.retry(job); 45 | } 46 | } 47 | 48 | public void delete(PrismMergeJob job) { 49 | this.mergeJobMapper.delete(job.getId()); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/OneshotCmd.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import com.cookpad.prism.StepHandler; 4 | 5 | import org.springframework.context.annotation.Lazy; 6 | import org.springframework.stereotype.Component; 7 | 8 | import lombok.RequiredArgsConstructor; 9 | import lombok.extern.slf4j.Slf4j; 10 | 11 | @Component 12 | @Lazy 13 | @RequiredArgsConstructor 14 | @Slf4j 15 | public class OneshotCmd { 16 | final StepHandler stepHandler; 17 | 18 | public void run() { 19 | try { 20 | var succeeded = this.stepHandler.handleStep(); 21 | log.info(succeeded ? 
"suceeded" : "failed"); 22 | } 23 | finally { 24 | log.info("shutting down job dispatcher..."); 25 | this.stepHandler.shutdown(); 26 | log.info("shut down."); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/ParquetFileMerger.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Path; 5 | 6 | import org.apache.parquet.hadoop.ParquetReader; 7 | import org.apache.parquet.hadoop.ParquetWriter; 8 | import com.cookpad.prism.record.Record; 9 | import com.cookpad.prism.record.RecordReaderFactory; 10 | import com.cookpad.prism.record.RecordWriterFactory; 11 | import com.cookpad.prism.record.Schema; 12 | import org.springframework.stereotype.Component; 13 | 14 | import lombok.RequiredArgsConstructor; 15 | 16 | @RequiredArgsConstructor 17 | @Component 18 | public class ParquetFileMerger { 19 | private final RecordWriterFactory recordWriterFactory; 20 | private final RecordReaderFactory recordReaderFactory; 21 | 22 | public void merge(Schema schema, Path inputFilePathA, Path inputFilePathB, Path outputFilePath) throws IOException { 23 | try (ParquetWriter writer = recordWriterFactory.build(schema, outputFilePath)) { 24 | try (ParquetReader readerA = recordReaderFactory.build(schema, inputFilePathA); 25 | ParquetReader readerB = recordReaderFactory.build(schema, inputFilePathB)) { 26 | new ParquetMerger(readerA, readerB, writer).merge(); 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/ParquetMerger.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.parquet.hadoop.ParquetReader; 6 | import 
org.apache.parquet.hadoop.ParquetWriter; 7 | import com.cookpad.prism.record.Record; 8 | import com.cookpad.prism.record.RecordTimestampComparator; 9 | 10 | import lombok.RequiredArgsConstructor; 11 | 12 | // input: parquet temp file object list 13 | // output: merged parquet temp file object list 14 | @RequiredArgsConstructor 15 | public class ParquetMerger { 16 | private final ParquetReader readerA; 17 | private final ParquetReader readerB; 18 | private final ParquetWriter writer; 19 | 20 | public void merge() throws IOException { 21 | RecordTimestampComparator comparator = new RecordTimestampComparator(); 22 | Record recordA = this.readerA.read(); 23 | Record recordB = this.readerB.read(); 24 | while (true) { 25 | int cmp = comparator.compare(recordA, recordB); 26 | if (cmp > 0) { 27 | this.writer.write(recordB); 28 | if (recordA == null) { 29 | this.writeSingle(this.readerB, this.writer); 30 | break; 31 | } 32 | recordB = this.readerB.read(); 33 | } else if (cmp < 0) { 34 | this.writer.write(recordA); 35 | if (recordB == null) { 36 | this.writeSingle(this.readerA, this.writer); 37 | break; 38 | } 39 | recordA = this.readerA.read(); 40 | } else { 41 | if (recordA == null) { 42 | break; 43 | } 44 | this.writer.write(recordA); 45 | this.writer.write(recordB); 46 | recordA = this.readerA.read(); 47 | recordB = this.readerB.read(); 48 | } 49 | } 50 | } 51 | 52 | private void writeSingle(ParquetReader reader, ParquetWriter writer) throws IOException { 53 | Record record; 54 | while ((record = reader.read()) != null) { 55 | writer.write(record); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/PrismMergeConf.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import com.cookpad.prism.PrismConf; 4 | 5 | import org.springframework.stereotype.Component; 6 | 7 | import 
lombok.NoArgsConstructor; 8 | import lombok.Getter; 9 | import lombok.Setter; 10 | import lombok.ToString; 11 | 12 | @Component 13 | @NoArgsConstructor 14 | @Getter 15 | @Setter 16 | @ToString 17 | public class PrismMergeConf extends PrismConf { 18 | long mergeJobTimeout; 19 | long mergedObjectSize; 20 | int mergeBatchSize; 21 | int downloaderThreads; 22 | int mergerThreads; 23 | } 24 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/RebuildCmd.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.io.IOException; 4 | import java.time.Clock; 5 | import java.time.LocalDate; 6 | import java.util.List; 7 | import java.util.Optional; 8 | 9 | import com.cookpad.prism.SchemaBuilder; 10 | import com.cookpad.prism.SchemaBuilder.BadColumnsError; 11 | import com.cookpad.prism.merge.MergeJobWorker.MergePlan; 12 | import com.cookpad.prism.merge.MergeJobWorker.MergePlanExecutor; 13 | import com.cookpad.prism.objectstore.PrismObjectStore; 14 | import com.cookpad.prism.objectstore.PrismObjectStoreFactory; 15 | import com.cookpad.prism.record.Schema; 16 | import com.cookpad.prism.dao.OneToMany; 17 | import com.cookpad.prism.dao.PrismMergeRange; 18 | import com.cookpad.prism.dao.PrismMergeRangeMapper; 19 | import com.cookpad.prism.dao.PrismPartition; 20 | import com.cookpad.prism.dao.PrismPartitionMapper; 21 | import com.cookpad.prism.dao.PrismSmallObject; 22 | import com.cookpad.prism.dao.PrismSmallObjectMapper; 23 | import com.cookpad.prism.dao.PrismTable; 24 | import com.cookpad.prism.dao.PrismTableMapper; 25 | import com.cookpad.prism.dao.StreamColumn; 26 | import org.springframework.context.annotation.Lazy; 27 | import org.springframework.stereotype.Component; 28 | 29 | import lombok.RequiredArgsConstructor; 30 | 31 | @Component 32 | @Lazy 33 | @RequiredArgsConstructor 34 | public class RebuildCmd { 35 | private 
final ParallelParquetMerger parallelParquetMerger; 36 | private final PrismMergeRangeMapper mergeRangeMapper; 37 | private final PrismSmallObjectMapper smallObjectMapper; 38 | private final PrismPartitionMapper partitionMapper; 39 | private final PrismTableMapper tableMapper; 40 | private final PrismObjectStoreFactory objectStoreFactory; 41 | private final Clock clock; 42 | 43 | public void run(int tableId, List partitionDates) throws BadColumnsError, IOException { 44 | OneToMany tableWithColumns = tableMapper.findWithColumns(tableId); 45 | PrismTable table = tableWithColumns.getOne(); 46 | Schema schema = new SchemaBuilder().build(tableWithColumns.getOne(), tableWithColumns.getMany()); 47 | PrismObjectStore prismObjectStore = this.objectStoreFactory.create(table); 48 | MergePlanExecutor planExecutor = MergePlanExecutor.of( 49 | mergeRangeMapper, 50 | parallelParquetMerger, 51 | prismObjectStore, 52 | clock 53 | ); 54 | for (LocalDate date : partitionDates) { 55 | PrismPartition partition = this.partitionMapper.findByTableIdAndDate(tableId, date); 56 | if (partition == null) { 57 | continue; 58 | } 59 | List mergeRanges = this.mergeRangeMapper.findAllInPartition(partition.getId()); 60 | for (PrismMergeRange mergeRange : mergeRanges) { 61 | List smallObjects = this.smallObjectMapper.findAllObjectsInRange(partition.getId(), mergeRange.getLowerBound(), mergeRange.getUpperBound()); 62 | MergePlan plan = new MergePlan(partition, mergeRange.getLowerBound(), false, smallObjects, Optional.empty()); 63 | planExecutor.execute(schema, plan); 64 | } 65 | this.partitionMapper.updateCurrentManifestVersion(partition.getId(), -1); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/downloader/DownloadedObjectSupplier.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge.downloader; 2 | 3 | import java.io.File; 4 | 
import java.io.IOException; 5 | import java.util.function.Supplier; 6 | 7 | import com.cookpad.prism.TempFile; 8 | 9 | import lombok.RequiredArgsConstructor; 10 | 11 | @RequiredArgsConstructor 12 | public class DownloadedObjectSupplier implements Supplier { 13 | private final ObjectDownloader downloader; 14 | 15 | @Override 16 | public TempFile get() { 17 | try { 18 | File file = this.downloader.download(); 19 | return new TempFile(file.toPath()); 20 | } catch (IOException e) { 21 | throw new RuntimeException(e); 22 | } 23 | } 24 | 25 | public static interface ObjectDownloader { 26 | File download() throws IOException; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/downloader/MergedObjectSupplierFactory.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge.downloader; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.time.LocalDate; 6 | 7 | import com.cookpad.prism.merge.downloader.DownloadedObjectSupplier.ObjectDownloader; 8 | import com.cookpad.prism.objectstore.MergedObjectStore; 9 | import com.cookpad.prism.dao.PrismMergeRange; 10 | import com.cookpad.prism.dao.PrismPartition; 11 | 12 | import lombok.RequiredArgsConstructor; 13 | 14 | @RequiredArgsConstructor 15 | public class MergedObjectSupplierFactory { 16 | private final MergedObjectStore mergedObjectStore; 17 | 18 | public DownloadedObjectSupplier createSupplier(PrismMergeRange mergeRange, PrismPartition partition) { 19 | return new DownloadedObjectSupplier(new MergedObjectDownloader(partition.getPartitionDate(), mergeRange.getLowerBound(), mergeRange.getUpperBound(), this.mergedObjectStore)); 20 | } 21 | 22 | @RequiredArgsConstructor 23 | public static class MergedObjectDownloader implements ObjectDownloader { 24 | private final LocalDate dt; 25 | private final long lowerBound; 26 | private final long upperBound; 
27 | private final MergedObjectStore mergedObjectStore; 28 | 29 | @Override 30 | public File download() throws IOException { 31 | return this.mergedObjectStore.getMergedObjectFile(this.dt, this.lowerBound, this.upperBound); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /merge/src/main/java/com/cookpad/prism/merge/downloader/SmallObjectSupplierFactory.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge.downloader; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.time.LocalDate; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import com.cookpad.prism.merge.downloader.DownloadedObjectSupplier.ObjectDownloader; 10 | import com.cookpad.prism.objectstore.SmallObjectStore; 11 | import com.cookpad.prism.dao.PrismPartition; 12 | import com.cookpad.prism.dao.PrismSmallObject; 13 | 14 | import lombok.RequiredArgsConstructor; 15 | 16 | @RequiredArgsConstructor 17 | public class SmallObjectSupplierFactory { 18 | private final SmallObjectStore smallObjectStore; 19 | 20 | public DownloadedObjectSupplier createSingleSupplier(PrismSmallObject smallObject, PrismPartition partition) throws IOException { 21 | LocalDate dt = partition.getPartitionDate(); 22 | long objectId = smallObject.getStagingObjectId(); 23 | if (smallObject.isDelayed()) { 24 | return new DownloadedObjectSupplier(new DelayedObjectDownloader(dt, objectId, this.smallObjectStore)); 25 | } else { 26 | return new DownloadedObjectSupplier(new LiveObjectDownloader(dt, objectId, this.smallObjectStore)); 27 | } 28 | } 29 | 30 | public List createMultipleSuppliers(List smallObjects, PrismPartition partition) throws IOException { 31 | List suppliers = new ArrayList<>(); 32 | for (PrismSmallObject smallObject : smallObjects) { 33 | DownloadedObjectSupplier supplier = this.createSingleSupplier(smallObject, partition); 34 | suppliers.add(supplier); 35 | 
} 36 | return suppliers; 37 | } 38 | 39 | @RequiredArgsConstructor 40 | public static class LiveObjectDownloader implements ObjectDownloader { 41 | private final LocalDate dt; 42 | private final long objectId; 43 | private final SmallObjectStore smallObjectStore; 44 | 45 | @Override 46 | public File download() throws IOException { 47 | return this.smallObjectStore.getLiveObjectFile(this.dt, this.objectId); 48 | } 49 | } 50 | 51 | @RequiredArgsConstructor 52 | public static class DelayedObjectDownloader implements ObjectDownloader { 53 | private final LocalDate dt; 54 | private final long objectId; 55 | private final SmallObjectStore smallObjectStore; 56 | 57 | @Override 58 | public File download() throws IOException { 59 | return this.smallObjectStore.getDelayedObjectFile(this.dt, this.objectId); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /merge/src/main/jib/app/config/application.yml: -------------------------------------------------------------------------------- 1 | # This config file is for *PRODUCTION* 2 | # Use merge/config/application.yml or environment variables instead 3 | # if you need to change the config in production environment. 
4 | 5 | spring: 6 | main: 7 | banner-mode: "off" 8 | datasource: 9 | # database endpoint will be injected by environment variables 10 | driver-class-name: org.postgresql.Driver 11 | hikari: 12 | maximum-pool-size: 2 13 | minimum-idle: 1 14 | 15 | prism: 16 | bucket-name: prism-example-bucket 17 | prefix: "" 18 | merge-job-timeout: 600 19 | merged-object-size: 134217728 # 128MiB 20 | merge-batch-size: 2000 21 | downloader-threads: 4 22 | merger-threads: 2 23 | 24 | logging: 25 | level: 26 | root: INFO 27 | com.cookpad.prism: INFO 28 | com.cookpad.prism.dao: INFO 29 | org.apache.parquet: WARN 30 | org.apache.hadoop: WARN 31 | -------------------------------------------------------------------------------- /merge/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | System.err 5 | 6 | %d [%t] %5level: %logger: %msg%n 7 | 8 | 9 | 10 | 11 | 12 | WARN 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /merge/src/main/resources/sentry.properties: -------------------------------------------------------------------------------- 1 | tags=subsystem:merge 2 | stacktrace.app.packages=com.cookpad.prism 3 | -------------------------------------------------------------------------------- /merge/src/test/java/com/cookpad/prism/merge/MergeJobWorkerTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.io.IOException; 4 | import java.time.LocalDateTime; 5 | import java.util.ArrayList; 6 | 7 | import com.cookpad.prism.merge.MergeJobWorker.OpenMergeRange; 8 | import com.cookpad.prism.record.Schema.Builder.BadSchemaError; 9 | import com.cookpad.prism.dao.PrismMergeRange; 10 | import com.cookpad.prism.dao.PrismSmallObject; 11 | import org.junit.jupiter.api.Test; 12 | 13 | import static org.junit.jupiter.api.Assertions.assertIterableEquals; 14 | 15 
| import lombok.val; 16 | 17 | public class MergeJobWorkerTest { 18 | @Test 19 | public void testEmptyOpenMergeRange() throws BadSchemaError, IOException { 20 | val time = LocalDateTime.now(); 21 | val openMergeRange = OpenMergeRange.empty(0); 22 | val smallObjects = new ArrayList(); 23 | smallObjects.add(new PrismSmallObject(1, 1, 1, false, 30, time)); 24 | smallObjects.add(new PrismSmallObject(2, 1, 1, false, 30, time)); 25 | smallObjects.add(new PrismSmallObject(3, 1, 1, false, 30, time)); 26 | smallObjects.add(new PrismSmallObject(4, 1, 1, false, 30, time)); 27 | 28 | val mergeableObjects = openMergeRange.calculateMergeableObjectList(100, smallObjects); 29 | assertIterableEquals(smallObjects.subList(0, 3), mergeableObjects); 30 | } 31 | 32 | @Test 33 | public void testExistingOpenMergeRange() throws BadSchemaError, IOException { 34 | val time = LocalDateTime.now(); 35 | val mergeRange = new PrismMergeRange(1, 1, 0, 100, 20, time, time); 36 | val openMergeRange = OpenMergeRange.existing(mergeRange); 37 | val smallObjects = new ArrayList(); 38 | smallObjects.add(new PrismSmallObject(101, 1, 1, false, 30, time)); 39 | smallObjects.add(new PrismSmallObject(102, 1, 1, false, 30, time)); 40 | smallObjects.add(new PrismSmallObject(103, 1, 1, false, 30, time)); 41 | smallObjects.add(new PrismSmallObject(104, 1, 1, false, 30, time)); 42 | 43 | val mergeableObjects = openMergeRange.calculateMergeableObjectList(100, smallObjects); 44 | assertIterableEquals(smallObjects.subList(0, 2), mergeableObjects); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /merge/src/test/java/com/cookpad/prism/merge/ParallelParquetMergerTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge; 2 | 3 | import java.io.IOException; 4 | import java.time.ZoneOffset; 5 | import java.util.ArrayList; 6 | import java.util.concurrent.Executors; 7 | import java.util.function.Supplier; 
8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.parquet.hadoop.ParquetWriter; 11 | import com.cookpad.prism.TempFile; 12 | import com.cookpad.prism.record.Record; 13 | import com.cookpad.prism.record.RecordReaderFactory; 14 | import com.cookpad.prism.record.RecordWriterFactory; 15 | import com.cookpad.prism.record.Schema; 16 | import com.cookpad.prism.record.UnsizedValueType; 17 | import com.cookpad.prism.record.ValueKind; 18 | import com.cookpad.prism.record.ValueListRecord; 19 | import com.cookpad.prism.record.Schema.Builder.BadSchemaError; 20 | import com.cookpad.prism.record.values.NonNullValue; 21 | import com.cookpad.prism.record.values.PrimitiveValue; 22 | import com.cookpad.prism.record.values.Value; 23 | import org.junit.jupiter.api.Test; 24 | 25 | import static org.junit.jupiter.api.Assertions.assertEquals; 26 | 27 | import lombok.val; 28 | 29 | public class ParallelParquetMergerTest { 30 | @Test 31 | public void testMergeInterleave() throws BadSchemaError, IOException { 32 | val schema = new Schema.Builder("test_s", "test_t") 33 | .withTimestamp("time", false, ZoneOffset.UTC) 34 | .addColumn("value", new UnsizedValueType(ValueKind.BIGINT), false) 35 | .build() 36 | ; 37 | val tsColumn = schema.getColumns().get(0); 38 | val valueColumn = schema.getColumns().get(1); 39 | 40 | val values1 = new ArrayList(); 41 | values1.add(new NonNullValue(tsColumn, new PrimitiveValue.LongValue(1L))); 42 | values1.add(new NonNullValue(valueColumn, new PrimitiveValue.LongValue(100L))); 43 | val record1 = new ValueListRecord(values1); 44 | 45 | val values2 = new ArrayList(); 46 | values2.add(new NonNullValue(tsColumn, new PrimitiveValue.LongValue(2L))); 47 | values2.add(new NonNullValue(valueColumn, new PrimitiveValue.LongValue(200L))); 48 | val record2 = new ValueListRecord(values2); 49 | 50 | val values3 = new ArrayList(); 51 | values3.add(new NonNullValue(tsColumn, new PrimitiveValue.LongValue(3L))); 52 | values3.add(new 
NonNullValue(valueColumn, new PrimitiveValue.LongValue(300L))); 53 | val record3 = new ValueListRecord(values3); 54 | 55 | val conf = new Configuration(); 56 | 57 | val tmp1 = new TempFile("prism-merge-test-", ".parquet"); 58 | val tmp2 = new TempFile("prism-merge-test-", ".parquet"); 59 | val tmp3 = new TempFile("prism-merge-test-", ".parquet"); 60 | 61 | val writerFactory = new RecordWriterFactory(conf); 62 | val readerFactory = new RecordReaderFactory(conf); 63 | ParquetWriter writerA = writerFactory.build(schema, tmp1.getPath()); 64 | writerA.write(record1); 65 | writerA.close(); 66 | 67 | ParquetWriter writerB = writerFactory.build(schema, tmp2.getPath()); 68 | writerB.write(record2); 69 | writerB.close(); 70 | 71 | ParquetWriter writerC = writerFactory.build(schema, tmp3.getPath()); 72 | writerC.write(record3); 73 | writerC.close(); 74 | 75 | val ex = Executors.newFixedThreadPool(4); 76 | val parquetFileMerger = new ParquetFileMerger(writerFactory, readerFactory); 77 | val parallelMerger = new ParallelParquetMerger(ex, ex, parquetFileMerger); 78 | 79 | val suppliers = new ArrayList>(); 80 | suppliers.add(() -> tmp2); 81 | suppliers.add(() -> tmp1); 82 | suppliers.add(() -> tmp3); 83 | val out = parallelMerger.merge(schema, suppliers); 84 | 85 | val reader = readerFactory.build(schema, out.getPath()); 86 | assertEquals(record1, reader.read()); 87 | assertEquals(record2, reader.read()); 88 | assertEquals(record3, reader.read()); 89 | assertEquals(null, reader.read()); 90 | 91 | tmp1.close(); 92 | tmp2.close(); 93 | tmp3.close(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /merge/src/test/java/com/cookpad/prism/merge/downloader/SmallObjectSupplierFactoryTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.merge.downloader; 2 | 3 | import org.junit.jupiter.api.Test; 4 | 5 | import static org.junit.jupiter.api.Assertions.assertEquals; 6 | 
import static org.mockito.Mockito.*; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.nio.file.Path; 11 | import java.time.LocalDate; 12 | import java.time.LocalDateTime; 13 | 14 | import com.cookpad.prism.objectstore.SmallObjectStore; 15 | import com.cookpad.prism.dao.PrismPartition; 16 | import com.cookpad.prism.dao.PrismSmallObject; 17 | 18 | import lombok.val; 19 | 20 | public class SmallObjectSupplierFactoryTest { 21 | @Test 22 | public void testCreateSingleSupplierDelayed() throws IOException { 23 | val dt = LocalDate.of(2018, 9, 5); 24 | val objectId = 100; 25 | 26 | val path = mock(Path.class); 27 | val file = mock(File.class); 28 | when(file.toPath()) 29 | .thenReturn(path); 30 | val smallObjectStore = mock(SmallObjectStore.class); 31 | when(smallObjectStore.getDelayedObjectFile(dt, objectId)) 32 | .thenReturn(file); 33 | //smallObjectStore. 34 | val downloader = new SmallObjectSupplierFactory(smallObjectStore); 35 | val supplier = downloader.createSingleSupplier( 36 | new PrismSmallObject(1, objectId, 2, true, 1000, LocalDateTime.now()), 37 | new PrismPartition(2, 3, dt, -1, 0, null, false) 38 | ); 39 | assertEquals(path, supplier.get().getPath()); 40 | } 41 | 42 | @Test 43 | public void testCreateSingleSupplierLive() throws IOException { 44 | val dt = LocalDate.of(2018, 9, 5); 45 | val objectId = 100; 46 | 47 | val path = mock(Path.class); 48 | val file = mock(File.class); 49 | when(file.toPath()) 50 | .thenReturn(path); 51 | val smallObjectStore = mock(SmallObjectStore.class); 52 | when(smallObjectStore.getLiveObjectFile(dt, objectId)) 53 | .thenReturn(file); 54 | //smallObjectStore. 
55 | val downloader = new SmallObjectSupplierFactory(smallObjectStore); 56 | val supplier = downloader.createSingleSupplier( 57 | new PrismSmallObject(1, objectId, 2, false, 1000, LocalDateTime.now()), 58 | new PrismPartition(2, 3, dt, -1, 0, null, false) 59 | ); 60 | assertEquals(path, supplier.get().getPath()); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include 'shared' 2 | include 'stream' 3 | include 'batch' 4 | include 'merge' 5 | 6 | rootProject.name = 'prism' 7 | -------------------------------------------------------------------------------- /shared/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ -------------------------------------------------------------------------------- /shared/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/amazoncorretto/amazoncorretto:11 as hadoop-libs 2 | ARG HADOOP_VERSION=3.3.4 3 | 4 | RUN yum install -y tar gzip 5 | 6 | WORKDIR /tmp 7 | RUN curl -sSfO "https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz{,.sha512,.asc}" 8 | RUN curl -sSf https://downloads.apache.org/hadoop/common/KEYS | gpg --import 9 | RUN sha512sum -c hadoop-${HADOOP_VERSION}.tar.gz.sha512 10 | RUN gpg --verify hadoop-${HADOOP_VERSION}.tar.gz.asc 11 | RUN tar xf hadoop-${HADOOP_VERSION}.tar.gz && mv hadoop-${HADOOP_VERSION} hadoop 12 | 13 | FROM public.ecr.aws/amazoncorretto/amazoncorretto:11 as libisal 14 | ARG LIBISAL_VERSION=2.30.0 15 | ARG LIBISAL_SHA512SUM=d3ecfb7326097534b06a74b584100336509525ae7cadc6112d0c27e3d8704f3810e18f583d3cc33fa266bfec96db023607622b22ddbf17988ec4bf1bb3b3b9b2 16 | 17 | RUN yum install -y tar gzip autoconf automake libtool make nasm 18 | 19 | WORKDIR /tmp 20 | RUN curl -sSfLO 
https://github.com/intel/isa-l/archive/refs/tags/v${LIBISAL_VERSION}.tar.gz 21 | RUN echo ${LIBISAL_SHA512SUM} v${LIBISAL_VERSION}.tar.gz | sha512sum -c - 22 | RUN tar xf v${LIBISAL_VERSION}.tar.gz 23 | WORKDIR /tmp/isa-l-${LIBISAL_VERSION} 24 | RUN ./autogen.sh && ./configure --prefix=/usr --libdir=/usr/lib64 && make install 25 | 26 | FROM public.ecr.aws/amazoncorretto/amazoncorretto:11 27 | 28 | RUN yum install -y libzstd && yum clean all && rm -rf /var/cache/yum 29 | COPY --from=hadoop-libs /tmp/hadoop/lib/native/lib*.so /usr/lib64/ 30 | COPY --from=libisal /usr/lib64/libisal.so.2 /usr/lib64/ 31 | -------------------------------------------------------------------------------- /shared/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'com.palantir.docker' version '0.34.0' 3 | id 'java-library' 4 | } 5 | 6 | dependencies { 7 | api group: 'org.mybatis.spring.boot', name: 'mybatis-spring-boot-starter', version: '1.3.2' 8 | api group: 'org.apache.parquet', name: 'parquet-hadoop', version: '1.12.3' 9 | api group: 'org.apache.hadoop', name: 'hadoop-common', version: '3.3.4' 10 | api group: 'org.apache.hadoop', name: 'hadoop-mapreduce-client-core', version: '3.3.4' 11 | api group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: '2.8.7' 12 | api group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.8.7' 13 | api group: 'com.amazonaws', name: 'aws-java-sdk-core', version: '1.11.438' 14 | api group: 'com.amazonaws', name: 'aws-java-sdk-s3', version: '1.11.438' 15 | api group: 'com.amazonaws', name: 'aws-java-sdk-sqs', version: '1.11.438' 16 | api group: 'com.amazonaws', name: 'aws-java-sdk-sns', version: '1.11.438' 17 | api group: 'javax.activation', name: 'activation', version: '1.1.1' 18 | } 19 | 20 | configurations.all { 21 | exclude module: 'slf4j-log4j12' 22 | exclude module: 'log4j' 23 | exclude module: 'slf4j-reload4j' 24 | } 25 | 26 | docker { 27 | name 'prism-base' 
/**
 * Configuration properties bound from keys under the {@code prism} prefix.
 *
 * Lombok generates the no-argument constructor, the getters and setters, and
 * {@code toString()}.
 */
@ConfigurationProperties(prefix="prism")
@NoArgsConstructor
@Getter
@Setter
@ToString
public class PrismConf {
    // NOTE(review): both fields are declared without an access modifier (package-private).

    // Name of the bucket; presumably the S3 bucket prism operates on — confirm against callers.
    String bucketName;
    // Prefix value; presumably an object-key prefix inside that bucket — confirm against callers.
    String prefix;
}
com.cookpad.prism.dao.StreamColumn; 14 | 15 | import lombok.RequiredArgsConstructor; 16 | 17 | @RequiredArgsConstructor 18 | public class SchemaBuilder { 19 | private ValueKind getValueKindFromTypeName(String typeName) { 20 | ValueKind valueKind = ValueKind.valueOf(typeName.toUpperCase()); 21 | if (valueKind == null) { 22 | throw new IllegalArgumentException("No such ValueKind: " + typeName); 23 | } 24 | return valueKind; 25 | } 26 | 27 | public Schema build(PrismTable table, List columns) throws BadColumnsError { 28 | StreamColumn partitionSourceColumn = null; 29 | for (StreamColumn column: columns) { 30 | if (column.isPartitionSource()) { 31 | partitionSourceColumn = column; 32 | break; 33 | } 34 | } 35 | if (partitionSourceColumn == null) { 36 | throw new BadColumnsError("no partition source column"); 37 | } 38 | try { 39 | Schema.Builder builder = new Schema.Builder(table.getLogicalSchemaName(), table.getLogicalTableName()); 40 | for (StreamColumn column: columns) { 41 | if (column.getType().toUpperCase().equals("UNKNOWN")) { 42 | continue; 43 | } 44 | ValueKind valueKind = getValueKindFromTypeName(column.getType()); 45 | ValueType valueType; 46 | if (column.getLength() == null) { 47 | valueType = new UnsizedValueType(valueKind); 48 | } else { 49 | valueType = new SizedValueType(valueKind, column.getLength()); 50 | } 51 | if (column.isPartitionSource()) { 52 | builder.addTimestampColumn(column.getName(), valueType, false, column.getZoneOffsetAsZoneOffset()); 53 | } else { 54 | if (this.isCompatible(partitionSourceColumn, column)) { 55 | builder.addSecondaryTimestampColumn(column.getName(), valueType, true); 56 | } else { 57 | builder.addColumn(column.getName(), valueType, true); 58 | } 59 | } 60 | } 61 | return builder.build(); 62 | } catch(BadSchemaError cause) { 63 | throw new BadColumnsError(cause); 64 | } 65 | } 66 | 67 | public boolean isCompatible(StreamColumn a, StreamColumn b) { 68 | return ( 69 | Objects.equals(a.getSourceName(), b.getSourceName()) && 
70 | Objects.equals(a.getType(), b.getType()) && 71 | Objects.equals(a.getLength(), b.getLength()) && 72 | Objects.equals(a.getZoneOffset(), b.getZoneOffset()) && 73 | Objects.equals(a.getSourceOffset(), b.getSourceOffset()) 74 | ); 75 | } 76 | 77 | @SuppressWarnings("serial") 78 | public static class BadColumnsError extends Exception { 79 | BadColumnsError(Throwable cause) { 80 | super(cause); 81 | } 82 | 83 | BadColumnsError(String message) { 84 | super(message); 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/StepHandler.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism; 2 | 3 | public interface StepHandler { 4 | /** 5 | * return false if it is done successfully 6 | * return true if it continues 7 | */ 8 | public boolean handleStep(); 9 | 10 | public void shutdown(); 11 | } 12 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/TempFile.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.nio.file.Files; 6 | import java.nio.file.Path; 7 | 8 | import lombok.Getter; 9 | import lombok.RequiredArgsConstructor; 10 | import lombok.extern.slf4j.Slf4j; 11 | 12 | @Slf4j 13 | @RequiredArgsConstructor 14 | public class TempFile implements AutoCloseable { 15 | @Getter 16 | private final Path path; 17 | 18 | public TempFile(String prefix, String suffix) throws IOException { 19 | this.path = Files.createTempFile(prefix, suffix); 20 | log.debug("Created temp file: {}", this.path); 21 | } 22 | 23 | @Override 24 | public void close() throws IOException { 25 | File file = this.path.toFile(); 26 | if (!file.delete()) { 27 | log.warn("Failed to delete temp file: {}", file.getPath()); 28 | } 29 | 
log.debug("Deleted temp file: {}", file.getPath()); 30 | } 31 | 32 | @RequiredArgsConstructor 33 | public static class Factory { 34 | private final String prefix; 35 | private final String suffix; 36 | 37 | public TempFile create() throws IOException { 38 | return new TempFile(this.prefix, this.suffix); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/OneToMany.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.util.List; 4 | 5 | import lombok.AllArgsConstructor; 6 | import lombok.Data; 7 | import lombok.NoArgsConstructor; 8 | 9 | @AllArgsConstructor 10 | @NoArgsConstructor 11 | @Data 12 | public class OneToMany { 13 | private O one; 14 | private List many; 15 | } 16 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/OneToOne.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import lombok.AllArgsConstructor; 4 | import lombok.Data; 5 | import lombok.NoArgsConstructor; 6 | 7 | @AllArgsConstructor 8 | @NoArgsConstructor 9 | @Data 10 | public class OneToOne { 11 | private L left; 12 | private R right; 13 | } 14 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/PacketStream.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | import lombok.AllArgsConstructor; 6 | import lombok.Data; 7 | import lombok.NoArgsConstructor; 8 | 9 | @Data 10 | @AllArgsConstructor 11 | @NoArgsConstructor 12 | public class PacketStream { 13 | private long id; 14 | private String name; 15 | private boolean disabled; 16 | private boolean discard; 17 | private 
/**
 * Data object describing one merge job for a partition.
 *
 * Lombok generates the all-args and no-args constructors plus accessors,
 * {@code equals}/{@code hashCode} and {@code toString()}. The field order below is the
 * parameter order of the generated all-args constructor.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class PrismMergeJob {
    private long id;
    // Id of the partition this job belongs to.
    private long partitionId;
    // Presumably the earliest time the job may be picked up — confirm against the mapper SQL.
    private LocalDateTime scheduleTime;
    private long ongoingMark; // 0 if it's pending, same as id if it's ongoing
    // Presumably refreshed while the job is being processed — confirm against the mapper SQL.
    private LocalDateTime heartbeatTime;
}
/**
 * Data object describing a merge range inside a partition: a lower and an upper bound
 * together with a content length and creation/update timestamps.
 *
 * Lombok generates the all-args and no-args constructors plus accessors,
 * {@code equals}/{@code hashCode} and {@code toString()}. The field order below is the
 * parameter order of the generated all-args constructor.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class PrismMergeRange {
    private long id;
    // Id of the partition this range belongs to.
    private long partitionId;
    // NOTE(review): the bounds presumably refer to small-object ids, and whether they are
    // inclusive is not visible here — confirm against the mapper SQL.
    private long lowerBound;
    private long upperBound;
    // Presumably the total size in bytes of the content covered by the range — TODO confirm.
    private long contentLength;
    private LocalDateTime createTime;
    private LocalDateTime updateTime;
}
/**
 * Data object describing one date partition of a prism table.
 *
 * Lombok generates the all-args and no-args constructors plus accessors,
 * {@code equals}/{@code hashCode} and {@code toString()}. The field order below is the
 * parameter order of the generated all-args constructor.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class PrismPartition {
    private long id;
    // Id of the owning table.
    private int tableId;
    // Calendar date covered by this partition.
    private LocalDate partitionDate;
    // NOTE(review): the relation between the two manifest versions (presumably "published"
    // vs. "requested") is not visible in this class — confirm against the mapper SQL.
    private long currentManifestVersion;
    private long desiredManifestVersion;
    // Boxed type, so this value may be null.
    private Long lastLiveObjectId;
    private boolean switched;
}
@Param("desiredManifestVersion") long desiredManifestVersion); 19 | } 20 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/PrismSmallObject.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | import lombok.AllArgsConstructor; 6 | import lombok.Data; 7 | import lombok.NoArgsConstructor; 8 | 9 | @AllArgsConstructor 10 | @NoArgsConstructor 11 | @Data 12 | public class PrismSmallObject { 13 | private static long UNINITIALIZED_ID = -1; 14 | 15 | private long id = UNINITIALIZED_ID; 16 | private long stagingObjectId; 17 | private long partitionId; 18 | private boolean isDelayed; 19 | private long contentLength; 20 | private LocalDateTime uploadStartTime; 21 | 22 | public long getId() { 23 | if (this.id == UNINITIALIZED_ID) { 24 | throw new IllegalStateException("Get uninitialized id of PrismSmallObject"); 25 | } 26 | return this.id; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/PrismSmallObjectMapper.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.time.LocalDateTime; 4 | import java.util.List; 5 | 6 | import org.apache.ibatis.annotations.Param; 7 | import org.springframework.transaction.annotation.Propagation; 8 | import org.springframework.transaction.annotation.Transactional; 9 | 10 | public interface PrismSmallObjectMapper { 11 | PrismSmallObject findByParams(@Param("stagingObjectId") long stagingObjectId, @Param("partitionId") long partitionId); 12 | PrismSmallObject createByParams(@Param("stagingObjectId") long stagingObjectId, @Param("partitionId") long partitionId, @Param("uploadStartTime") LocalDateTime uploadStartTime, @Param("contentLength") long contentLength); 13 | List 
findNewObjects(@Param("partitionId") long partitionId, @Param("lowerBound") long lowerBound, @Param("limit") int limit); 14 | List findAllObjectsInRange(@Param("partitionId") long partitionId, @Param("lowerBound") long lowerBound, @Param("upperBound") long upperBound); 15 | 16 | @Transactional(propagation = Propagation.NESTED) 17 | default PrismSmallObject findOrCreateByParams(long stagingObjectId, long partitionId, LocalDateTime uploadStartTime, long contentLength) { 18 | PrismSmallObject smallObject = this.findByParams(stagingObjectId, partitionId); 19 | if (smallObject != null) { 20 | return smallObject; 21 | } 22 | smallObject = this.createByParams(stagingObjectId, partitionId, uploadStartTime, contentLength); 23 | return smallObject; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/PrismStagingObject.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.net.URI; 4 | import java.net.URISyntaxException; 5 | import java.time.LocalDateTime; 6 | 7 | import lombok.AllArgsConstructor; 8 | import lombok.Data; 9 | import lombok.NoArgsConstructor; 10 | 11 | @AllArgsConstructor 12 | @NoArgsConstructor 13 | @Data 14 | public class PrismStagingObject { 15 | private static long UNINITIALIZED_ID = -1; 16 | 17 | private long id = UNINITIALIZED_ID; 18 | private String bucketName; 19 | private String objectKey; 20 | private LocalDateTime sendTime; 21 | private LocalDateTime firstReceiveTime; 22 | 23 | public long getId() { 24 | if (this.id == UNINITIALIZED_ID) { 25 | throw new IllegalStateException("Get uninitialized id of PrismStagingObject"); 26 | } 27 | return this.id; 28 | } 29 | 30 | public URI getObjectUri() { 31 | try { 32 | return new URI("s3", this.getBucketName(), "/" + this.getObjectKey(), null); 33 | } catch (URISyntaxException e) { 34 | throw new RuntimeException(e); 35 | } 36 | } 37 | 
/**
 * Mapper for the {@code prism_staging_objects} table, defined with inline SQL annotations.
 */
public interface PrismStagingObjectMapper {
    /**
     * Selects the row whose {@code bucket_name} and {@code object_key} equal the arguments,
     * mapped through the {@code prismStagingObjectMap} result map.
     */
    @Select("select * from prism_staging_objects where object_key = #{objectKey} and bucket_name = #{bucketName}")
    @ResultMap("prismStagingObjectMap")
    PrismStagingObject findByBucketNameAndObjectKey(@Param("bucketName") String bucketName, @Param("objectKey") String objectKey);

    /**
     * Inserts {@code newObject} (bucket name, object key, send time, first receive time).
     * The generated {@code prism_staging_object_id} key is written back into the object's
     * {@code id} property.
     */
    @Insert("insert into prism_staging_objects(bucket_name, object_key, send_time, first_receive_time) values (#{bucketName}, #{objectKey}, #{sendTime}, #{firstReceiveTime})")
    @Options(useGeneratedKeys = true, keyProperty = "id", keyColumn = "prism_staging_object_id")
    void create(PrismStagingObject newObject);
}
getPhysicalSchemaName() { 22 | if (this.physicalSchemaName != null) { 23 | return this.physicalSchemaName; 24 | } 25 | return this.logicalSchemaName; 26 | } 27 | 28 | public String getPhysicalTableName() { 29 | if (this.physicalTableName != null) { 30 | return this.physicalTableName; 31 | } 32 | return this.logicalTableName; 33 | } 34 | 35 | public String getPhysicalFullName() { 36 | return String.format("%s.%s", this.getPhysicalSchemaName(), this.getPhysicalTableName()); 37 | } 38 | 39 | public String getLogicalFullName() { 40 | return String.format("%s.%s", this.getLogicalSchemaName(), this.getLogicalTableName()); 41 | } 42 | 43 | public LocalDateTime scheduleTime(LocalDateTime now) { 44 | return now.plusSeconds(this.mergeInterval); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/PrismTableMapper.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.util.List; 4 | 5 | import org.apache.ibatis.annotations.Param; 6 | 7 | public interface PrismTableMapper { 8 | public PrismTable find(@Param("tableId") int tableId); 9 | public OneToMany findWithColumns(@Param("tableId") int tableId); 10 | public List> getAllWithColumns(); 11 | public void unlink(@Param("tableId") int tableId); 12 | public void drop(@Param("tableId") int tableId); 13 | } 14 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/PrismUnknownStagingObject.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.net.URI; 4 | import java.net.URISyntaxException; 5 | import java.time.LocalDateTime; 6 | 7 | import lombok.AllArgsConstructor; 8 | import lombok.Data; 9 | import lombok.NoArgsConstructor; 10 | 11 | @AllArgsConstructor 12 | @NoArgsConstructor 13 | @Data 14 | 
public class PrismUnknownStagingObject { 15 | private static long UNINITIALIZED_ID = -1; 16 | 17 | private long id = UNINITIALIZED_ID; 18 | private String bucketName; 19 | private String objectKey; 20 | private LocalDateTime sendTime; 21 | private LocalDateTime firstReceiveTime; 22 | private String message; 23 | 24 | public long getId() { 25 | if (this.id == UNINITIALIZED_ID) { 26 | throw new IllegalStateException("Get uninitialized id of PrismStagingObject"); 27 | } 28 | return this.id; 29 | } 30 | 31 | public URI getObjectUri() { 32 | try { 33 | return new URI("s3", this.getBucketName(), "/" + this.getObjectKey(), null); 34 | } catch (URISyntaxException e) { 35 | throw new RuntimeException(e); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/PrismUnknownStagingObjectMapper.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import org.apache.ibatis.annotations.Insert; 4 | import org.apache.ibatis.annotations.Options; 5 | import org.apache.ibatis.annotations.Param; 6 | import org.apache.ibatis.annotations.ResultMap; 7 | import org.apache.ibatis.annotations.Select; 8 | 9 | public interface PrismUnknownStagingObjectMapper { 10 | @Select("select * from prism_unknown_staging_objects where object_key = #{objectKey} and bucket_name = #{bucketName}") 11 | @ResultMap("prismUnknownStagingObjectMap") 12 | PrismUnknownStagingObject findByBucketNameAndObjectKey(@Param("bucketName") String bucketName, @Param("objectKey") String objectKey); 13 | 14 | @Insert("insert into prism_unknown_staging_objects(bucket_name, object_key, send_time, first_receive_time, message) values (#{bucketName}, #{objectKey}, #{sendTime}, #{firstReceiveTime}, #{message})") 15 | @Options(useGeneratedKeys = true, keyProperty = "id", keyColumn = "prism_unknown_staging_object_id") 16 | void create(PrismUnknownStagingObject 
newObject); 17 | } 18 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/dao/StreamColumn.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.dao; 2 | 3 | import java.time.Instant; 4 | import java.time.ZoneOffset; 5 | 6 | import lombok.AllArgsConstructor; 7 | import lombok.Data; 8 | import lombok.NoArgsConstructor; 9 | 10 | @Data 11 | @AllArgsConstructor 12 | @NoArgsConstructor 13 | public class StreamColumn { 14 | private long id; 15 | private String name; 16 | private String sourceName; 17 | private String type; 18 | private Integer length; 19 | private String sourceOffset; 20 | private String zoneOffset; 21 | private Instant createTime; 22 | private boolean isPartitionSource; 23 | 24 | public ZoneOffset getZoneOffsetAsZoneOffset() { 25 | if (this.getZoneOffset() == null) { 26 | return null; 27 | } 28 | return ZoneOffset.of(this.getZoneOffset()); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/jsonl/JsonlReader.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.jsonl; 2 | 3 | import java.io.IOException; 4 | import java.io.LineNumberReader; 5 | 6 | import com.fasterxml.jackson.core.JsonParser; 7 | import com.fasterxml.jackson.databind.JsonNode; 8 | import com.fasterxml.jackson.databind.MappingJsonFactory; 9 | 10 | public class JsonlReader implements AutoCloseable { 11 | private static final MappingJsonFactory FACTORY = new MappingJsonFactory(); 12 | 13 | final private LineNumberReader inner; 14 | public JsonlReader(LineNumberReader inner) { 15 | this.inner = inner; 16 | } 17 | 18 | public JsonNode read() throws IOException { 19 | String line = this.inner.readLine(); 20 | if (line == null) { 21 | return null; 22 | } 23 | JsonParser parser = FACTORY.createParser(line); 24 | 
return parser.readValueAs(JsonNode.class); 25 | } 26 | 27 | @Override 28 | public void close() throws IOException { 29 | this.inner.close(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/jsonl/converters/Converter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.jsonl.converters; 2 | 3 | import com.fasterxml.jackson.databind.JsonNode; 4 | 5 | import com.cookpad.prism.record.Schema.Column; 6 | import com.cookpad.prism.record.values.Value; 7 | 8 | public interface Converter { 9 | public Value convertFrom(Column column, JsonNode node) throws UnexpectedValueType; 10 | } 11 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/jsonl/converters/DefaultConverter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.jsonl.converters; 2 | 3 | import com.fasterxml.jackson.databind.JsonNode; 4 | 5 | import com.cookpad.prism.record.Schema.Column; 6 | import com.cookpad.prism.record.values.NonNullValue; 7 | import com.cookpad.prism.record.values.PrimitiveValue; 8 | import com.cookpad.prism.record.values.Value; 9 | 10 | import lombok.NonNull; 11 | import lombok.RequiredArgsConstructor; 12 | 13 | @RequiredArgsConstructor 14 | public class DefaultConverter implements Converter { 15 | final private PrimitiveConverter primitiveConverter; 16 | @NonNull 17 | final private PrimitiveValue defaultValue; 18 | 19 | @Override 20 | public Value convertFrom(Column column, JsonNode node) throws UnexpectedValueType { 21 | if (node == null || node.isNull()) { 22 | return new NonNullValue(column, this.defaultValue); 23 | } 24 | PrimitiveValue value = this.primitiveConverter.convertFrom(node); 25 | return new NonNullValue(column, value); 26 | } 27 | } 28 | 
-------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/jsonl/converters/NonNullConverter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.jsonl.converters; 2 | 3 | import com.fasterxml.jackson.databind.JsonNode; 4 | 5 | import com.cookpad.prism.record.Schema.Column; 6 | import com.cookpad.prism.record.values.NonNullValue; 7 | import com.cookpad.prism.record.values.PrimitiveValue; 8 | import com.cookpad.prism.record.values.Value; 9 | 10 | import lombok.RequiredArgsConstructor; 11 | 12 | @RequiredArgsConstructor 13 | public class NonNullConverter implements Converter { 14 | final private PrimitiveConverter primitiveConverter; 15 | 16 | @Override 17 | public Value convertFrom(Column column, JsonNode node) throws UnexpectedValueType { 18 | if (node == null || node.isNull()) { 19 | throw new UnexpectedValueType("non null"); 20 | } 21 | PrimitiveValue value = this.primitiveConverter.convertFrom(node); 22 | return new NonNullValue(column, value); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/jsonl/converters/NullableConverter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.jsonl.converters; 2 | 3 | import com.fasterxml.jackson.databind.JsonNode; 4 | 5 | import com.cookpad.prism.record.Schema.Column; 6 | import com.cookpad.prism.record.values.NonNullValue; 7 | import com.cookpad.prism.record.values.NullValue; 8 | import com.cookpad.prism.record.values.PrimitiveValue; 9 | import com.cookpad.prism.record.values.Value; 10 | 11 | import lombok.RequiredArgsConstructor; 12 | 13 | @RequiredArgsConstructor 14 | public class NullableConverter implements Converter { 15 | final private PrimitiveConverter primitiveConverter; 16 | 17 | @Override 18 | public Value convertFrom(Column column, 
JsonNode node) throws UnexpectedValueType { 19 | if (node == null || node.isNull()) { 20 | return new NullValue(); 21 | } 22 | PrimitiveValue value = this.primitiveConverter.convertFrom(node); 23 | return new NonNullValue(column, value); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/jsonl/converters/PrimitiveConverter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.jsonl.converters; 2 | 3 | import java.time.Instant; 4 | import java.time.OffsetDateTime; 5 | import java.time.ZoneOffset; 6 | import java.time.format.DateTimeParseException; 7 | 8 | import com.fasterxml.jackson.core.JsonProcessingException; 9 | import com.fasterxml.jackson.databind.JsonNode; 10 | import com.fasterxml.jackson.databind.ObjectMapper; 11 | 12 | import org.apache.parquet.io.api.Binary; 13 | import com.cookpad.prism.record.values.PrimitiveValue; 14 | import static com.cookpad.prism.record.values.PrimitiveValue.*; 15 | 16 | public interface PrimitiveConverter> { 17 | public T convertFrom(JsonNode node) throws UnexpectedValueType; 18 | 19 | public static class StringConverter implements PrimitiveConverter { 20 | final static ObjectMapper MAPPER = new ObjectMapper(); 21 | @Override 22 | public BinaryValue convertFrom(JsonNode node) throws UnexpectedValueType { 23 | String str; 24 | if (node.isTextual()) { 25 | str = node.asText(); 26 | } else { 27 | try { 28 | str = MAPPER.writeValueAsString(node); 29 | } catch (JsonProcessingException e) { 30 | throw new UnexpectedValueType("JSON"); 31 | } 32 | } 33 | return new BinaryValue(Binary.fromString(str)); 34 | } 35 | } 36 | 37 | public static class BooleanConverter implements PrimitiveConverter { 38 | @Override 39 | public BooleanValue convertFrom(JsonNode node) throws UnexpectedValueType { 40 | if (!node.isBoolean()) { 41 | throw new UnexpectedValueType("booelan"); 42 | } 43 | return new 
BooleanValue(node.asBoolean()); 44 | } 45 | } 46 | 47 | public static class IntegerConverter implements PrimitiveConverter { 48 | @Override 49 | public IntegerValue convertFrom(JsonNode node) throws UnexpectedValueType { 50 | if (!node.isIntegralNumber()) { 51 | throw new UnexpectedValueType("int"); 52 | } 53 | return new IntegerValue(node.asInt()); 54 | } 55 | } 56 | 57 | public static class BigintConverter implements PrimitiveConverter { 58 | @Override 59 | public LongValue convertFrom(JsonNode node) throws UnexpectedValueType { 60 | if (!node.isIntegralNumber()) { 61 | throw new UnexpectedValueType("long"); 62 | } 63 | return new LongValue(node.asLong()); 64 | } 65 | } 66 | 67 | public static class TimestampConverter implements PrimitiveConverter { 68 | public OffsetDateTime toOffsetDateTime(JsonNode value) throws UnexpectedValueType { 69 | if (!value.isTextual()) { 70 | throw new UnexpectedValueType("ISO8601 string"); 71 | } 72 | String iso8601Text = value.asText(); 73 | OffsetDateTime odt; 74 | try { 75 | odt = OffsetDateTime.parse(iso8601Text); 76 | } catch(DateTimeParseException ex) { 77 | throw new UnexpectedValueType("ISO8601 string"); 78 | } 79 | if (odt.toInstant().compareTo(Instant.EPOCH) < 0) { 80 | ZoneOffset offset = odt.getOffset(); 81 | odt = OffsetDateTime.ofInstant(Instant.EPOCH, offset); 82 | } 83 | return odt; 84 | } 85 | 86 | @Override 87 | public LongValue convertFrom(JsonNode value) throws UnexpectedValueType { 88 | OffsetDateTime odt = this.toOffsetDateTime(value); 89 | return new LongValue(odt.toInstant().toEpochMilli()); 90 | } 91 | } 92 | 93 | public static class DoubleConverter implements PrimitiveConverter { 94 | @Override 95 | public DoubleValue convertFrom(JsonNode node) throws UnexpectedValueType { 96 | if (!node.isNumber()) { 97 | throw new UnexpectedValueType("double"); 98 | } 99 | return new DoubleValue(node.asDouble()); 100 | } 101 | } 102 | 103 | public static class FloatConverter implements PrimitiveConverter { 104 | @Override 
105 | public FloatValue convertFrom(JsonNode node) throws UnexpectedValueType { 106 | if (!node.isNumber()) { 107 | throw new UnexpectedValueType(node.asText() + "is not a float value."); 108 | } 109 | // node.isNumber() is always true here 110 | return new FloatValue(node.floatValue()); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/jsonl/converters/UnexpectedValueType.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.jsonl.converters; 2 | 3 | import lombok.Getter; 4 | 5 | public class UnexpectedValueType extends RuntimeException { 6 | private static final long serialVersionUID = 1L; 7 | @Getter 8 | final private String expectedType; 9 | UnexpectedValueType(String expectedType) { 10 | super("not a " + expectedType); 11 | this.expectedType = expectedType; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/objectstore/MergedObjectStore.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.time.LocalDate; 7 | 8 | public interface MergedObjectStore { 9 | public InputStream getMergedObject(LocalDate dt, long lowerBound, long upperBound); 10 | public File getMergedObjectFile(LocalDate dt, long lowerBound, long upperBound) throws IOException; 11 | public String putMergedObjectFile(LocalDate dt, long lowerBound, long upperBound, File content); 12 | public String putMergedPartitionManifest(LocalDate dt, long manifestVersion, String content); 13 | } 14 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/objectstore/MergedPartitionManifest.java: 
-------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | public class MergedPartitionManifest { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/objectstore/PartitionManifest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.fasterxml.jackson.annotation.JsonProperty; 7 | import com.fasterxml.jackson.core.JsonProcessingException; 8 | import com.fasterxml.jackson.databind.ObjectMapper; 9 | 10 | import lombok.RequiredArgsConstructor; 11 | 12 | public class PartitionManifest { 13 | @JsonProperty("entries") 14 | private List entries = new ArrayList<>(); 15 | 16 | @RequiredArgsConstructor 17 | public static class Entry { 18 | @JsonProperty("url") 19 | private final String url; 20 | @JsonProperty("meta") 21 | private final Meta meta; 22 | 23 | public Entry(String url, long contentLength) { 24 | this(url, new Meta(contentLength)); 25 | } 26 | 27 | @RequiredArgsConstructor 28 | public static class Meta { 29 | @JsonProperty("content_length") 30 | private final long contentLength; 31 | } 32 | } 33 | 34 | public void add(String url, long contentLength) { 35 | this.entries.add(new Entry(url, contentLength)); 36 | } 37 | 38 | public String toJSON() { 39 | ObjectMapper mapper = new ObjectMapper(); 40 | try { 41 | return mapper.writeValueAsString(this); 42 | } catch (JsonProcessingException e) { 43 | throw new RuntimeException(e); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/objectstore/PrismObjectStoreFactory.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import 
com.amazonaws.services.s3.AmazonS3; 4 | 5 | import com.cookpad.prism.dao.PrismTable; 6 | 7 | import lombok.RequiredArgsConstructor; 8 | 9 | @RequiredArgsConstructor 10 | public class PrismObjectStoreFactory { 11 | final private AmazonS3 s3; 12 | final private PrismTableLocatorFactory objectLocator; 13 | 14 | public PrismObjectStore create(PrismTable table) { 15 | PrismTableLocator locator = this.objectLocator.build(table); 16 | return new PrismObjectStore(this.s3, locator); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/objectstore/PrismTableLocator.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import java.net.URI; 4 | import java.net.URISyntaxException; 5 | import java.time.LocalDate; 6 | import java.time.format.DateTimeFormatter; 7 | 8 | import lombok.Getter; 9 | import lombok.RequiredArgsConstructor; 10 | 11 | @RequiredArgsConstructor 12 | public class PrismTableLocator { 13 | @Getter 14 | final private String bucketName; 15 | @Getter 16 | final private String tablePrefix; 17 | 18 | private static final DateTimeFormatter DT_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd"); 19 | 20 | public String formatDt(LocalDate dt) { 21 | return dt.format(DT_FORMAT); 22 | } 23 | 24 | private String getSmallObjectPartitionPrefix(String type, LocalDate dt) { 25 | String yyyyMMdd = this.formatDt(dt); 26 | String key = String.format("%s%s/dt=%s/", this.tablePrefix, type, yyyyMMdd); 27 | return key; 28 | } 29 | 30 | private String getSmallObjectBasename(long objectId) { 31 | return String.format("prism-%019d.parquet", objectId); 32 | } 33 | 34 | public String getLiveObjectPartitionPrefix(LocalDate dt) { 35 | return this.getSmallObjectPartitionPrefix("live", dt); 36 | } 37 | 38 | public String getDelayedObjectPartitionPrefix(LocalDate dt) { 39 | return this.getSmallObjectPartitionPrefix("delayed", 
dt); 40 | } 41 | 42 | public String getLiveObjectKey(LocalDate dt, long objectId) { 43 | String prefix = this.getLiveObjectPartitionPrefix(dt); 44 | String basename = this.getSmallObjectBasename(objectId); 45 | return prefix + basename; 46 | } 47 | 48 | public String getDelayedObjectKey(LocalDate dt, long objectId) { 49 | String prefix = this.getDelayedObjectPartitionPrefix(dt); 50 | String basename = this.getSmallObjectBasename(objectId); 51 | return prefix + basename; 52 | } 53 | 54 | public String getMergedObjectPartitionPrefix(LocalDate dt) { 55 | String yyyyMMdd = this.formatDt(dt); 56 | String prefix = String.format("%smerged/dt=%s/", this.tablePrefix, yyyyMMdd); 57 | return prefix; 58 | } 59 | 60 | public String getMergedObjectKey(LocalDate dt, long lowerBound, long upperBound) { 61 | String prefix = this.getMergedObjectPartitionPrefix(dt); 62 | String key = String.format("%spart-%019d-%019d.parquet", prefix, lowerBound, upperBound); 63 | return key; 64 | } 65 | 66 | public String getMergedPartitionManifestKey(LocalDate dt, long manifestVersion) { 67 | String prefix = this.getMergedObjectPartitionPrefix(dt); 68 | String key = String.format("%smanifest-%019d.json", prefix, manifestVersion); 69 | return key; 70 | } 71 | 72 | public URI toFullUrl(String key) { 73 | try { 74 | return new URI("s3", this.getBucketName(), "/" + key, null); 75 | } catch (URISyntaxException e) { 76 | throw new RuntimeException(e); 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/objectstore/PrismTableLocatorFactory.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | import java.security.MessageDigest; 5 | import java.security.NoSuchAlgorithmException; 6 | 7 | import com.cookpad.prism.dao.PrismTable; 8 | 9 | import lombok.Getter; 10 | import 
lombok.RequiredArgsConstructor; 11 | 12 | @RequiredArgsConstructor 13 | public class PrismTableLocatorFactory { 14 | @Getter 15 | final private String bucketName; 16 | final private String globalPrefix; 17 | 18 | private String getHashPrefixedTableName(String schemaName, String tableName) { 19 | String fullName = String.format("%s.%s", schemaName, tableName); 20 | MessageDigest md5; 21 | try { 22 | md5 = MessageDigest.getInstance("MD5"); 23 | } catch (NoSuchAlgorithmException e) { 24 | throw new RuntimeException(e); 25 | } 26 | byte[] hash = md5.digest(fullName.getBytes(StandardCharsets.UTF_8)); 27 | String prefixedTableName = String.format("%02x%02x.%s", hash[0], hash[1], fullName); 28 | return prefixedTableName; 29 | } 30 | 31 | private PrismTableLocator buildWithPhysicalNames(String schemaName, String tableName) { 32 | String hashPrefixedTableName = this.getHashPrefixedTableName(schemaName, tableName); 33 | String tablePrefix = String.format("%s%s/", this.globalPrefix, hashPrefixedTableName); 34 | return new PrismTableLocator(this.bucketName, tablePrefix); 35 | } 36 | 37 | public PrismTableLocator build(PrismTable table) { 38 | return this.buildWithPhysicalNames(table.getPhysicalSchemaName(), table.getPhysicalTableName()); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/objectstore/SmallObjectStore.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.time.LocalDate; 7 | 8 | public interface SmallObjectStore { 9 | public InputStream getLiveObject(LocalDate dt, long objectId); 10 | public File getLiveObjectFile(LocalDate dt, long objectId) throws IOException; 11 | public String putLiveObjectFile(LocalDate dt, long objectId, File content); 12 | public InputStream getDelayedObject(LocalDate 
dt, long objectId); 13 | public File getDelayedObjectFile(LocalDate dt, long objectId) throws IOException; 14 | public String putDelayedObjectFile(LocalDate dt, long objectId, File content); 15 | } 16 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/PrismRecordMaterializer.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import org.apache.parquet.io.api.GroupConverter; 4 | import org.apache.parquet.io.api.RecordMaterializer; 5 | 6 | import lombok.NonNull; 7 | 8 | public class PrismRecordMaterializer extends RecordMaterializer { 9 | public final RecordConverter root; 10 | 11 | public PrismRecordMaterializer(@NonNull Schema schema) { 12 | this.root = new RecordConverter(schema); 13 | } 14 | 15 | @Override 16 | public Record getCurrentRecord() { 17 | return this.root.getCurrentRecord(); 18 | } 19 | 20 | @Override 21 | public GroupConverter getRootConverter() { 22 | return this.root; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/Record.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import org.apache.parquet.io.api.RecordConsumer; 4 | import com.cookpad.prism.record.values.Value; 5 | 6 | public interface Record { 7 | public void writeMessage(RecordConsumer consumer); 8 | public Value getValue(int index); 9 | } 10 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/RecordConverter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import org.apache.parquet.io.api.Binary; 4 | import org.apache.parquet.io.api.Converter; 5 | import 
org.apache.parquet.io.api.GroupConverter; 6 | import org.apache.parquet.io.api.PrimitiveConverter; 7 | import com.cookpad.prism.record.Schema.Column; 8 | import com.cookpad.prism.record.values.NonNullValue; 9 | import com.cookpad.prism.record.values.NullValue; 10 | import com.cookpad.prism.record.values.Value; 11 | 12 | import static com.cookpad.prism.record.values.PrimitiveValue.*; 13 | 14 | import java.util.List; 15 | import java.util.stream.Collectors; 16 | 17 | import lombok.Getter; 18 | import lombok.NonNull; 19 | import lombok.RequiredArgsConstructor; 20 | 21 | public class RecordConverter extends GroupConverter { 22 | private final Schema schema; 23 | @Getter 24 | private List currentValues; 25 | private final List converters; 26 | 27 | public RecordConverter(@NonNull Schema schema) { 28 | this.schema = schema; 29 | this.initCurrentValues(); 30 | this.converters = this.schema.getColumns() 31 | .stream() 32 | .map((col) -> new SimplePrimitiveConverter(this, col)) 33 | .collect(Collectors.toList()); 34 | } 35 | 36 | @Override 37 | public Converter getConverter(int fieldIndex) { 38 | return this.converters.get(fieldIndex); 39 | } 40 | 41 | @Override 42 | public void start() { 43 | this.initCurrentValues(); 44 | } 45 | 46 | @Override 47 | public void end() { 48 | // Do nothing 49 | } 50 | 51 | private void initCurrentValues() { 52 | this.currentValues = this.schema.getColumns() 53 | .stream() 54 | .map((column) -> new NullValue()) 55 | .collect(Collectors.toList()) 56 | ; 57 | } 58 | 59 | public Record getCurrentRecord() { 60 | return new ValueListRecord(this.currentValues); 61 | } 62 | 63 | @RequiredArgsConstructor 64 | private static class SimplePrimitiveConverter extends PrimitiveConverter { 65 | final private RecordConverter parent; 66 | final private Column column; 67 | 68 | private void setValue(Value value) { 69 | this.parent.getCurrentValues().set(this.column.getIndex(), value); 70 | } 71 | 72 | @Override 73 | public void addBinary(Binary value) { 74 | 
this.setValue(new NonNullValue(this.column, new BinaryValue(value))); 75 | } 76 | 77 | @Override 78 | public void addBoolean(boolean value) { 79 | this.setValue(new NonNullValue(this.column, new BooleanValue(value))); 80 | } 81 | 82 | @Override 83 | public void addInt(int value) { 84 | this.setValue(new NonNullValue(this.column, new IntegerValue(value))); 85 | } 86 | 87 | @Override 88 | public void addLong(long value) { 89 | this.setValue(new NonNullValue(this.column, new LongValue(value))); 90 | } 91 | 92 | @Override 93 | public void addDouble(double value) { 94 | this.setValue(new NonNullValue(this.column, new DoubleValue(value))); 95 | } 96 | 97 | @Override 98 | public void addFloat(float value) { 99 | this.setValue(new NonNullValue(this.column, new FloatValue(value))); 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/RecordReadSupport.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import java.util.Map; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.parquet.hadoop.api.InitContext; 7 | import org.apache.parquet.hadoop.api.ReadSupport; 8 | import org.apache.parquet.io.api.RecordMaterializer; 9 | import org.apache.parquet.schema.MessageType; 10 | 11 | import lombok.NonNull; 12 | import lombok.RequiredArgsConstructor; 13 | 14 | @RequiredArgsConstructor 15 | public class RecordReadSupport extends ReadSupport { 16 | @NonNull 17 | final private Schema schema; 18 | 19 | @Override 20 | public RecordMaterializer prepareForRead(Configuration configuration, Map keyValueMetaData, 21 | MessageType fileSchema, ReadContext readContext) { 22 | return new PrismRecordMaterializer(this.schema); 23 | } 24 | 25 | @Override 26 | public ReadContext init(InitContext context) { 27 | return new ReadContext(this.schema.toMessageType()); 28 | } 29 | } 30 | 
-------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/RecordReaderFactory.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Path; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.parquet.hadoop.ParquetReader; 8 | 9 | import lombok.RequiredArgsConstructor; 10 | 11 | @RequiredArgsConstructor 12 | public class RecordReaderFactory { 13 | private final Configuration conf; 14 | 15 | public ParquetReader build(Schema schema, Path path) throws IOException { 16 | org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(path.toAbsolutePath().toString()); 17 | RecordReadSupport readSupport = new RecordReadSupport(schema); 18 | ParquetReader.Builder builder = ParquetReader.builder(readSupport, hadoopPath).withConf(this.conf); 19 | return builder.build(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/RecordTimestampComparator.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import java.io.Serializable; 4 | import java.util.Comparator; 5 | 6 | import com.cookpad.prism.record.values.NonNullValue; 7 | import com.cookpad.prism.record.values.PrimitiveValue; 8 | import com.cookpad.prism.record.values.Value; 9 | import com.cookpad.prism.record.values.PrimitiveValue.LongValue; 10 | 11 | public class RecordTimestampComparator implements Comparator, Serializable { 12 | private static final long serialVersionUID = 1L; 13 | 14 | private long getUnixTimestamp(Record o) { 15 | Value tsValue = o.getValue(Schema.TIMESTAMP_INDEX); 16 | if (!(tsValue instanceof NonNullValue)) { 17 | throw new RuntimeException("value of timestamp column is null"); 18 | } 19 | 
PrimitiveValue pValue = ((NonNullValue)tsValue).getInner(); 20 | if (!(pValue instanceof LongValue)) { 21 | throw new RuntimeException("value of timestamp column is not a LongValue"); 22 | } 23 | return ((LongValue)pValue).getValue(); 24 | } 25 | 26 | @Override 27 | public int compare(Record o1, Record o2) { 28 | // null is greater 29 | if (o1 == null) { 30 | if (o2 == null) { 31 | return 0; 32 | } 33 | return 1; 34 | } 35 | if (o2 == null) { 36 | return -1; 37 | } 38 | return Long.compare(this.getUnixTimestamp(o1), this.getUnixTimestamp(o2)); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/RecordWriteSupport.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.parquet.hadoop.api.WriteSupport; 8 | import org.apache.parquet.io.api.RecordConsumer; 9 | import org.apache.parquet.schema.MessageType; 10 | import com.cookpad.prism.record.values.Value.WriteFieldException; 11 | 12 | import lombok.RequiredArgsConstructor; 13 | 14 | @RequiredArgsConstructor 15 | public class RecordWriteSupport extends WriteSupport { 16 | final private Schema schema; 17 | private RecordConsumer consumer; 18 | 19 | @Override 20 | public WriteContext init(Configuration configuration) { 21 | MessageType messageType = schema.toMessageType(); 22 | Map metadata = new HashMap<>(); 23 | return new WriteContext(messageType, metadata); 24 | } 25 | 26 | @Override 27 | public void prepareForWrite(RecordConsumer recordConsumer) { 28 | this.consumer = recordConsumer; 29 | } 30 | 31 | @Override 32 | public void write(Record record) { 33 | try { 34 | record.writeMessage(this.consumer); 35 | } catch (WriteFieldException e) { 36 | throw new WriteRecordException(this.schema.getSchemaName(), 
this.schema.getTableName(), e); 37 | } 38 | } 39 | 40 | @SuppressWarnings("serial") 41 | public static class WriteRecordException extends RuntimeException { 42 | public WriteRecordException(String schemaName, String tableName, WriteFieldException e) { 43 | super(String.format("Can't write value: '%s' in '%s.%s(%s)'", e.getValue(), schemaName, tableName, e.getColumName()), e); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/RecordWriterBuilder.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.parquet.hadoop.ParquetWriter; 6 | import org.apache.parquet.hadoop.api.WriteSupport; 7 | 8 | public class RecordWriterBuilder extends ParquetWriter.Builder { 9 | private Schema schema; 10 | 11 | public RecordWriterBuilder withSchema(Schema schema) { 12 | this.schema = schema; 13 | return this; 14 | } 15 | 16 | public RecordWriterBuilder(Path file) { 17 | super(file); 18 | } 19 | 20 | @Override 21 | protected RecordWriterBuilder self() { 22 | return this; 23 | } 24 | 25 | @Override 26 | protected WriteSupport getWriteSupport(Configuration conf) { 27 | return new RecordWriteSupport(this.schema); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/RecordWriterFactory.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Path; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.parquet.hadoop.ParquetWriter; 8 | import org.apache.parquet.hadoop.ParquetFileWriter.Mode; 9 | import org.apache.parquet.hadoop.metadata.CompressionCodecName; 10 | 11 | 
import lombok.RequiredArgsConstructor; 12 | 13 | @RequiredArgsConstructor 14 | public class RecordWriterFactory { 15 | private final Configuration conf; 16 | 17 | public ParquetWriter build(Schema schema, Path path) throws IOException { 18 | org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(path.toAbsolutePath().toString()); 19 | RecordWriterBuilder builder = new RecordWriterBuilder(hadoopPath) 20 | .withConf(this.conf) 21 | .withSchema(schema) 22 | .withCompressionCodec(CompressionCodecName.SNAPPY) 23 | .withWriteMode(Mode.OVERWRITE) 24 | ; 25 | return builder.build(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/SizedValueType.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import lombok.Getter; 4 | 5 | public class SizedValueType implements ValueType { 6 | @Getter 7 | final private ValueKind valueKind; 8 | @Getter 9 | final private int size; 10 | 11 | @Override 12 | public String toRedshiftTypeName() { 13 | return String.format("%s(%d)", this.valueKind.getRedshiftTypeName(), this.getSize()); 14 | } 15 | 16 | public SizedValueType(ValueKind valueKind, int size) { 17 | if (!valueKind.isSized()) { 18 | throw new IllegalArgumentException("Given value kind is not sized"); 19 | } 20 | this.valueKind = valueKind; 21 | this.size = size; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/UnsizedValueType.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import lombok.Getter; 4 | 5 | public class UnsizedValueType implements ValueType { 6 | @Getter 7 | final private ValueKind valueKind; 8 | 9 | @Override 10 | public String toRedshiftTypeName() { 11 | return valueKind.getRedshiftTypeName(); 12 | 
} 13 | 14 | public UnsizedValueType(ValueKind valueKind) { 15 | if (valueKind.isSized()) { 16 | throw new IllegalArgumentException("Given value kind is sized"); 17 | } 18 | this.valueKind = valueKind; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/ValueKind.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | import org.apache.parquet.schema.OriginalType; 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; 5 | 6 | import lombok.Getter; 7 | import lombok.RequiredArgsConstructor; 8 | 9 | @RequiredArgsConstructor 10 | public enum ValueKind { 11 | STRING(PrimitiveTypeName.BINARY, "varchar", true) { 12 | @Override 13 | public OriginalType getOriginalType() { 14 | return OriginalType.UTF8; 15 | } 16 | }, 17 | BOOLEAN(PrimitiveTypeName.BOOLEAN, "boolean", false), 18 | INTEGER(PrimitiveTypeName.INT32, "int", false), 19 | BIGINT(PrimitiveTypeName.INT64, "bigint", false), 20 | TIMESTAMP(PrimitiveTypeName.INT64, "timestamp", false) { 21 | @Override 22 | public OriginalType getOriginalType() { 23 | return OriginalType.TIMESTAMP_MILLIS; 24 | } 25 | }, 26 | SMALLINT(PrimitiveTypeName.INT32, "smallint", false), 27 | DATE(PrimitiveTypeName.BINARY, "varchar(10)", false), 28 | DOUBLE(PrimitiveTypeName.DOUBLE, "double", false), 29 | REAL(PrimitiveTypeName.FLOAT, "float", false), 30 | ; 31 | 32 | @Getter 33 | final private PrimitiveTypeName primitiveType; 34 | @Getter 35 | final private String redshiftTypeName; 36 | @Getter 37 | final private boolean isSized; 38 | 39 | public OriginalType getOriginalType() { 40 | return null; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/ValueListRecord.java: -------------------------------------------------------------------------------- 1 | package 
com.cookpad.prism.record; 2 | 3 | import java.util.List; 4 | 5 | import org.apache.parquet.io.api.RecordConsumer; 6 | import com.cookpad.prism.record.values.Value; 7 | 8 | import lombok.EqualsAndHashCode; 9 | import lombok.RequiredArgsConstructor; 10 | import lombok.ToString; 11 | 12 | @RequiredArgsConstructor 13 | @ToString 14 | @EqualsAndHashCode 15 | public class ValueListRecord implements Record { 16 | final private List values; 17 | 18 | public void writeMessage(RecordConsumer consumer) { 19 | consumer.startMessage(); 20 | for (Value value: this.values) { 21 | value.writeField(consumer); 22 | } 23 | consumer.endMessage(); 24 | } 25 | 26 | @Override 27 | public Value getValue(int index) { 28 | return this.values.get(index); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/ValueType.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record; 2 | 3 | public interface ValueType { 4 | public String toRedshiftTypeName(); 5 | public ValueKind getValueKind(); 6 | } 7 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/partitioned/DateAttachedRecord.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.partitioned; 2 | 3 | import java.time.LocalDate; 4 | 5 | import org.apache.parquet.io.api.RecordConsumer; 6 | import com.cookpad.prism.record.Record; 7 | import com.cookpad.prism.record.partitioned.PartitionedRecord; 8 | import com.cookpad.prism.record.values.Value; 9 | 10 | import lombok.RequiredArgsConstructor; 11 | 12 | @RequiredArgsConstructor 13 | public class DateAttachedRecord implements PartitionedRecord { 14 | final private Record inner; 15 | final private LocalDate dt; 16 | 17 | @Override 18 | public void writeMessage(RecordConsumer consumer) { 19 | 
this.inner.writeMessage(consumer); 20 | } 21 | 22 | @Override 23 | public LocalDate getPartitionDate() { 24 | return this.dt; 25 | } 26 | 27 | @Override 28 | public Value getValue(int index) { 29 | return this.inner.getValue(index); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/partitioned/PartitionCollector.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.partitioned; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.nio.file.Path; 6 | import java.time.LocalDate; 7 | import java.util.Map; 8 | import java.util.TreeMap; 9 | 10 | import com.cookpad.prism.TempFile; 11 | 12 | import lombok.RequiredArgsConstructor; 13 | import lombok.extern.slf4j.Slf4j; 14 | 15 | @Slf4j 16 | @RequiredArgsConstructor 17 | public class PartitionCollector implements AutoCloseable { 18 | private TreeMap partitionToTempFile = null; 19 | 20 | public TreeMap collect() { 21 | if (this.partitionToTempFile == null) { 22 | throw new IllegalStateException("Close PartitionedWriter before collect partitions"); 23 | } 24 | TreeMap result = new TreeMap<>(); 25 | for (Map.Entry kv : this.partitionToTempFile.entrySet()) { 26 | result.put(kv.getKey(), kv.getValue().getPath()); 27 | } 28 | return result; 29 | } 30 | 31 | public void commit(TreeMap partitionToTempFile) { 32 | if (this.partitionToTempFile != null) { 33 | throw new IllegalStateException("Multiple commit is not allowed"); 34 | } 35 | this.partitionToTempFile = partitionToTempFile; 36 | } 37 | 38 | // Clean up temp files or they will run out of disk space 39 | @Override 40 | public void close() throws IOException { 41 | for (TempFile dest: this.partitionToTempFile.values()) { 42 | File file = dest.getPath().toFile(); 43 | if (!file.delete()) { 44 | log.error("Failed to delete temp file: {}", file.getPath()); 45 | } 46 | log.debug("Deleted temp 
file: {}", file.getPath()); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/partitioned/PartitionedRecord.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.partitioned; 2 | 3 | import java.time.LocalDate; 4 | 5 | import com.cookpad.prism.record.Record; 6 | 7 | public interface PartitionedRecord extends Record { 8 | public abstract LocalDate getPartitionDate(); 9 | } 10 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/partitioned/PartitionedRecordWriter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.partitioned; 2 | 3 | import java.io.IOException; 4 | 5 | public interface PartitionedRecordWriter extends AutoCloseable { 6 | public void write(PartitionedRecord record) throws IOException; 7 | @Override 8 | public void close() throws IOException; 9 | } 10 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/partitioned/PartitionedWriter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.partitioned; 2 | 3 | import java.io.IOException; 4 | import java.time.LocalDate; 5 | import java.util.Map; 6 | import java.util.TreeMap; 7 | 8 | import org.apache.parquet.hadoop.ParquetWriter; 9 | import com.cookpad.prism.TempFile; 10 | import com.cookpad.prism.record.Record; 11 | import com.cookpad.prism.record.RecordWriterFactory; 12 | import com.cookpad.prism.record.Schema; 13 | 14 | import lombok.Data; 15 | import lombok.Getter; 16 | import lombok.RequiredArgsConstructor; 17 | 18 | @RequiredArgsConstructor 19 | public class PartitionedWriter implements PartitionedRecordWriter { 20 | private final 
RecordWriterFactory recordWriterFactory; 21 | private final TempFile.Factory tempFileFactory; 22 | private final PartitionCollector partitionCollector; 23 | private final Schema schema; 24 | 25 | @Getter 26 | final private TreeMap partitions = new TreeMap<>(); 27 | 28 | private ParquetWriter route(PartitionedRecord record) throws IOException { 29 | LocalDate date = record.getPartitionDate(); 30 | Partition dest = this.partitions.get(date); 31 | if (dest == null) { 32 | TempFile tempFile = this.tempFileFactory.create(); 33 | ParquetWriter writer = recordWriterFactory.build(this.schema, tempFile.getPath()); 34 | dest = new Partition(tempFile, writer); 35 | this.partitions.put(date, dest); 36 | } 37 | return dest.getWriter(); 38 | } 39 | 40 | public void write(PartitionedRecord record) throws IOException { 41 | ParquetWriter dest = this.route(record); 42 | dest.write(record); 43 | } 44 | 45 | @Override 46 | public void close() throws IOException { 47 | TreeMap partitionToTempFile = new TreeMap<>(); 48 | for (Map.Entry kv: this.partitions.entrySet()) { 49 | kv.getValue().getWriter().close(); 50 | partitionToTempFile.put(kv.getKey(), kv.getValue().getTempFile()); 51 | } 52 | this.partitionCollector.commit(partitionToTempFile); 53 | } 54 | 55 | @Data 56 | private static class Partition { 57 | private final TempFile tempFile; 58 | private final ParquetWriter writer; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/partitioned/SortedPartitionedWriter.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.partitioned; 2 | 3 | import java.io.IOException; 4 | import java.time.LocalDate; 5 | import java.util.TreeMap; 6 | 7 | import org.apache.parquet.hadoop.ParquetWriter; 8 | import com.cookpad.prism.TempFile; 9 | import com.cookpad.prism.record.Record; 10 | import com.cookpad.prism.record.RecordWriterFactory; 11 
| import com.cookpad.prism.record.Schema; 12 | 13 | import lombok.Getter; 14 | import lombok.RequiredArgsConstructor; 15 | 16 | @RequiredArgsConstructor 17 | public class SortedPartitionedWriter implements PartitionedRecordWriter { 18 | private final RecordWriterFactory recordWriterFactory; 19 | private final TempFile.Factory tempFileFactory; 20 | private final PartitionCollector partitionCollector; 21 | private final Schema schema; 22 | 23 | private LocalDate currentDate = null; 24 | private ParquetWriter currentWriter = null; 25 | 26 | @Getter 27 | private final TreeMap partitions = new TreeMap<>(); 28 | 29 | @Override 30 | public void write(PartitionedRecord record) throws IOException { 31 | if (this.currentDate == null || record.getPartitionDate().compareTo(this.currentDate) > 0) { 32 | this.switchWriter(record.getPartitionDate()); 33 | } else if (record.getPartitionDate().compareTo(this.currentDate) < 0) { 34 | throw new IllegalStateException("Partition date is unordered"); 35 | } 36 | this.currentWriter.write(record); 37 | } 38 | 39 | private void switchWriter(LocalDate newDate) throws IOException { 40 | if (this.currentWriter != null) { 41 | this.currentWriter.close(); 42 | } 43 | TempFile tempFile = this.tempFileFactory.create(); 44 | this.currentWriter = this.recordWriterFactory.build(this.schema, tempFile.getPath()); 45 | this.currentDate = newDate; 46 | this.partitions.put(newDate, tempFile); 47 | } 48 | 49 | @Override 50 | public void close() throws IOException { 51 | if (this.currentWriter != null) { 52 | this.currentWriter.close(); 53 | } 54 | this.partitionCollector.commit(this.partitions); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/values/NonNullValue.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.values; 2 | 3 | import org.apache.parquet.io.api.RecordConsumer; 4 | import 
com.cookpad.prism.record.Schema.Column; 5 | 6 | import lombok.EqualsAndHashCode; 7 | import lombok.Getter; 8 | import lombok.RequiredArgsConstructor; 9 | import lombok.ToString; 10 | 11 | @RequiredArgsConstructor 12 | @ToString 13 | @EqualsAndHashCode 14 | public class NonNullValue implements Value { 15 | final private Column column; 16 | @Getter 17 | final private PrimitiveValue inner; 18 | 19 | @Override 20 | public void writeField(RecordConsumer consumer) { 21 | try { 22 | this.column.startField(consumer); 23 | this.inner.writeValue(consumer); 24 | this.column.endField(consumer); 25 | } catch (Exception e) { 26 | String columnName = column.getName(); 27 | Object value = inner.getValue(); 28 | throw new WriteFieldException(columnName, value, e); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/values/NullValue.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.values; 2 | 3 | import org.apache.parquet.io.api.RecordConsumer; 4 | 5 | import lombok.EqualsAndHashCode; 6 | import lombok.ToString; 7 | 8 | @ToString 9 | @EqualsAndHashCode 10 | public class NullValue implements Value { 11 | @Override 12 | public void writeField(RecordConsumer consumer) { 13 | // Do nothing 14 | return; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/values/PrimitiveValue.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.values; 2 | 3 | import org.apache.parquet.io.api.Binary; 4 | import org.apache.parquet.io.api.RecordConsumer; 5 | 6 | import lombok.EqualsAndHashCode; 7 | import lombok.Getter; 8 | import lombok.NonNull; 9 | import lombok.RequiredArgsConstructor; 10 | import lombok.ToString; 11 | 12 | public interface PrimitiveValue { 13 | 
public T getValue(); 14 | public void writeValue(RecordConsumer consumer); 15 | 16 | @RequiredArgsConstructor 17 | @ToString 18 | @EqualsAndHashCode 19 | public static class BinaryValue implements PrimitiveValue { 20 | @Getter 21 | @NonNull 22 | final private Binary value; 23 | 24 | @Override 25 | public void writeValue(RecordConsumer consumer) { 26 | consumer.addBinary(this.getValue()); 27 | } 28 | } 29 | 30 | @RequiredArgsConstructor 31 | @ToString 32 | @EqualsAndHashCode 33 | public static class BooleanValue implements PrimitiveValue { 34 | @Getter 35 | @NonNull 36 | final private Boolean value; 37 | 38 | @Override 39 | public void writeValue(RecordConsumer consumer) { 40 | consumer.addBoolean(this.getValue()); 41 | } 42 | } 43 | 44 | @RequiredArgsConstructor 45 | @ToString 46 | @EqualsAndHashCode 47 | public static class IntegerValue implements PrimitiveValue { 48 | @Getter 49 | @NonNull 50 | final private Integer value; 51 | 52 | @Override 53 | public void writeValue(RecordConsumer consumer) { 54 | consumer.addInteger(this.getValue()); 55 | } 56 | } 57 | 58 | @RequiredArgsConstructor 59 | @ToString 60 | @EqualsAndHashCode 61 | public static class LongValue implements PrimitiveValue { 62 | @Getter 63 | @NonNull 64 | final private Long value; 65 | 66 | @Override 67 | public void writeValue(RecordConsumer consumer) { 68 | consumer.addLong(this.getValue()); 69 | } 70 | } 71 | 72 | @RequiredArgsConstructor 73 | @ToString 74 | @EqualsAndHashCode 75 | public static class DoubleValue implements PrimitiveValue { 76 | @Getter 77 | @NonNull 78 | final private Double value; 79 | 80 | @Override 81 | public void writeValue(RecordConsumer consumer) { 82 | consumer.addDouble(this.getValue()); 83 | } 84 | } 85 | 86 | @RequiredArgsConstructor 87 | @ToString 88 | @EqualsAndHashCode 89 | public static class FloatValue implements PrimitiveValue { 90 | @Getter 91 | @NonNull 92 | final private Float value; 93 | 94 | @Override 95 | public void writeValue(RecordConsumer consumer) { 96 
| consumer.addFloat(this.getValue()); 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /shared/src/main/java/com/cookpad/prism/record/values/Value.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.record.values; 2 | 3 | import org.apache.parquet.io.api.RecordConsumer; 4 | 5 | import lombok.Getter; 6 | 7 | public interface Value { 8 | public void writeField(RecordConsumer consumer); 9 | 10 | @SuppressWarnings("serial") 11 | public static class WriteFieldException extends RuntimeException { 12 | @Getter 13 | private String columName = null; 14 | @Getter 15 | private Object value = null; 16 | 17 | public WriteFieldException(String message, Throwable cause) { 18 | super(message, cause); 19 | } 20 | 21 | public WriteFieldException(String columnName, Object value, Throwable cause) { 22 | super(String.format("Encountered problem in writing: value '%s' in column '%s'", value, columnName), cause); 23 | this.columName = columnName; 24 | this.value = value; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /shared/src/main/resources/application.yml: -------------------------------------------------------------------------------- 1 | mybatis: 2 | config-location: mybatis-config.xml 3 | -------------------------------------------------------------------------------- /shared/src/main/resources/com/cookpad/prism/dao/PrismMergeJobMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 26 | 27 | 28 | 29 | 45 | 46 | 47 | 48 | 54 | 55 | 56 | 84 | 85 | 104 | 105 | -------------------------------------------------------------------------------- /shared/src/main/resources/com/cookpad/prism/dao/PrismMergeRangeMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 36 | 37 | 57 | 58 | 59 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /shared/src/main/resources/com/cookpad/prism/dao/PrismSmallObjectMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 44 | 45 | 54 | 55 | 71 | 72 | 87 | 88 | -------------------------------------------------------------------------------- /shared/src/main/resources/com/cookpad/prism/dao/PrismStagingObjectMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /shared/src/main/resources/com/cookpad/prism/dao/PrismUnknownStagingObjectMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /shared/src/main/resources/mybatis-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /shared/src/test/java/com/cookpad/prism/objectstore/PrismObjectStoreTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import static org.mockito.Mockito.*; 4 | import static org.junit.jupiter.api.Assertions.assertEquals; 5 | 6 | import java.io.File; 7 | import java.time.LocalDate; 8 | import java.util.List; 9 | 10 | import org.junit.jupiter.api.BeforeEach; 11 | import org.junit.jupiter.api.Test; 12 | import org.mockito.ArgumentCaptor; 13 | 14 | import com.amazonaws.services.s3.AmazonS3; 15 | import 
com.amazonaws.services.s3.model.ObjectTagging; 16 | import com.amazonaws.services.s3.model.PutObjectRequest; 17 | import com.amazonaws.services.s3.model.Tag; 18 | 19 | public class PrismObjectStoreTest { 20 | private AmazonS3 s3; 21 | private PrismTableLocator locator; 22 | private PrismObjectStore store; 23 | 24 | @BeforeEach 25 | void setUp() { 26 | s3 = mock(AmazonS3.class); 27 | locator = mock(PrismTableLocator.class); 28 | store = new PrismObjectStore(s3, locator); 29 | } 30 | 31 | @Test 32 | void putLiveObjectFile_shouldCreateRequestWithCorrectTag() { 33 | // Arrange 34 | LocalDate testDate = LocalDate.of(2025, 1, 1); 35 | long objectId = 100; 36 | File testFile = mock(File.class); 37 | String bucketName = "test-bucket"; 38 | String testKey = "test-key"; 39 | 40 | when(locator.getBucketName()).thenReturn(bucketName); 41 | when(locator.getLiveObjectKey(testDate, objectId)).thenReturn(testKey); 42 | 43 | // Act 44 | store.putLiveObjectFile(testDate, objectId, testFile); 45 | 46 | // Assert 47 | ArgumentCaptor requestCaptor = ArgumentCaptor.forClass(PutObjectRequest.class); 48 | verify(s3).putObject(requestCaptor.capture()); 49 | 50 | PutObjectRequest capturedRequest = requestCaptor.getValue(); 51 | assertEquals(bucketName, capturedRequest.getBucketName()); 52 | assertEquals(testKey, capturedRequest.getKey()); 53 | assertEquals(testFile, capturedRequest.getFile()); 54 | 55 | ObjectTagging tagging = capturedRequest.getTagging(); 56 | List tags = tagging.getTagSet(); 57 | assertEquals(1, tags.size()); 58 | assertEquals("PrismObjectType", tags.get(0).getKey()); 59 | assertEquals("live", tags.get(0).getValue()); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /shared/src/test/java/com/cookpad/prism/objectstore/S3TableLocatorTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import static 
org.junit.jupiter.api.Assertions.assertEquals; 4 | 5 | import java.time.LocalDateTime; 6 | 7 | import com.cookpad.prism.dao.PrismTable; 8 | import org.junit.jupiter.api.Test; 9 | 10 | import lombok.val; 11 | 12 | public class S3TableLocatorTest { 13 | @Test 14 | void getTablePrefix() { 15 | val prismTable = new PrismTable(200, null, null, "test_schema", "nanika_log", LocalDateTime.now(), 43200); 16 | val locatorFactory = new PrismTableLocatorFactory("prism-sandbox", "global-prefix/"); 17 | val tablePrefix = locatorFactory.build(prismTable).getTablePrefix(); 18 | assertEquals("global-prefix/b601.test_schema.nanika_log/", tablePrefix); 19 | } 20 | 21 | @Test 22 | void getTablePrefixWithPhysicalName() { 23 | val prismTable = new PrismTable(200, "phy_test_schema", "phy_nanika_log", "test_schema", "nanika_log", LocalDateTime.now(), 43200); 24 | val locatorFactory = new PrismTableLocatorFactory("prism-sandbox", "global-prefix/"); 25 | val tablePrefix = locatorFactory.build(prismTable).getTablePrefix(); 26 | assertEquals("global-prefix/342b.phy_test_schema.phy_nanika_log/", tablePrefix); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /spotbugs-exclude.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /stream/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ -------------------------------------------------------------------------------- /stream/README.md: -------------------------------------------------------------------------------- 1 | # Prism Stream 2 | 3 | ## Overview 4 | 5 | - `Main` 6 | - `events.SqsEventDispatcher` 7 | - polls event messages from the SQS 8 | - `events.StagingObjectDispatcher` 9 | - queries prism_staging_objects table 10 | - issues or obtain object ID 11 | - `ParquetConverter` 12 | - is called for 
each input object 13 | - `jsonl.JsonlReader` & `jsonl.JsonlRecordReader` (in shared) 14 | - reads the input object as `Record`s 15 | - `PartitionedWriter` 16 | - writes them to parquets 17 | - and uploads it as S3 objects 18 | - delete the message from the SQS 19 | -------------------------------------------------------------------------------- /stream/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id "org.springframework.boot" version "2.1.3.RELEASE" 3 | id "io.spring.dependency-management" version "1.0.7.RELEASE" 4 | id "com.google.cloud.tools.jib" version "3.4.5" 5 | } 6 | 7 | dependencies { 8 | implementation project(':shared') 9 | implementation group: 'org.springframework.boot', name: 'spring-boot-starter', version: '2.1.3.RELEASE' 10 | implementation group: 'org.springframework.boot', name: 'spring-boot-starter-logging', version: '2.1.3.RELEASE' 11 | implementation group: 'com.fasterxml.jackson.datatype', name: 'jackson-datatype-jsr310', version: '2.8.5' 12 | implementation group: 'io.sentry', name: 'sentry-logback', version: '1.7.30' 13 | implementation group: 'org.postgresql', name: 'postgresql', version: '42.2.5' 14 | testImplementation group: 'org.springframework.boot', name: 'spring-boot-test-autoconfigure', version: '2.1.3.RELEASE' 15 | } 16 | 17 | jib { 18 | from { 19 | image = 'docker://prism-base' 20 | } 21 | to { 22 | tags = ["latest", rootProject.file("REVISION").text.trim()] 23 | } 24 | container { 25 | jvmFlags = [] 26 | workingDirectory = '/app' 27 | } 28 | } 29 | tasks.jib.dependsOn ':shared:docker' 30 | -------------------------------------------------------------------------------- /stream/config/application.yml: -------------------------------------------------------------------------------- 1 | # This config file is for *DEVELOPMENT* 2 | # Edit stream/src/main/jib/app/config/application.yml or environment variables instead 3 | # if you need to change the config in production 
environment. 4 | 5 | spring: 6 | main: 7 | banner-mode: "off" 8 | datasource: 9 | url: jdbc:postgresql://localhost:5432/prism 10 | driver-class-name: org.postgresql.Driver 11 | username: prism 12 | password: prism 13 | hikari: 14 | maximum-pool-size: 2 15 | minimum-idle: 1 16 | 17 | prism: 18 | bucket-name: prism-example-bucket 19 | prefix: "" 20 | queue-url: "https://sqs.ap-northeast-1.amazonaws.com/111111111111/prism-stream-events" 21 | #ignore-from-exclusive: "1900-01-01" 22 | #ignore-to-inclusive: "2000-01-01" 23 | 24 | logging: 25 | level: 26 | root: INFO 27 | com.cookpad.prism: DEBUG 28 | com.cookpad.prism.dao: INFO 29 | org.apache.parquet: WARN 30 | org.apache.hadoop: WARN 31 | -------------------------------------------------------------------------------- /stream/src/main/java/com/cookpad/prism/objectstore/StagingObjectStore.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.objectstore; 2 | 3 | import java.io.InputStream; 4 | 5 | import com.amazonaws.services.s3.AmazonS3; 6 | import com.amazonaws.services.s3.model.S3Object; 7 | 8 | import com.cookpad.prism.dao.PrismStagingObject; 9 | 10 | import lombok.RequiredArgsConstructor; 11 | 12 | @RequiredArgsConstructor 13 | public class StagingObjectStore { 14 | final private AmazonS3 s3; 15 | 16 | public InputStream getStagingObject(PrismStagingObject stagingObject) { 17 | S3Object s3Object = this.s3.getObject(stagingObject.getBucketName(), stagingObject.getObjectKey()); 18 | return s3Object.getObjectContent(); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /stream/src/main/java/com/cookpad/prism/stream/FileQueueEventDispatcherFactory.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream; 2 | 3 | import java.io.IOException; 4 | import java.time.Clock; 5 | 6 | import com.amazonaws.services.s3.AmazonS3URI; 7 | 8 | import 
com.cookpad.prism.stream.events.EventHandler; 9 | import com.cookpad.prism.stream.events.FileQueueEventDispatcher; 10 | import com.cookpad.prism.stream.filequeue.FileQueue; 11 | import com.cookpad.prism.stream.filequeue.S3QueueDownloader; 12 | import org.springframework.stereotype.Component; 13 | 14 | import lombok.RequiredArgsConstructor; 15 | 16 | @RequiredArgsConstructor 17 | @Component 18 | public class FileQueueEventDispatcherFactory { 19 | private final S3QueueDownloader s3QueueDownloader; 20 | private final EventHandler eventHandler; 21 | private final Clock clock; 22 | 23 | public FileQueueEventDispatcher build(String queueFileUrl) throws IOException { 24 | AmazonS3URI uri = new AmazonS3URI(queueFileUrl); 25 | FileQueue fileQueue = s3QueueDownloader.download(uri); 26 | return new FileQueueEventDispatcher(fileQueue, eventHandler, clock); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /stream/src/main/java/com/cookpad/prism/stream/PrismStreamConf.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream; 2 | 3 | import com.cookpad.prism.PrismConf; 4 | 5 | import org.springframework.stereotype.Component; 6 | 7 | import lombok.NoArgsConstructor; 8 | import lombok.Getter; 9 | import lombok.Setter; 10 | import lombok.ToString; 11 | 12 | @Component 13 | @NoArgsConstructor 14 | @Getter 15 | @Setter 16 | @ToString(callSuper = true) 17 | public class PrismStreamConf extends PrismConf { 18 | String queueUrl; 19 | String ignoreToInclusive; 20 | String ignoreFromExclusive; 21 | } 22 | -------------------------------------------------------------------------------- /stream/src/main/java/com/cookpad/prism/stream/StagingObjectAttributes.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream; 2 | 3 | import java.time.LocalDate; 4 | import java.util.regex.Matcher; 5 | import 
/**
 * A half-open range of dates, {@code [startInclusive, endExclusive)}, where either
 * bound may be absent.
 *
 * <p>An absent bound leaves that side of the range open. A range with neither bound
 * contains no date at all (see {@link #contains(LocalDate)}).
 */
public class DateRange {
    private final Optional<LocalDate> startInclusive;
    private final Optional<LocalDate> endExclusive;

    /**
     * Creates a range from two optional bounds.
     *
     * @param startInclusive first date that belongs to the range, or empty for no lower bound
     * @param endExclusive first date after the range, or empty for no upper bound
     * @throws NullPointerException if either argument is {@code null}
     * @throws IllegalArgumentException if both bounds are present and the end is not
     *         strictly after the start
     */
    public DateRange(Optional<LocalDate> startInclusive, Optional<LocalDate> endExclusive) {
        // Fail at construction time instead of at some later contains() call.
        if (startInclusive == null) {
            throw new NullPointerException("startInclusive must not be null");
        }
        if (endExclusive == null) {
            throw new NullPointerException("endExclusive must not be null");
        }
        if (startInclusive.isPresent() && endExclusive.isPresent()) {
            if (startInclusive.get().compareTo(endExclusive.get()) >= 0) {
                throw new IllegalArgumentException("endExclusive must be greater than startInclusive");
            }
        }
        this.startInclusive = startInclusive;
        this.endExclusive = endExclusive;
    }

    /**
     * Tells whether {@code target} lies inside this range.
     *
     * @param target date to test
     * @return {@code true} if {@code target} is on or after the start bound (when present)
     *         and strictly before the end bound (when present); always {@code false}
     *         when neither bound is present
     */
    public boolean contains(LocalDate target) {
        // A range without any bound is treated as empty, not as "everything".
        if (!this.startInclusive.isPresent() && !this.endExclusive.isPresent()) {
            return false;
        }
        boolean onOrAfterStart = this.startInclusive
            .map(start -> start.compareTo(target) <= 0)
            .orElse(true);
        boolean beforeEnd = this.endExclusive
            .map(end -> target.compareTo(end) < 0)
            .orElse(true);
        return onOrAfterStart && beforeEnd;
    }
}
FileQueueEventDispatcher implements StepHandler { 20 | private final FileQueue fileQueue; 21 | private final EventHandler eventHandler; 22 | private final Clock clock; 23 | 24 | @Override 25 | public boolean handleStep() { 26 | while (true) { 27 | try { 28 | String s3UrlString = this.fileQueue.peek(); 29 | if (s3UrlString == null) { 30 | break; 31 | } 32 | final AmazonS3URI s3Url = new AmazonS3URI(s3UrlString); 33 | final Instant receiveTime = Instant.now(clock); 34 | final Instant sendTime = receiveTime; 35 | final StagingObjectEvent event = new StagingObjectEvent(s3Url.getBucket(), s3Url.getKey(), sendTime, receiveTime); 36 | eventHandler.handleEvent(event); 37 | this.fileQueue.dequeue(); 38 | } catch (CatchAndReleaseException e) { 39 | // catch and release 40 | continue; 41 | } catch (IOException e) { 42 | log.error(String.format("Encountered an error at line:%d", this.fileQueue.lineNumber()), e); 43 | continue; 44 | } 45 | } 46 | return false; 47 | } 48 | 49 | @Override 50 | public void shutdown() { 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /stream/src/main/java/com/cookpad/prism/stream/events/SnsEnvelope.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream.events; 2 | 3 | import java.io.IOException; 4 | import java.time.Instant; 5 | 6 | import com.fasterxml.jackson.annotation.JsonIgnoreProperties; 7 | import com.fasterxml.jackson.annotation.JsonProperty; 8 | import com.fasterxml.jackson.databind.ObjectMapper; 9 | import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; 10 | 11 | import lombok.AllArgsConstructor; 12 | import lombok.Data; 13 | import lombok.NoArgsConstructor; 14 | 15 | @AllArgsConstructor 16 | @NoArgsConstructor 17 | @Data 18 | @JsonIgnoreProperties(ignoreUnknown = true) 19 | public class SnsEnvelope { 20 | @JsonProperty("Message") 21 | private String message; 22 | @JsonProperty("Timestamp") 23 | private Instant 
timestamp; 24 | 25 | private static ObjectMapper MAPPER = new ObjectMapper(); 26 | static {{ 27 | MAPPER.registerModule(new JavaTimeModule()); 28 | }}; 29 | public static SnsEnvelope parseJson(String json) throws IOException { 30 | return MAPPER.readValue(json, SnsEnvelope.class); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /stream/src/main/java/com/cookpad/prism/stream/events/SqsEventDispatcher.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream.events; 2 | 3 | import java.io.IOException; 4 | import java.time.Clock; 5 | import java.time.Instant; 6 | import java.util.List; 7 | 8 | import com.amazonaws.SdkClientException; 9 | import com.amazonaws.services.s3.event.S3EventNotification; 10 | import com.amazonaws.services.s3.event.S3EventNotification.S3Entity; 11 | import com.amazonaws.services.s3.event.S3EventNotification.S3EventNotificationRecord; 12 | import com.amazonaws.services.sqs.AmazonSQS; 13 | import com.amazonaws.services.sqs.model.DeleteMessageRequest; 14 | import com.amazonaws.services.sqs.model.Message; 15 | import com.amazonaws.services.sqs.model.ReceiveMessageRequest; 16 | import com.amazonaws.services.sqs.model.ReceiveMessageResult; 17 | 18 | import com.cookpad.prism.StepHandler; 19 | 20 | import io.sentry.Sentry; 21 | import lombok.RequiredArgsConstructor; 22 | import lombok.extern.slf4j.Slf4j; 23 | 24 | @RequiredArgsConstructor 25 | @Slf4j 26 | public class SqsEventDispatcher implements StepHandler { 27 | final private AmazonSQS sqs; 28 | final private String queueUrl; 29 | final private EventHandler eventHandler; 30 | final private Clock clock; 31 | 32 | @SuppressWarnings("serial") 33 | public static class ExtractException extends Exception { 34 | public ExtractException(Exception cause) { 35 | super(cause); 36 | } 37 | } 38 | 39 | private void handleMessage(Instant receiveTime, Message msg) throws ExtractException, 
EventHandler.CatchAndReleaseException { 40 | final String msgBody = msg.getBody(); 41 | final S3EventNotification s3Event; 42 | final SnsEnvelope snsEnvelope; 43 | // At first, try parsing msgBody as a SNS Envelope. 44 | try { 45 | snsEnvelope = SnsEnvelope.parseJson(msgBody); 46 | } catch (IOException e) { 47 | throw new ExtractException(e); 48 | } 49 | 50 | final String s3EventMessage; 51 | if (snsEnvelope.getMessage() == null) { 52 | // Assume that message is bare if envelope's message is null 53 | s3EventMessage = msgBody; 54 | } else { 55 | s3EventMessage = snsEnvelope.getMessage(); 56 | } 57 | 58 | try { 59 | s3Event = S3EventNotification.parseJson(s3EventMessage); 60 | } catch (SdkClientException e) { 61 | throw new ExtractException(e); 62 | } 63 | 64 | List records = s3Event.getRecords(); 65 | if (records == null) { 66 | // Just ignore test events which do not contain any records 67 | log.warn("S3Event.Records is empty"); 68 | return; 69 | } 70 | for (S3EventNotificationRecord record: s3Event.getRecords()) { 71 | final Instant sendTime = Instant.ofEpochMilli(record.getEventTime().getMillis()); 72 | final S3Entity s3Entity = record.getS3(); 73 | final String bucketName = s3Entity.getBucket().getName(); 74 | final String objectKey = s3Entity.getObject().getKey(); 75 | StagingObjectEvent event = new StagingObjectEvent(bucketName, objectKey, sendTime, receiveTime); 76 | try { 77 | Sentry.getContext().addExtra("object_url", event.getObjectUri().toString()); 78 | log.info("handle event: s3://{}/{}", bucketName, objectKey); 79 | this.eventHandler.handleEvent(event); 80 | } finally { 81 | Sentry.getContext().removeExtra("object_url"); 82 | } 83 | } 84 | } 85 | 86 | private void receiveAndDispatch() { 87 | final ReceiveMessageRequest req = new ReceiveMessageRequest(this.queueUrl); 88 | req.setVisibilityTimeout(1200); 89 | req.setMaxNumberOfMessages(10); 90 | req.setWaitTimeSeconds(20); 91 | 92 | final ReceiveMessageResult msgResult = this.sqs.receiveMessage(req); 93 
| final Instant receivedTime = Instant.now(clock); 94 | for (Message msg: msgResult.getMessages()) { 95 | try { 96 | this.handleMessage(receivedTime, msg); 97 | } catch (EventHandler.CatchAndReleaseException e) { 98 | // catch and release 99 | continue; 100 | } catch (Exception e) { 101 | log.error("Encountered an error in processing event message", e); 102 | continue; 103 | } 104 | DeleteMessageRequest delReq = new DeleteMessageRequest(this.queueUrl, msg.getReceiptHandle()); 105 | this.sqs.deleteMessage(delReq); 106 | } 107 | } 108 | 109 | @Override 110 | public boolean handleStep() { 111 | receiveAndDispatch(); 112 | return true; 113 | } 114 | 115 | @Override 116 | public void shutdown() { 117 | // FIXME: shutdown SQS client? 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /stream/src/main/java/com/cookpad/prism/stream/events/StagingObjectEvent.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream.events; 2 | 3 | import java.net.URI; 4 | import java.net.URISyntaxException; 5 | import java.time.Instant; 6 | import java.time.LocalDateTime; 7 | import java.time.ZoneOffset; 8 | 9 | import com.cookpad.prism.dao.PrismStagingObject; 10 | import com.cookpad.prism.dao.PrismUnknownStagingObject; 11 | 12 | import lombok.EqualsAndHashCode; 13 | import lombok.Getter; 14 | import lombok.RequiredArgsConstructor; 15 | import lombok.ToString; 16 | 17 | @RequiredArgsConstructor 18 | @Getter 19 | @ToString 20 | @EqualsAndHashCode 21 | public class StagingObjectEvent { 22 | final private String bucketName; 23 | final private String objectKey; 24 | final private Instant sendTime; 25 | final private Instant receiveTime; 26 | 27 | public PrismStagingObject toStagingObject() { 28 | PrismStagingObject stagingObject = new PrismStagingObject(); 29 | stagingObject.setBucketName(bucketName); 30 | stagingObject.setObjectKey(objectKey); 31 | 
/**
 * Callback interface for processing a single staging object together with its parsed
 * key attributes, its packet stream (with columns) and its destination table.
 */
public interface StagingObjectHandler {
    /**
     * Handles one staging object.
     *
     * NOTE(review): {@code OneToMany} is declared as a raw type here while
     * {@code PacketStream} and {@code StreamColumn} are imported but otherwise unused;
     * presumably the parameter was {@code OneToMany<PacketStream, StreamColumn>} and the
     * type arguments were lost. Confirm against {@code OneToMany}'s declaration.
     *
     * @param stagingObject the staging object record
     * @param attrs attributes parsed from the staging object's key
     * @param packetStreamWithColumns the packet stream and its columns
     * @param table the table associated with the staging object
     * @throws UnknownObjectException when the implementation cannot process the object
     */
    public void handleStagingObject(PrismStagingObject stagingObject, StagingObjectAttributes attrs, OneToMany packetStreamWithColumns, PrismTable table) throws UnknownObjectException;

    /** Checked exception that wraps the underlying cause of a failed handling attempt. */
    @SuppressWarnings("serial")
    public static class UnknownObjectException extends Exception {
        public UnknownObjectException(Exception cause) {
            super(cause);
        }
    }
}
/**
 * A read-only queue of text lines backed by an input stream, decoded as UTF-8.
 *
 * <p>{@link #peek()} exposes the line at the head of the queue without consuming it and
 * {@link #dequeue()} consumes it. Both return {@code null} once the underlying stream is
 * exhausted. Instances are not thread-safe. Call {@link #close()} when done to release
 * the underlying stream.
 */
public class FileQueue implements AutoCloseable {
    private final LineNumberReader inner;
    private String currentLine = null;

    /**
     * Creates a queue that reads lines from {@code in}.
     *
     * @param in stream of UTF-8 text, one queue entry per line
     */
    public FileQueue(InputStream in) {
        InputStreamReader isr = new InputStreamReader(in, StandardCharsets.UTF_8);
        this.inner = new LineNumberReader(isr);
    }

    /**
     * Creates a queue over a gzip-compressed stream.
     *
     * @throws IOException if the gzip header cannot be read or is invalid
     */
    static FileQueue fromGzipStream(InputStream gzipped) throws IOException {
        GZIPInputStream unzipped = new GZIPInputStream(gzipped);
        return new FileQueue(unzipped);
    }

    /**
     * Creates a queue over a gzip-compressed file.
     *
     * @throws IOException if the file cannot be opened or is not valid gzip data
     */
    static FileQueue fromGzipFile(File file) throws IOException {
        FileInputStream fileInputStream = new FileInputStream(file);
        try {
            return FileQueue.fromGzipStream(fileInputStream);
        } catch (IOException | RuntimeException e) {
            // GZIPInputStream reads the header eagerly; do not leak the file handle
            // when the header is missing or corrupt.
            try {
                fileInputStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Creates a queue over an uncompressed text file.
     *
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    static FileQueue fromPlainFile(File file) throws FileNotFoundException {
        FileInputStream fileInputStream = new FileInputStream(file);
        return new FileQueue(fileInputStream);
    }

    /**
     * Returns the line at the head of the queue without consuming it, reading it from
     * the stream first if no line is buffered.
     *
     * @return the head line, or {@code null} if the stream is exhausted
     * @throws IOException if reading from the stream fails
     */
    public String peek() throws IOException {
        if (this.currentLine == null) {
            this.dequeue();
        }
        return this.currentLine;
    }

    /**
     * Returns the underlying reader's current line number.
     */
    public int lineNumber() {
        return this.inner.getLineNumber();
    }

    /**
     * Consumes the buffered head line and reads the next line into the buffer.
     *
     * @return the line that was buffered before this call, or {@code null} if none was
     * @throws IOException if reading from the stream fails
     */
    public String dequeue() throws IOException {
        String ret = this.currentLine;
        this.currentLine = this.inner.readLine();
        return ret;
    }

    /**
     * Closes the underlying reader and, through it, the wrapped input stream.
     *
     * @throws IOException if closing the stream fails
     */
    @Override
    public void close() throws IOException {
        this.inner.close();
    }
}
/stream/src/main/java/com/cookpad/prism/stream/filequeue/S3QueueDownloader.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream.filequeue; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.nio.file.Files; 6 | import java.nio.file.Path; 7 | import java.nio.file.StandardCopyOption; 8 | 9 | import com.amazonaws.services.s3.AmazonS3; 10 | import com.amazonaws.services.s3.AmazonS3URI; 11 | import com.amazonaws.services.s3.model.S3Object; 12 | 13 | import lombok.RequiredArgsConstructor; 14 | 15 | @RequiredArgsConstructor 16 | public class S3QueueDownloader { 17 | private final AmazonS3 s3; 18 | 19 | public FileQueue download(AmazonS3URI queueObjectUrl) throws IOException { 20 | String key = queueObjectUrl.getKey(); 21 | S3Object queueObject = s3.getObject(queueObjectUrl.getBucket(), key); 22 | Path tmpPath = Files.createTempFile("prism-rebuild-queue-", ".queue").toAbsolutePath(); 23 | try (InputStream in = queueObject.getObjectContent()) { 24 | Files.copy(in, tmpPath, StandardCopyOption.REPLACE_EXISTING); 25 | } 26 | if (key.endsWith(".gz")) { 27 | return FileQueue.fromGzipFile(tmpPath.toFile()); 28 | } else { 29 | return FileQueue.fromPlainFile(tmpPath.toFile()); 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /stream/src/main/jib/app/config/application.yml: -------------------------------------------------------------------------------- 1 | # This config file is for *PRODUCTION* 2 | # Edit stream/config/application.yml instead if you need to change the config in *DEVELOPMENT* environment 3 | 4 | spring: 5 | main: 6 | banner-mode: "off" 7 | datasource: 8 | # database endpoint will be injected by environment variables 9 | driver-class-name: org.postgresql.Driver 10 | hikari: 11 | maximum-pool-size: 2 12 | minimum-idle: 1 13 | 14 | prism: 15 | bucket-name: prism-example-bucket 16 | prefix: "" 17 | queue-url: 
"https://sqs.ap-northeast-1.amazonaws.com/111111111111/prism-stream-events" 18 | #ignore-from-exclusive: "1900-01-01" 19 | #ignore-to-inclusive: "2010-01-01" 20 | 21 | logging: 22 | level: 23 | root: INFO 24 | com.cookpad.prism: INFO 25 | com.cookpad.prism.dao: INFO 26 | org.apache.parquet: WARN 27 | org.apache.hadoop: WARN 28 | -------------------------------------------------------------------------------- /stream/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | System.err 5 | 6 | %d [%t] %5level: %logger: %msg%n 7 | 8 | 9 | 10 | 11 | 12 | WARN 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /stream/src/main/resources/sentry.properties: -------------------------------------------------------------------------------- 1 | tags=subsystem:stream 2 | stacktrace.app.packages=com.cookpad.prism 3 | -------------------------------------------------------------------------------- /stream/src/test/java/com/cookpad/prism/stream/StagingObjectAttributesTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream; 2 | 3 | import java.time.LocalDate; 4 | import org.junit.jupiter.api.Test; 5 | import static org.junit.jupiter.api.Assertions.assertEquals; 6 | import static org.junit.jupiter.api.Assertions.assertThrows; 7 | 8 | public class StagingObjectAttributesTest { 9 | @Test 10 | void parse() throws Exception { 11 | var attrs = StagingObjectAttributes.parse("a764.dwh.streaming_load.hako_console.hako_console_autoscale_limit_change/2018/07/19/20180719_0429_0_364bfa27-8ed9-4218-9513-e80c97897dea.gz"); 12 | assertEquals(new StagingObjectAttributes( 13 | "a764.dwh.streaming_load.hako_console.hako_console_autoscale_limit_change", 14 | "hako_console.hako_console_autoscale_limit_change", 15 | LocalDate.of(2018, 7, 19), 16 | 
"20180719_0429_0_364bfa27-8ed9-4218-9513-e80c97897dea.gz" 17 | ), attrs); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /stream/src/test/java/com/cookpad/prism/stream/events/DateRangeTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream.events; 2 | 3 | import org.junit.jupiter.api.Test; 4 | 5 | import static org.junit.jupiter.api.Assertions.assertFalse; 6 | import static org.junit.jupiter.api.Assertions.assertTrue; 7 | 8 | import java.time.LocalDate; 9 | import java.util.Optional; 10 | 11 | public class DateRangeTest { 12 | final static LocalDate earliest = LocalDate.of(2018, 8, 8); 13 | final static LocalDate earlier = LocalDate.of(2018, 8, 9); 14 | final static LocalDate middle = LocalDate.of(2018, 8, 10); 15 | final static LocalDate later = LocalDate.of(2018, 8, 11); 16 | final static LocalDate latest = LocalDate.of(2018, 8, 12); 17 | 18 | @Test 19 | public void testStartOnly() { 20 | var range = new DateRange(Optional.of(middle), Optional.empty()); 21 | assertFalse(range.contains(earliest)); 22 | assertFalse(range.contains(earlier)); 23 | assertTrue(range.contains(middle)); // <- start 24 | assertTrue(range.contains(later)); 25 | assertTrue(range.contains(latest)); 26 | } 27 | 28 | @Test 29 | public void testEndOnly() { 30 | var range = new DateRange(Optional.empty(), Optional.of(middle)); 31 | assertTrue(range.contains(earliest)); 32 | assertTrue(range.contains(earlier)); 33 | assertFalse(range.contains(middle)); // <- end 34 | assertFalse(range.contains(later)); 35 | assertFalse(range.contains(latest)); 36 | } 37 | 38 | @Test 39 | public void testBothStartAndEnd() { 40 | var range = new DateRange(Optional.of(earlier), Optional.of(later)); 41 | assertFalse(range.contains(earliest)); 42 | assertTrue(range.contains(earlier)); // <- start 43 | assertTrue(range.contains(middle)); 44 | assertFalse(range.contains(later)); // <- end 45 | 
assertFalse(range.contains(latest)); 46 | } 47 | 48 | @Test 49 | public void testNothing() { 50 | var range = new DateRange(Optional.empty(), Optional.empty()); 51 | assertFalse(range.contains(earliest)); 52 | assertFalse(range.contains(earlier)); 53 | assertFalse(range.contains(middle)); 54 | assertFalse(range.contains(later)); 55 | assertFalse(range.contains(latest)); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /stream/src/test/java/com/cookpad/prism/stream/events/SqsEventDispatcherTest.java: -------------------------------------------------------------------------------- 1 | package com.cookpad.prism.stream.events; 2 | 3 | import com.cookpad.prism.stream.events.EventHandler.CatchAndReleaseException; 4 | import org.junit.jupiter.api.Test; 5 | 6 | import static org.mockito.Mockito.*; 7 | 8 | import java.time.Clock; 9 | import java.time.Instant; 10 | import java.time.ZonedDateTime; 11 | 12 | import com.amazonaws.services.sqs.AmazonSQS; 13 | import com.amazonaws.services.sqs.model.Message; 14 | import com.amazonaws.services.sqs.model.ReceiveMessageRequest; 15 | import com.amazonaws.services.sqs.model.ReceiveMessageResult; 16 | 17 | import lombok.val; 18 | 19 | public class SqsEventDispatcherTest { 20 | @Test 21 | void testHandleStepWithSNS() throws CatchAndReleaseException { 22 | val now = Instant.now(); 23 | val mockedClock = mock(Clock.class); 24 | when(mockedClock.instant()).thenReturn(now); 25 | val mockedSqs = mock(AmazonSQS.class); 26 | val messageBody = 
"{\"Message\":\"{\\\"Records\\\":[{\\\"eventVersion\\\":\\\"2.0\\\",\\\"eventTime\\\":\\\"2018-06-27T11:24:59.461Z\\\",\\\"eventSource\\\":\\\"aws:s3\\\",\\\"awsRegion\\\":\\\"ap-northeast-1\\\",\\\"eventName\\\":\\\"ObjectCreated:Put\\\",\\\"s3\\\":{\\\"bucket\\\":{\\\"name\\\":\\\"staging-bucket\\\"},\\\"object\\\":{\\\"key\\\":\\\"69ab.logs.pv_log/2018/06/27/20180627_1124_0_4ee44954-228b-4f08-a832-360c625f4e92.gz\\\"}}}]}\",\"Timestamp\":\"2018-06-27T11:24:59.461Z\"}"; 27 | val result = new ReceiveMessageResult() 28 | .withMessages( 29 | new Message() 30 | .withReceiptHandle("DUMMY_RECEIPT_HANDLE") 31 | .withBody(messageBody) 32 | ) 33 | ; 34 | when(mockedSqs.receiveMessage(any(ReceiveMessageRequest.class))).thenReturn(result); 35 | 36 | val mockedHandler = mock(EventHandler.class); 37 | doNothing().when(mockedHandler).handleEvent(any()); 38 | 39 | val dispatcher = new SqsEventDispatcher(mockedSqs, "dummy", mockedHandler, mockedClock); 40 | dispatcher.handleStep(); 41 | 42 | verify(mockedHandler).handleEvent( 43 | new StagingObjectEvent( 44 | "staging-bucket", 45 | "69ab.logs.pv_log/2018/06/27/20180627_1124_0_4ee44954-228b-4f08-a832-360c625f4e92.gz", 46 | ZonedDateTime.parse("2018-06-27T11:24:59.461Z").toInstant(), 47 | now 48 | ) 49 | ); 50 | } 51 | 52 | @Test 53 | void testHandleStepWithoutSNS() throws CatchAndReleaseException { 54 | val now = Instant.now(); 55 | val mockedClock = mock(Clock.class); 56 | when(mockedClock.instant()).thenReturn(now); 57 | val mockedSqs = mock(AmazonSQS.class); 58 | val messageBody = "{\"Records\":[{\"eventVersion\":\"2.0\",\"eventTime\":\"2018-06-27T11:24:59.461Z\",\"eventSource\":\"aws:s3\",\"awsRegion\":\"ap-northeast-1\",\"eventName\":\"ObjectCreated:Put\",\"s3\":{\"bucket\":{\"name\":\"staging-bucket\"},\"object\":{\"key\":\"69ab.logs.pv_log/2018/06/27/20180627_1124_0_4ee44954-228b-4f08-a832-360c625f4e92.gz\"}}}]}"; 59 | val result = new ReceiveMessageResult() 60 | .withMessages( 61 | new Message() 62 | 
.withReceiptHandle("DUMMY_RECEIPT_HANDLE") 63 | .withBody(messageBody) 64 | ) 65 | ; 66 | when(mockedSqs.receiveMessage(any(ReceiveMessageRequest.class))).thenReturn(result); 67 | 68 | val mockedHandler = mock(EventHandler.class); 69 | doNothing().when(mockedHandler).handleEvent(any()); 70 | 71 | val dispatcher = new SqsEventDispatcher(mockedSqs, "dummy", mockedHandler, mockedClock); 72 | dispatcher.handleStep(); 73 | 74 | verify(mockedHandler).handleEvent( 75 | new StagingObjectEvent( 76 | "staging-bucket", 77 | "69ab.logs.pv_log/2018/06/27/20180627_1124_0_4ee44954-228b-4f08-a832-360c625f4e92.gz", 78 | ZonedDateTime.parse("2018-06-27T11:24:59.461Z").toInstant(), 79 | now 80 | ) 81 | ); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /stream/src/test/resources/com/cookpad/prism/stream/small_object_22.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cookpad/prism/774f6e2c0b6665c5f11c9012f94a42333dc4454c/stream/src/test/resources/com/cookpad/prism/stream/small_object_22.parquet -------------------------------------------------------------------------------- /stream/src/test/resources/com/cookpad/prism/stream/small_object_23.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cookpad/prism/774f6e2c0b6665c5f11c9012f94a42333dc4454c/stream/src/test/resources/com/cookpad/prism/stream/small_object_23.parquet -------------------------------------------------------------------------------- /stream/src/test/resources/com/cookpad/prism/stream/staging_object.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cookpad/prism/774f6e2c0b6665c5f11c9012f94a42333dc4454c/stream/src/test/resources/com/cookpad/prism/stream/staging_object.gz --------------------------------------------------------------------------------