├── img
├── home.png
├── gitpod.png
├── astra-to-csv.png
├── astra-to-gcs.png
├── csv-to-astra.png
├── gcs-to-astra.png
├── astra-db-create.png
├── astra-db-list.png
├── astra-db-describe.png
├── astra-to-bigquery.png
├── bigquery-to-astra.png
├── cassandra-to-astra.png
├── gcp-create-project.png
├── astra-db-download-scb.png
└── open-btn.svg
├── slides
└── slides.pdf
├── .idea
├── vcs.xml
├── .gitignore
├── misc.xml
├── encodings.xml
├── jarRepositories.xml
└── compiler.xml
├── samples-dataflow
├── src
│ ├── main
│ │ ├── resources
│ │ │ ├── schema_language_codes.json
│ │ │ └── logback.xml
│ │ └── java
│ │ │ └── com
│ │ │ └── datastax
│ │ │ └── astra
│ │ │ └── dataflow
│ │ │ ├── domains
│ │ │ ├── LanguageCodeDao.java
│ │ │ ├── LanguageCodeDaoMapper.java
│ │ │ ├── LanguageCodeDaoMapperFactoryFn.java
│ │ │ └── LanguageCode.java
│ │ │ ├── utils
│ │ │ ├── GoogleSecretManagerUtils.java
│ │ │ └── GoogleBigQueryUtils.java
│ │ │ ├── transforms
│ │ │ └── CassandraToBigQuerySchemaMapperFn.java
│ │ │ └── AstraDb_To_BigQuery_Dynamic.java
│ └── test
│ │ └── resources
│ │ └── language-codes.csv
├── dependency-reduced-pom.xml
└── pom.xml
├── samples-beam
├── src
│ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── datastax
│ │ │ └── astra
│ │ │ └── beam
│ │ │ ├── fable
│ │ │ ├── FableDao.java
│ │ │ ├── FableDaoMapper.java
│ │ │ ├── FableDto.java
│ │ │ ├── Fable.java
│ │ │ ├── SimpleFableDbMapper.java
│ │ │ ├── FableDaoMapperFactoryFn.java
│ │ │ └── AbstractCassIOEntity.java
│ │ │ └── genai
│ │ │ ├── GenAI_01_ImportData.java
│ │ │ └── GenAI_02_CreateEmbeddings.java
│ │ └── resources
│ │ ├── logback-test.xml
│ │ └── fables_of_fontaine.csv
├── dependency-reduced-pom.xml
└── pom.xml
├── .gitignore
├── .gitpod.yml
├── .env
├── pom.xml
└── README.md
/img/home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/home.png
--------------------------------------------------------------------------------
/img/gitpod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/gitpod.png
--------------------------------------------------------------------------------
/slides/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/slides/slides.pdf
--------------------------------------------------------------------------------
/img/astra-to-csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/astra-to-csv.png
--------------------------------------------------------------------------------
/img/astra-to-gcs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/astra-to-gcs.png
--------------------------------------------------------------------------------
/img/csv-to-astra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/csv-to-astra.png
--------------------------------------------------------------------------------
/img/gcs-to-astra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/gcs-to-astra.png
--------------------------------------------------------------------------------
/img/astra-db-create.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/astra-db-create.png
--------------------------------------------------------------------------------
/img/astra-db-list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/astra-db-list.png
--------------------------------------------------------------------------------
/img/astra-db-describe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/astra-db-describe.png
--------------------------------------------------------------------------------
/img/astra-to-bigquery.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/astra-to-bigquery.png
--------------------------------------------------------------------------------
/img/bigquery-to-astra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/bigquery-to-astra.png
--------------------------------------------------------------------------------
/img/cassandra-to-astra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/cassandra-to-astra.png
--------------------------------------------------------------------------------
/img/gcp-create-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/gcp-create-project.png
--------------------------------------------------------------------------------
/img/astra-db-download-scb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastaxdevs/workshop-beam/main/img/astra-db-download-scb.png
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/resources/schema_language_codes.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "mode": "REQUIRED",
4 | "name": "code",
5 | "type": "STRING"
6 | },
7 | {
8 | "mode": "REQUIRED",
9 | "name": "language",
10 | "type": "STRING"
11 | }
12 | ]
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/fable/FableDao.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.fable;
2 |
3 | import com.datastax.oss.driver.api.mapper.annotations.Dao;
4 | import org.apache.beam.sdk.io.astra.db.mapping.AstraDbMapper;
5 |
6 | @Dao
7 | public interface FableDao extends AstraDbMapper {}
8 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/domains/LanguageCodeDao.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow.domains;
2 |
3 | import com.datastax.oss.driver.api.mapper.annotations.Dao;
4 | import org.apache.beam.sdk.io.astra.db.mapping.AstraDbMapper;
5 |
6 | @Dao
7 | public interface LanguageCodeDao extends AstraDbMapper {}
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | astra-sdk-java.wiki/
2 | .env
3 | .astrarc
4 | sec
5 |
6 |
7 | # eclipse conf file
8 | .settings
9 | .classpath
10 | .project
11 | .cache
12 |
13 | # idea conf files
14 | .idea
15 | *.ipr
16 | *.iws
17 | *.iml
18 |
19 | # building
20 | target
21 | build
22 | tmp
23 | releases
24 |
25 | # misc
26 | .DS_Store
27 |
28 | .factorypath
29 | .sts4-cache
30 | *.log
31 |
32 | release.properties
33 | pom.xml.releaseBackup
34 | dependency-reduced-pom*
--------------------------------------------------------------------------------
/.gitpod.yml:
--------------------------------------------------------------------------------
1 | tasks:
2 | - name: setup
3 | before: |
4 | printf 'export PATH="$HOME%s:$PATH"\n' "/.astra/cli" >> $HOME/.bashrc
5 | printf 'unset JAVA_TOOL_OPTIONS\n' >> $HOME/.bashrc
6 | curl -Ls "https://dtsx.io/get-astra-cli" | bash >> ./install.log
7 | command: |
8 | unset JAVA_TOOL_OPTIONS
9 | source /home/gitpod/.astra/cli/astra-init.sh
10 | clear
11 | astra
12 |
13 | vscode:
14 | extensions:
15 | - redhat.java
16 | - gabrielbb.vscode-lombok
17 |
18 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/fable/FableDaoMapper.java:
--------------------------------------------------------------------------------
1 |
2 | package com.datastax.astra.beam.fable;
3 |
4 | import com.datastax.oss.driver.api.core.CqlIdentifier;
5 | import com.datastax.oss.driver.api.mapper.annotations.DaoFactory;
6 | import com.datastax.oss.driver.api.mapper.annotations.DaoKeyspace;
7 | import com.datastax.oss.driver.api.mapper.annotations.Mapper;
8 |
9 | @Mapper
10 | public interface FableDaoMapper {
11 |
12 | @DaoFactory
13 | FableDao getFableDao(@DaoKeyspace CqlIdentifier keyspace);
14 |
15 | }
16 |
17 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/domains/LanguageCodeDaoMapper.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow.domains;
2 |
3 | import com.datastax.oss.driver.api.core.CqlIdentifier;
4 | import com.datastax.oss.driver.api.mapper.annotations.DaoFactory;
5 | import com.datastax.oss.driver.api.mapper.annotations.DaoKeyspace;
6 | import com.datastax.oss.driver.api.mapper.annotations.Mapper;
7 |
8 | @Mapper
9 | public interface LanguageCodeDaoMapper {
10 |
11 | @DaoFactory
12 | LanguageCodeDao getLanguageCodeDao(@DaoKeyspace CqlIdentifier keyspace);
13 |
14 | }
15 |
16 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | %msg%n
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/samples-beam/src/main/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | %d{HH:mm:ss.SSS} %magenta(%-5level) %cyan(%logger{30}) : %msg%n
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/samples-beam/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | astra-beam-starter
5 | com.datastax.astra
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 | samples-beam
10 | + samples-beam
11 | Sample Pipeline for Astra with Beam
12 |
13 |
14 |
15 | maven-compiler-plugin
16 |
17 |
18 |
19 | com.datastax.oss
20 | java-driver-mapper-processor
21 | ${cassandra-driver4x.version}
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 | 4.16.0
30 |
31 |
32 |
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/fable/FableDto.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.fable;
2 |
3 | import com.datastax.oss.driver.api.core.data.CqlVector;
4 | import lombok.Data;
5 |
6 | import java.io.Serializable;
7 | import java.util.List;
8 | import java.util.stream.Collectors;
9 | import java.util.stream.StreamSupport;
10 |
11 | @Data
12 | public class FableDto implements Serializable {
13 |
14 | private String title;
15 |
16 | private String documentId;
17 |
18 | private String document;
19 |
20 | private String metadata;
21 |
22 | private List vector;
23 |
24 | public FableDto() {}
25 |
26 | public FableDto(Fable p) {
27 | this.title = p.getTitle();
28 | this.documentId = p.getDocumentId();
29 | this.document = p.getDocument();
30 | this.metadata = p.getMetaData();
31 | this.vector = StreamSupport
32 | .stream(p.getVector().getValues().spliterator(), false)
33 | .collect(Collectors.toList());
34 | }
35 |
36 | public Fable toFable() {
37 | Fable p = new Fable(this.title, this.documentId, this.document);
38 | p.setMetaData(this.metadata);
39 | if (this.vector != null) {
40 | p.setVector(CqlVector.builder().addAll(this.vector).build());
41 | }
42 | return p;
43 | }
44 | }
45 |
46 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/domains/LanguageCodeDaoMapperFactoryFn.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow.domains;
2 |
3 | /*-
4 | * #%L
5 | * Beam SDK for Astra
6 | * --
7 | * Copyright (C) 2023 DataStax
8 | * --
9 | * Licensed under the Apache License, Version 2.0
10 | * You may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.datastax.oss.driver.api.core.CqlSession;
24 | import org.apache.beam.sdk.io.astra.db.mapping.AstraDbMapper;
25 | import org.apache.beam.sdk.transforms.SerializableFunction;
26 |
27 | public class LanguageCodeDaoMapperFactoryFn implements SerializableFunction> {
28 |
29 | @Override
30 | public LanguageCodeDao apply(CqlSession cqlSession) {
31 | return new LanguageCodeDaoMapperBuilder(cqlSession).build()
32 | .getLanguageCodeDao(cqlSession.getKeyspace().get());
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/fable/Fable.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.fable;
2 |
3 | import com.datastax.oss.driver.api.mapper.annotations.CqlName;
4 | import com.datastax.oss.driver.api.mapper.annotations.Entity;
5 | import lombok.Data;
6 | import lombok.NoArgsConstructor;
7 |
8 | @Entity
9 | @Data
10 | @NoArgsConstructor
11 | @CqlName("fable")
12 | public class Fable extends AbstractCassIOEntity {
13 |
14 | @CqlName("title")
15 | private String title;
16 |
17 | public Fable(String title, String rowId, String row) {
18 | this.title = title;
19 | this.documentId = rowId;
20 | this.document = row;
21 | }
22 |
23 | public static String cqlCreateTable(String keyspace) {
24 | StringBuilder cql = new StringBuilder();
25 | cql.append("CREATE TABLE IF NOT EXISTS %s.fable (");
26 | cql.append(" document_id %s PRIMARY KEY,");
27 | cql.append(" title TEXT,");
28 | cql.append(" document TEXT)");
29 | return String.format(cql.toString(), keyspace, "TEXT");
30 | }
31 |
32 | /**
33 | * Help generating the Target Table if it does not exist.
34 | *
35 | * @return
36 | * create statement
37 | */
38 | public static Fable fromCsvRow(String csvRow) {
39 | String[] chunks = csvRow.split(";");
40 | return new Fable(chunks[0], chunks[1], chunks[2]);
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/.idea/jarRepositories.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/img/open-btn.svg:
--------------------------------------------------------------------------------
1 |
7 |
10 |
18 |
19 |
21 | OPEN IN GOOGLE CLOUD SHELL
27 |
31 |
35 |
39 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/fable/SimpleFableDbMapper.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.fable;
2 |
3 | import com.datastax.oss.driver.api.core.CqlSession;
4 | import com.datastax.oss.driver.api.core.cql.Row;
5 | import com.datastax.oss.driver.api.core.cql.SimpleStatement;
6 | import org.apache.beam.sdk.io.astra.db.mapping.AstraDbMapper;
7 | import org.apache.beam.sdk.transforms.SerializableFunction;
8 |
9 | import java.util.concurrent.CompletionStage;
10 |
11 | public class SimpleFableDbMapper implements SerializableFunction> {
12 | @Override
13 | public AstraDbMapper apply(CqlSession cqlSession) {
14 | return new AstraDbMapper() {
15 |
16 | @Override
17 | public FableDto mapRow(Row row) {
18 | FableDto dto = new FableDto();
19 | dto.setTitle(row.getString("title"));
20 | dto.setDocument(row.getString("document"));
21 | dto.setDocumentId(row.getString("document_id"));
22 | return dto;
23 | }
24 |
25 | @Override
26 | public CompletionStage deleteAsync(FableDto entity) {
27 | return null;
28 | }
29 |
30 | @Override
31 | public CompletionStage saveAsync(FableDto entity) {
32 | return cqlSession.executeAsync(SimpleStatement.newInstance(
33 | "INSERT INTO fable (document_id, title, document) VALUES (?, ?, ?)",
34 | entity.getDocumentId(), entity.getTitle(), entity.getDocument()))
35 | .thenAccept(rs -> {});
36 | }
37 | };
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | ASTRA_DB_APPLICATION_TOKEN="AstraCS:uZclXTYecCAqPPjiNmkezapR:e87d6edb702acd87516e4ef78e0c0e515c32ab2c3529f5a3242688034149a0e4"
2 | ASTRA_DB_GRAPHQL_URL="https://4d8c7097-4759-427f-a9a5-d52d98757ea8-us-east1.apps.astra.datastax.com/api/graphql/beam"
3 | ASTRA_DB_GRAPHQL_URL_ADMIN="https://4d8c7097-4759-427f-a9a5-d52d98757ea8-us-east1.apps.astra.datastax.com/api/graphql-admin"
4 | ASTRA_DB_GRAPHQL_URL_PLAYGROUND="https://4d8c7097-4759-427f-a9a5-d52d98757ea8-us-east1.apps.astra.datastax.com/api/playground"
5 | ASTRA_DB_GRAPHQL_URL_SCHEMA="https://4d8c7097-4759-427f-a9a5-d52d98757ea8-us-east1.apps.astra.datastax.com/api/graphql-schema"
6 | ASTRA_DB_ID="4d8c7097-4759-427f-a9a5-d52d98757ea8"
7 | ASTRA_DB_KEYSPACE="beam"
8 | ASTRA_DB_REGION="us-east1"
9 | ASTRA_DB_REST_URL="https://4d8c7097-4759-427f-a9a5-d52d98757ea8-us-east1.apps.astra.datastax.com/api/rest"
10 | ASTRA_DB_REST_URL_SWAGGER="https://4d8c7097-4759-427f-a9a5-d52d98757ea8-us-east1.apps.astra.datastax.com/api/rest/swagger-ui/"
11 | ASTRA_DB_SECURE_BUNDLE_PATH="/Users/cedricklunven/.astra/scb/scb_4d8c7097-4759-427f-a9a5-d52d98757ea8_us-east1.zip"
12 | ASTRA_DB_SECURE_BUNDLE_URL="https://datastax-cluster-config-prod.s3.us-east-2.amazonaws.com/4d8c7097-4759-427f-a9a5-d52d98757ea8-1/secure-connect-workshop-beam.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA2AIQRQ76S2JCB77W%2F20230809%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20230809T123526Z&X-Amz-Expires=300&X-Amz-SignedHeaders=host&X-Amz-Signature=ba82c4379dd3b13454d2bbee80a5ae2c2af9d57e02b3f78e8170a23d7feabf18"
13 | ASTRA_ORG_ID="f9460f14-9879-4ebe-83f2-48d3f3dce13c"
14 | ASTRA_ORG_NAME="cedrick.lunven@datastax.com"
15 | ASTRA_ORG_TOKEN="AstraCS:uZclXTYecCAqPPjiNmkezapR:e87d6edb702acd87516e4ef78e0c0e515c32ab2c3529f5a3242688034149a0e4"
16 |
--------------------------------------------------------------------------------
/samples-dataflow/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | astra-beam-starter
5 | com.datastax.astra
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 | samples-dataflow
10 | + samples-dataflow
11 | Sample Pipeline for Astra with Dataflow
12 |
13 |
14 |
15 | maven-compiler-plugin
16 |
17 |
18 |
19 | com.datastax.oss
20 | java-driver-mapper-processor
21 | ${cassandra-driver4x.version}
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 | com.google.cloud
31 | libraries-bom
32 | 26.10.0
33 | pom
34 | compile
35 |
36 |
37 |
38 | 26.10.0
39 | 31.1-jre
40 | 2.10.0
41 | 4.16.0
42 | 2.26.1
43 |
44 |
45 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/utils/GoogleSecretManagerUtils.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow.utils;
2 |
3 | import com.google.cloud.secretmanager.v1.SecretManagerServiceClient;
4 |
5 | import java.io.IOException;
6 |
7 | /**
8 | * Utilities to read secrets in Google Secret Manager.
9 | */
10 | public class GoogleSecretManagerUtils {
11 |
12 | /**
13 | * Secret Manager Client
14 | */
15 | public static SecretManagerServiceClient client;
16 |
17 | /**
18 | * Hide Default constructor for utiltiy class
19 | */
20 | private GoogleSecretManagerUtils() {
21 | }
22 |
23 | /**
24 | * Access the Token.
25 | *
26 | * @param secretResourceId
27 | * token resource Id
28 | * @return
29 | * token Value
30 | */
31 | public static final String readTokenSecret(String secretResourceId) {
32 | try {
33 | if (client == null) client = SecretManagerServiceClient.create();
34 | return client
35 | .accessSecretVersion(secretResourceId)
36 | .getPayload().getData()
37 | .toStringUtf8();
38 | } catch (IOException e) {
39 | throw new IllegalStateException("Cannot read google secrets", e);
40 | }
41 | }
42 |
43 | /**
44 | * Access Secure Bundle.
45 | *
46 | * @param secretResourceId
47 | * Secure Bundle resource Id
48 | * @return
49 | * Secure Bundle Value
50 | */
51 | public static final byte[] readSecureBundleSecret(String secretResourceId) {
52 | try {
53 | if (client == null) client = SecretManagerServiceClient.create();
54 | return client
55 | .accessSecretVersion(secretResourceId)
56 | .getPayload().getData()
57 | .toByteArray();
58 | } catch (IOException e) {
59 | throw new IllegalStateException("Cannot read google secrets", e);
60 | }
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/utils/GoogleBigQueryUtils.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow.utils;
2 |
3 | import com.google.api.services.bigquery.model.TableFieldSchema;
4 | import com.google.api.services.bigquery.model.TableSchema;
5 | import com.google.gson.Gson;
6 | import com.google.gson.reflect.TypeToken;
7 |
8 | import java.io.InputStream;
9 | import java.lang.reflect.Type;
10 | import java.nio.charset.StandardCharsets;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 | import java.util.Scanner;
14 |
15 | /**
16 | * Utility class to work with Google BigQuery.
17 | */
18 | public class GoogleBigQueryUtils {
19 |
20 | /**
21 | * Utilities for BigQuery
22 | */
23 | private GoogleBigQueryUtils() {}
24 |
25 | /**
26 | * Get Table Schema from JSON.
27 | *
28 | * @param jsonFileName
29 | * json file
30 | * @return
31 | * table Schema
32 | */
33 | public static TableSchema readTableSchemaFromJsonFile(String jsonFileName) {
34 | Type listType = new TypeToken>(){}.getType();
35 | List yourClassList = new Gson().fromJson(readJsonFile(jsonFileName), listType);
36 | TableSchema tableSchema = new TableSchema();
37 | tableSchema.setFields(new Gson().fromJson(readJsonFile(jsonFileName), listType));
38 | return tableSchema;
39 | }
40 |
41 | /**
42 | * Read a JSON FILE.
43 | * @param filePath
44 | * current json
45 | * @return
46 | * content of the JSON
47 | */
48 | private static String readJsonFile(String filePath) {
49 | // Get the InputStream for the file from the classpath
50 | InputStream inputStream = GoogleBigQueryUtils.class.getClassLoader().getResourceAsStream(filePath);
51 | if (inputStream != null) {
52 | try (Scanner scanner = new Scanner(inputStream, StandardCharsets.UTF_8.name())) {
53 | return scanner.useDelimiter("\\A").next();
54 | }
55 | } else {
56 | throw new IllegalArgumentException("Cannot read Json Schema File");
57 | }
58 | }
59 | }
--------------------------------------------------------------------------------
/samples-beam/src/main/resources/fables_of_fontaine.csv:
--------------------------------------------------------------------------------
1 | THE FLY AND THE GAME;1;A knight of powder-horn and shot
2 | THE FLY AND THE GAME;2;Once fill'd his bag--as I would not,
3 | THE FLY AND THE GAME;3;Unless the feelings of my breast
4 | THE FLY AND THE GAME;4;By poverty were sorely press'd--
5 | THE FLY AND THE GAME;5;With birds and squirrels for the spits
6 | THE FLY AND THE GAME;6;Of certain gormandizing cits.
7 | THE FLY AND THE GAME;7;With merry heart the fellow went
8 | THE FLY AND THE GAME;8;Direct to Mr. Centpercent,
9 | THE FLY AND THE GAME;9;Who loved, as well was understood,
10 | THE FLY AND THE GAME;10;Whatever game was nice and good.
11 | THE FLY AND THE GAME;11;This gentleman, with knowing air,
12 | THE FLY AND THE GAME;12;Survey'd the dainty lot with care,
13 | THE FLY AND THE GAME;13;Pronounced it racy, rich, and rare,
14 | THE FLY AND THE GAME;14;And call'd his wife, to know her wishes
15 | THE FLY AND THE GAME;15;About its purchase for their dishes.
16 | THE FLY AND THE GAME;16;The lady thought the creatures prime,
17 | THE FLY AND THE GAME;17;And for their dinner just in time
18 | THE FLY AND THE GAME;18;So sweet they were, and delicate,
19 | THE FLY AND THE GAME;19;For dinner she could hardly wait.
20 | THE FLY AND THE GAME;20;But now there came--could luck be worse?--
21 | THE FLY AND THE GAME;21;Just as the buyer drew his purse,
22 | THE FLY AND THE GAME;22;A bulky fly, with solemn buzz,
23 | THE FLY AND THE GAME;23;And smelt, as an inspector does,
24 | THE FLY AND THE GAME;24;This bird and that, and said the meat--
25 | THE FLY AND THE GAME;25;But here his words I won't repeat--
26 | THE FLY AND THE GAME;26;Was anything but fit to eat.
27 | THE FLY AND THE GAME;27;'Ah!' cried the lady, 'there's a fly
28 | THE FLY AND THE GAME;28;I never knew to tell a lie
29 | THE FLY AND THE GAME;29;His coat, you see, is bottle-green
30 | THE FLY AND THE GAME;30;He knows a thing or two I ween
31 | THE FLY AND THE GAME;31;My dear, I beg you, do not buy:
32 | THE FLY AND THE GAME;32;Such game as this may suit the dogs.'
33 | THE FLY AND THE GAME;33;So on our peddling sportsman jogs,
34 | THE FLY AND THE GAME;34;His soul possess'd of this surmise,
35 | THE FLY AND THE GAME;35;About some men, as well as flies:
36 | THE FLY AND THE GAME;36;A filthy taint they soonest find
37 | THE FLY AND THE GAME;37;Who are to relish filth inclined.
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/fable/FableDaoMapperFactoryFn.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.fable;
2 |
3 | /*-
4 | * #%L
5 | * Beam SDK for Astra
6 | * --
7 | * Copyright (C) 2023 DataStax
8 | * --
9 | * Licensed under the Apache License, Version 2.0
10 | * You may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 | import com.datastax.oss.driver.api.core.CqlSession;
23 | import com.datastax.oss.driver.api.core.cql.Row;
24 | import org.apache.beam.sdk.io.astra.db.mapping.AstraDbMapper;
25 | import org.apache.beam.sdk.transforms.SerializableFunction;
26 |
27 | import java.util.concurrent.CompletionStage;
28 |
29 | public class FableDaoMapperFactoryFn implements SerializableFunction> {
30 |
31 | @Override
32 | public AstraDbMapper apply(CqlSession cqlSession) {
33 | // Product is not serializable, so we need to use the DAO to map to a serializable object
34 | FableDao dao = new FableDaoMapperBuilder(cqlSession).build()
35 | .getFableDao(cqlSession.getKeyspace().get());
36 |
37 | // Mapping to Serialize
38 | return new AstraDbMapperDelegate(dao);
39 | }
40 |
41 | public static class AstraDbMapperDelegate implements AstraDbMapper {
42 |
43 | FableDao dao;
44 |
45 | public AstraDbMapperDelegate(FableDao dao) {
46 | this.dao = dao;
47 | }
48 |
49 | @Override
50 | public FableDto mapRow(Row row) {
51 | return new FableDto(dao.mapRow(row));
52 | }
53 |
54 | @Override
55 | public CompletionStage deleteAsync(FableDto entity) {
56 | return dao.deleteAsync(entity.toFable());
57 | }
58 |
59 | @Override
60 | public CompletionStage saveAsync(FableDto entity) {
61 | return dao.saveAsync(entity.toFable());
62 | }
63 | }
64 |
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/samples-dataflow/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 | samples-dataflow
7 | + samples-dataflow
8 | Sample Pipeline for Astra with Dataflow
9 |
10 |
11 | com.datastax.astra
12 | astra-beam-starter
13 | 1.0-SNAPSHOT
14 |
15 |
16 |
17 | 31.1-jre
18 | 26.10.0
19 | 2.10.0
20 | 2.26.1
21 | 4.16.0
22 |
23 |
24 |
25 |
26 |
27 | com.datastax.astra
28 | beam-sdks-java-io-astra
29 |
30 |
31 |
32 | com.google.cloud
33 | libraries-bom
34 | ${google.cloud.librairies}
35 | pom
36 |
37 |
38 |
39 | com.google.guava
40 | guava
41 | ${guava.version}
42 |
43 |
44 |
45 | com.google.cloud
46 | google-cloud-secretmanager
47 | ${google.cloud-secretmanager.version}
48 |
49 |
50 |
51 | com.google.cloud
52 | google-cloud-bigquery
53 | ${google.cloud-bigquery.version}
54 |
55 |
56 |
57 | org.apache.beam
58 | beam-sdks-java-io-google-cloud-platform
59 |
60 |
61 |
62 | org.apache.beam
63 | beam-runners-google-cloud-dataflow-java
64 | runtime
65 |
66 |
67 |
68 |
69 | org.slf4j
70 | slf4j-api
71 |
72 |
73 | ch.qos.logback
74 | logback-classic
75 |
76 |
77 |
78 |
79 |
80 |
81 | org.apache.maven.plugins
82 | maven-compiler-plugin
83 |
84 |
85 |
86 | com.datastax.oss
87 | java-driver-mapper-processor
88 | ${cassandra-driver4x.version}
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/genai/GenAI_01_ImportData.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.genai;
2 |
3 | import com.datastax.astra.beam.fable.Fable;
4 | import com.datastax.astra.beam.fable.FableDaoMapperFactoryFn;
5 | import com.datastax.astra.beam.fable.FableDto;
6 | import com.datastax.astra.beam.fable.SimpleFableDbMapper;
7 | import lombok.extern.slf4j.Slf4j;
8 | import org.apache.beam.sdk.Pipeline;
9 | import org.apache.beam.sdk.io.TextIO;
10 | import org.apache.beam.sdk.io.astra.db.AstraDbIO;
11 | import org.apache.beam.sdk.io.astra.db.options.AstraDbReadOptions;
12 | import org.apache.beam.sdk.io.astra.db.transforms.RunCqlQueryFn;
13 | import org.apache.beam.sdk.io.astra.db.utils.AstraSecureConnectBundleUtils;
14 | import org.apache.beam.sdk.options.Description;
15 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
16 | import org.apache.beam.sdk.options.Validation;
17 | import org.apache.beam.sdk.transforms.MapElements;
18 | import org.apache.beam.sdk.transforms.SimpleFunction;
19 |
/**
 * Pipeline step 1: imports fables from a local CSV file into an Astra DB
 * table, mapping each CSV row to a {@link FableDto}.
 *
 * NOTE(review): generic type parameters appear stripped by the dump that
 * produced this file ({@code SimpleFunction} and {@code AstraDbIO.write()}
 * are raw below; likely SimpleFunction<String, FableDto> and
 * AstraDbIO.<FableDto>write()) — confirm against the original source.
 **/
@Slf4j
public class GenAI_01_ImportData {

    /**
     * Pipeline options: Astra connection settings plus the CSV input path.
     */
    public interface CsvImportOption extends AstraDbReadOptions {

        @Validation.Required
        @Description("Path of file to read from")
        String getCsvInput();

        @SuppressWarnings("unused")
        void setCsvInput(String csvFile);
    }

    /**
     * Main execution: parses and validates options, loads the secure connect
     * bundle from disk, then runs a 3-step pipeline
     * (read CSV -> map rows to beans -> write to Astra).
     */
    public static void main(String[] args) {
        // Parse and Validate Parameters
        CsvImportOption options = PipelineOptionsFactory
                .fromArgs(args).withValidation()
                .as(CsvImportOption.class);

        // Load Secure Bundle from Local File System
        byte[] scbZip = AstraSecureConnectBundleUtils
                .loadFromFilePath(options.getAstraSecureConnectBundle());

        long top = System.currentTimeMillis();
        try {
            log.info("Parameters validations is successful, launching pipeline");
            Pipeline pipelineWrite = Pipeline.create(options);
            pipelineWrite
                // Read a CSV, one element per line
                .apply("Read Data From Disk",
                        TextIO.read().from(options.getCsvInput()))

                // Convert each CSV row to a Fable bean wrapped in a FableDto
                // (the previous comment mentioning LanguageCode was a copy-paste error)
                .apply("Convert To Cassandra Bean", MapElements.via(new MapCsvRowToFable()))

                // Insert Results Into Astra
                .apply("Write Into Astra", AstraDbIO.write()
                        .withToken(options.getAstraToken())
                        .withSecureConnectBundle(scbZip)
                        .withKeyspace(options.getAstraKeyspace())
                        .withMapperFactoryFn(new SimpleFableDbMapper())
                        .withEntity(FableDto.class));

            pipelineWrite.run().waitUntilFinish();

        } finally {
            // Always release the shared session, even when the pipeline fails
            log.info("Pipeline finished in {} millis", System.currentTimeMillis()-top);
            AstraDbIO.close();
        }
    }

    /** Maps one raw CSV line to a {@link FableDto}. */
    private static class MapCsvRowToFable extends SimpleFunction {
        @Override
        public FableDto apply(String input) {
            return new FableDto(Fable.fromCsvRow(input));
        }
    }

}
88 |
--------------------------------------------------------------------------------
/samples-beam/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 | samples-beam
7 | + samples-beam
8 | Sample Pipeline for Astra with Beam
9 |
10 |
11 | com.datastax.astra
12 | astra-beam-starter
13 | 1.0-SNAPSHOT
14 |
15 |
16 |
17 | 4.16.0
18 | 1.18.28
19 | 4.1.94.Final
20 | 0.12.0
21 |
22 |
23 |
24 |
25 |
26 | com.datastax.astra
27 | beam-sdks-java-io-astra
28 |
29 |
30 |
31 |
32 | org.apache.beam
33 | beam-runners-direct-java
34 | runtime
35 |
36 |
37 |
38 | org.slf4j
39 | slf4j-api
40 |
41 |
42 | ch.qos.logback
43 | logback-classic
44 | runtime
45 |
46 |
47 | ch.qos.logback
48 | logback-core
49 | runtime
50 |
51 |
52 |
53 |
54 | com.theokanning.openai-gpt3-java
55 | service
56 | ${openai-java.version}
57 |
58 |
59 | io.netty
60 | netty-all
61 | ${netty.version}
62 |
63 |
64 | org.projectlombok
65 | lombok
66 | provided
67 | ${lombok.version}
68 |
69 |
70 | com.google.apis
71 | google-api-services-bigquery
72 | v2-rev20230422-2.0.0
73 | compile
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 | org.apache.maven.plugins
82 | maven-compiler-plugin
83 |
84 |
85 |
86 | org.projectlombok
87 | lombok
88 | ${lombok.version}
89 |
90 |
91 | com.datastax.oss
92 | java-driver-mapper-processor
93 | ${cassandra-driver4x.version}
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/fable/AbstractCassIOEntity.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.fable;
2 |
3 | import com.datastax.oss.driver.api.core.data.CqlVector;
4 | import com.datastax.oss.driver.api.mapper.annotations.CqlName;
5 | import com.datastax.oss.driver.api.mapper.annotations.PartitionKey;
6 |
7 | /**
8 | * CassIO Expects a specific format for the Entity.
9 | */
10 | public abstract class AbstractCassIOEntity {
11 |
12 | @PartitionKey
13 | @CqlName("document_id")
14 | protected String documentId;
15 |
16 | @CqlName("document")
17 | protected String document;
18 |
19 | @CqlName("embedding_vector")
20 | protected CqlVector vector = CqlVector.builder().build();
21 |
22 | @CqlName("metadata_blob")
23 | protected String metaData;
24 |
25 | public static String cqlAlterTableForVectorSearch1(String keyspace, String tableName, int dimension) {
26 | StringBuilder cql = new StringBuilder();
27 | cql.append("ALTER TABLE %s.%s ADD embedding_vector VECTOR");
28 | return String.format(cql.toString(), keyspace, tableName, String.valueOf(dimension));
29 | }
30 |
31 | public static String cqlAlterTableForVectorSearch2(String keyspace, String tableName) {
32 | StringBuilder cql = new StringBuilder();
33 | cql.append("ALTER TABLE %s.%s ADD metadata_blob TEXT");
34 | return String.format(cql.toString(), keyspace, tableName);
35 | }
36 |
37 | public static String cqlCreateIndexForVectorSearch(String keyspace, String tableName) {
38 | StringBuilder cql = new StringBuilder();
39 | cql.append("CREATE CUSTOM INDEX IF NOT EXISTS %s_embedding_vector_idx ON %s.%s (embedding_vector)");
40 | cql.append("USING 'org.apache.cassandra.index.sai.StorageAttachedIndex';");
41 | return String.format(cql.toString(), tableName, keyspace, tableName);
42 | }
43 |
44 | /**
45 | * Gets documentId
46 | *
47 | * @return value of documentId
48 | */
49 | public String getDocumentId() {
50 | return documentId;
51 | }
52 |
53 | /**
54 | * Set value for documentId
55 | *
56 | * @param documentId
57 | * new value for documentId
58 | */
59 | public void setDocumentId(String documentId) {
60 | this.documentId = documentId;
61 | }
62 |
63 | /**
64 | * Gets document
65 | *
66 | * @return value of document
67 | */
68 | public String getDocument() {
69 | return document;
70 | }
71 |
72 | /**
73 | * Set value for document
74 | *
75 | * @param document
76 | * new value for document
77 | */
78 | public void setDocument(String document) {
79 | this.document = document;
80 | }
81 |
82 | /**
83 | * Gets vector
84 | *
85 | * @return value of vector
86 | */
87 | public CqlVector getVector() {
88 | return vector;
89 | }
90 |
91 | /**
92 | * Set value for vector
93 | *
94 | * @param vector
95 | * new value for vector
96 | */
97 | public void setVector(CqlVector vector) {
98 | this.vector = vector;
99 | }
100 |
101 | /**
102 | * Gets metaData
103 | *
104 | * @return value of metaData
105 | */
106 | public String getMetaData() {
107 | return metaData;
108 | }
109 |
110 | /**
111 | * Set value for metaData
112 | *
113 | * @param metaData
114 | * new value for metaData
115 | */
116 | public void setMetaData(String metaData) {
117 | this.metaData = metaData;
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/samples-dataflow/src/test/resources/language-codes.csv:
--------------------------------------------------------------------------------
1 | aa,Afar
2 | ab,Abkhazian
3 | ae,Avestan
4 | af,Afrikaans
5 | ak,Akan
6 | am,Amharic
7 | an,Aragonese
8 | ar,Arabic
9 | as,Assamese
10 | av,Avaric
11 | ay,Aymara
12 | az,Azerbaijani
13 | ba,Bashkir
14 | be,Belarusian
15 | bg,Bulgarian
16 | bh,Bihari languages
17 | bi,Bislama
18 | bm,Bambara
19 | bn,Bengali
20 | bo,Tibetan
21 | br,Breton
22 | bs,Bosnian
23 | ca,Catalan; Valencian
24 | ce,Chechen
25 | ch,Chamorro
26 | co,Corsican
27 | cr,Cree
28 | cs,Czech
29 | cu,Church Slavic
30 | cv,Chuvash
31 | cy,Welsh
32 | da,Danish
33 | de,German
34 | dv,Divehi; Dhivehi; Maldivian
35 | dz,Dzongkha
36 | ee,Ewe
37 | el,"Greek, Modern (1453-)"
38 | en,English
39 | eo,Esperanto
40 | es,Spanish; Castilian
41 | et,Estonian
42 | eu,Basque
43 | fa,Persian
44 | ff,Fulah
45 | fi,Finnish
46 | fj,Fijian
47 | fo,Faroese
48 | fr,French
49 | fy,Western Frisian
50 | ga,Irish
51 | gd,Gaelic; Scottish Gaelic
52 | gl,Galician
53 | gn,Guarani
54 | gu,Gujarati
55 | gv,Manx
56 | ha,Hausa
57 | he,Hebrew
58 | hi,Hindi
59 | ho,Hiri Motu
60 | hr,Croatian
61 | ht,Haitian; Haitian Creole
62 | hu,Hungarian
63 | hy,Armenian
64 | hz,Herero
65 | ia,Interlingua (International Auxiliary Language Association)
66 | id,Indonesian
67 | ie,Interlingue; Occidental
68 | ig,Igbo
69 | ii,Sichuan Yi; Nuosu
70 | ik,Inupiaq
71 | io,Ido
72 | is,Icelandic
73 | it,Italian
74 | iu,Inuktitut
75 | ja,Japanese
76 | jv,Javanese
77 | ka,Georgian
78 | kg,Kongo
79 | ki,Kikuyu; Gikuyu
80 | kj,Kuanyama; Kwanyama
81 | kk,Kazakh
82 | kl,Kalaallisut; Greenlandic
83 | km,Central Khmer
84 | kn,Kannada
85 | ko,Korean
86 | kr,Kanuri
87 | ks,Kashmiri
88 | ku,Kurdish
89 | kv,Komi
90 | kw,Cornish
91 | ky,Kirghiz; Kyrgyz
92 | la,Latin
93 | lb,Luxembourgish; Letzeburgesch
94 | lg,Ganda
95 | li,Limburgan; Limburger; Limburgish
96 | ln,Lingala
97 | lo,Lao
98 | lt,Lithuanian
99 | lu,Luba-Katanga
100 | lv,Latvian
101 | mg,Malagasy
102 | mh,Marshallese
103 | mi,Maori
104 | mk,Macedonian
105 | ml,Malayalam
106 | mn,Mongolian
107 | mr,Marathi
108 | ms,Malay
109 | mt,Maltese
110 | my,Burmese
111 | na,Nauru
112 | nb,"Bokmål, Norwegian; Norwegian Bokmål"
113 | nd,"Ndebele, North; North Ndebele"
114 | ne,Nepali
115 | ng,Ndonga
116 | nl,Dutch; Flemish
117 | nn,"Norwegian Nynorsk; Nynorsk, Norwegian"
118 | no,Norwegian
119 | nr,"Ndebele, South; South Ndebele"
120 | nv,Navajo; Navaho
121 | ny,Chichewa; Chewa; Nyanja
122 | oc,Occitan (post 1500)
123 | oj,Ojibwa
124 | om,Oromo
125 | or,Oriya
126 | os,Ossetian; Ossetic
127 | pa,Panjabi; Punjabi
128 | pi,Pali
129 | pl,Polish
130 | ps,Pushto; Pashto
131 | pt,Portuguese
132 | qu,Quechua
133 | rm,Romansh
134 | rn,Rundi
135 | ro,Romanian; Moldavian; Moldovan
136 | ru,Russian
137 | rw,Kinyarwanda
138 | sa,Sanskrit
139 | sc,Sardinian
140 | sd,Sindhi
141 | se,Northern Sami
142 | sg,Sango
143 | si,Sinhala; Sinhalese
144 | sk,Slovak
145 | sl,Slovenian
146 | sm,Samoan
147 | sn,Shona
148 | so,Somali
149 | sq,Albanian
150 | sr,Serbian
151 | ss,Swati
152 | st,"Sotho, Southern"
153 | su,Sundanese
154 | sv,Swedish
155 | sw,Swahili
156 | ta,Tamil
157 | te,Telugu
158 | tg,Tajik
159 | th,Thai
160 | ti,Tigrinya
161 | tk,Turkmen
162 | tl,Tagalog
163 | tn,Tswana
164 | to,Tonga (Tonga Islands)
165 | tr,Turkish
166 | ts,Tsonga
167 | tt,Tatar
168 | tw,Twi
169 | ty,Tahitian
170 | ug,Uighur; Uyghur
171 | uk,Ukrainian
172 | ur,Urdu
173 | uz,Uzbek
174 | ve,Venda
175 | vi,Vietnamese
176 | vo,Volapük
177 | wa,Walloon
178 | wo,Wolof
179 | xh,Xhosa
180 | yi,Yiddish
181 | yo,Yoruba
182 | za,Zhuang; Chuang
183 | zh,Chinese
184 | zu,Zulu
185 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/domains/LanguageCode.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow.domains;
2 |
3 | import com.datastax.oss.driver.api.core.type.DataTypes;
4 | import com.datastax.oss.driver.api.mapper.annotations.CqlName;
5 | import com.datastax.oss.driver.api.mapper.annotations.Entity;
6 | import com.datastax.oss.driver.api.mapper.annotations.PartitionKey;
7 | import com.datastax.oss.driver.api.querybuilder.SchemaBuilder;
8 | import com.google.api.services.bigquery.model.TableRow;
9 |
10 | import java.io.Serializable;
11 |
12 | /**
13 | * DTO for Language Code.
14 | */
15 | @Entity
16 | @CqlName(LanguageCode.TABLE_NAME)
17 | public class LanguageCode implements Serializable {
18 |
19 | /** Constants for mapping. */
20 | public static final String TABLE_NAME = "languages";
21 |
22 | @PartitionKey
23 | @CqlName("code")
24 | private String code;
25 |
26 | @CqlName("language")
27 | private String language;
28 |
29 | /**
30 | * Constructor
31 | */
32 | public LanguageCode() {
33 | }
34 |
35 | /**
36 | * Full Fledge constructor
37 | */
38 | public LanguageCode(String code, String language) {
39 | this.code = code;
40 | this.language = language;
41 | }
42 |
43 | /**
44 | * Help generating the Target Table if it does not exist.
45 | *
46 | * @return
47 | * create statement
48 | */
49 | public static LanguageCode fromCsvRow(String csvRow) {
50 | String[] chunks = csvRow.split(",");
51 | return new LanguageCode(chunks[0], chunks[1]);
52 | }
53 |
54 | /**
55 | * Convert to CSV Row.
56 | *
57 | * @return
58 | * csv Row.
59 | */
60 | public String toCsvRow() {
61 | return code + "," + language;
62 | }
63 |
64 | /**
65 | * Read From BigQuery table.
66 | *
67 | * @param row
68 | * current big query row
69 | * @return
70 | * current bean.
71 | */
72 | public static LanguageCode fromBigQueryTableRow(TableRow row) {
73 | return new LanguageCode((String) row.get("code"), (String) row.get("language"));
74 | }
75 |
76 | /**
77 | * Convert to BigQuery TableRow.
78 | * @return
79 | * big query table row
80 | */
81 | public TableRow toBigQueryTableRow() {
82 | TableRow row = new TableRow();
83 | row.set("code", this.code);
84 | row.set("language", this.language);
85 | return row;
86 | }
87 |
88 | /**
89 | * Map Csv Row to LanguageCode.
90 | * @param csvRow
91 | * @return
92 | */
93 | public static LanguageCode fromCsv(String csvRow) {
94 | String[] chunks = csvRow.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
95 | return new LanguageCode(chunks[0], chunks[1]);
96 | }
97 |
98 | /**
99 | * Help generating the Target Table if it does not exist.
100 | *
101 | * @return
102 | * create statement
103 | */
104 | public static String cqlCreateTable() {
105 | return SchemaBuilder.createTable(TABLE_NAME)
106 | .ifNotExists()
107 | .withPartitionKey("code", DataTypes.TEXT)
108 | .withColumn("language", DataTypes.TEXT)
109 | .toString();
110 | }
111 |
112 | /**
113 | * Gets code
114 | *
115 | * @return value of code
116 | */
117 | public String getCode() {
118 | return code;
119 | }
120 |
121 | /**
122 | * Set value for code
123 | *
124 | * @param code
125 | * new value for code
126 | */
127 | public void setCode(String code) {
128 | this.code = code;
129 | }
130 |
131 | /**
132 | * Gets language
133 | *
134 | * @return value of language
135 | */
136 | public String getLanguage() {
137 | return language;
138 | }
139 |
140 | /**
141 | * Set value for language
142 | *
143 | * @param language
144 | * new value for language
145 | */
146 | public void setLanguage(String language) {
147 | this.language = language;
148 | }
149 |
150 |
151 | }
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/transforms/CassandraToBigQuerySchemaMapperFn.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow.transforms;
2 |
3 |
4 | import com.datastax.oss.driver.api.core.CqlSession;
5 | import com.datastax.oss.driver.api.core.metadata.Metadata;
6 | import com.datastax.oss.driver.api.core.metadata.schema.ColumnMetadata;
7 | import com.datastax.oss.driver.api.core.metadata.schema.KeyspaceMetadata;
8 | import com.datastax.oss.driver.api.core.metadata.schema.TableMetadata;
9 | import com.datastax.oss.driver.api.core.type.DataType;
10 | import com.datastax.oss.driver.api.core.type.ListType;
11 | import com.datastax.oss.driver.api.core.type.SetType;
12 | import com.datastax.oss.driver.api.core.type.TupleType;
13 | import com.datastax.oss.protocol.internal.ProtocolConstants;
14 | import com.google.api.services.bigquery.model.TableFieldSchema;
15 | import com.google.api.services.bigquery.model.TableSchema;
16 | import com.google.cloud.bigquery.StandardSQLTypeName;
17 | import org.apache.beam.sdk.io.astra.db.AstraDbIO;
18 | import org.apache.beam.sdk.io.astra.db.CqlSessionHolder;
19 | import org.apache.beam.sdk.transforms.SerializableFunction;
20 |
21 | import java.util.ArrayList;
22 | import java.util.List;
23 | import java.util.stream.Collectors;
24 |
/**
 * Builds a BigQuery {@link TableSchema} mirroring the schema of a Cassandra
 * table, reading the table metadata from a live CQL session.
 *
 * NOTE(review): generic type parameters appear stripped by the dump that
 * produced this file — the signature below reads
 * "implements SerializableFunction, TableSchema >", most likely
 * SerializableFunction<AstraDbIO.Read<?>, TableSchema> originally; confirm
 * against the original source.
 */
public class CassandraToBigQuerySchemaMapperFn implements SerializableFunction, TableSchema > {

    /**
     * Current table.
     */
    private final String table;

    /**
     * Current keyspace.
     */
    private final String keyspace;

    /**
     * Access Table Schema.
     *
     * @param keyspace
     *      current keyspace
     * @param table
     *      current table
     */
    public CassandraToBigQuerySchemaMapperFn(String keyspace, String table) {
        this.keyspace = keyspace;
        this.table = table;
    }

    /**
     * Resolves a CQL session from the Astra source, then derives the BigQuery
     * schema from the Cassandra table metadata.
     */
    @Override
    public TableSchema apply(AstraDbIO.Read> astraSource) {
        return readTableSchemaFromCassandraTable(
                CqlSessionHolder.getCqlSession(astraSource), keyspace, table);
    }

    /**
     * This function is meant to build a schema for destination table based on cassandra table schema.
     *
     * NOTE(review): the Optional.get() calls below throw NoSuchElementException
     * when the keyspace or table does not exist — confirm this is the intended
     * failure mode.
     *
     * @param session
     *      current session
     * @param keyspace
     *      cassandra keyspace
     * @param table
     *      cassandra table
     * @return the BigQuery schema derived from the Cassandra table
     */
    public static TableSchema readTableSchemaFromCassandraTable(CqlSession session, String keyspace, String table) {
        Metadata clusterMetadata = session.getMetadata();
        KeyspaceMetadata keyspaceMetadata = clusterMetadata.getKeyspace(keyspace).get();
        TableMetadata tableMetadata = keyspaceMetadata.getTable(table).get();
        TableSchema tableSchema = new TableSchema();
        // NOTE(review): raw List — element type (TableFieldSchema) likely
        // stripped by the dump.
        List fieldList = new ArrayList<>();
        for(ColumnMetadata columnMetadata : tableMetadata.getColumns().values()) {
            TableFieldSchema fieldSchema = new TableFieldSchema();
            fieldSchema.setName(columnMetadata.getName().toString());
            fieldSchema.setType(mapCassandraToBigQueryType(columnMetadata.getType()).name());
            // Default mode; primary-key columns are mandatory in the source table
            fieldSchema.setMode("NULLABLE");
            if (tableMetadata.getPrimaryKey().contains(columnMetadata)) {
                fieldSchema.setMode("REQUIRED");
            }
            // Collection-typed columns become repeated BigQuery fields. Note
            // this runs after the primary-key check, so it would override
            // REQUIRED if a collection were ever part of the key.
            int protocolCode = columnMetadata.getType().getProtocolCode();
            if (protocolCode == ProtocolConstants.DataType.LIST ||
                protocolCode == ProtocolConstants.DataType.SET ||
                protocolCode == ProtocolConstants.DataType.TUPLE ||
                protocolCode == ProtocolConstants.DataType.MAP) {
                fieldSchema.setMode("REPEATED");
            }
            fieldList.add(fieldSchema);
        }
        tableSchema.setFields(fieldList);
        return tableSchema;
    }

    /**
     * Map DataType to BigQuery StandardSQLTypeName.
     *
     * @param dataType
     *      cassandra type
     * @return SQL Type.
     */
    private static StandardSQLTypeName mapCassandraToBigQueryType(DataType dataType) {
        switch (dataType.getProtocolCode()) {
            case ProtocolConstants.DataType.BOOLEAN:
                return StandardSQLTypeName.BOOL;
            case ProtocolConstants.DataType.INT:
            case ProtocolConstants.DataType.BIGINT:
                return StandardSQLTypeName.INT64;
            case ProtocolConstants.DataType.FLOAT:
            case ProtocolConstants.DataType.DOUBLE:
                return StandardSQLTypeName.FLOAT64;
            case ProtocolConstants.DataType.TIMESTAMP:
                return StandardSQLTypeName.TIMESTAMP;
            case ProtocolConstants.DataType.LIST:
                // Collections map to their element type; the REPEATED mode is
                // set by the caller
                ListType listType = (ListType) dataType;
                return mapCassandraToBigQueryType(listType.getElementType());
            case ProtocolConstants.DataType.SET:
                SetType setType = (SetType) dataType;
                return mapCassandraToBigQueryType(setType.getElementType());
            case ProtocolConstants.DataType.TUPLE:
            case ProtocolConstants.DataType.MAP:
                return StandardSQLTypeName.STRUCT;
            case ProtocolConstants.DataType.ASCII:
            case ProtocolConstants.DataType.VARINT:
                // Add more cases for other Cassandra types as needed
            default:
                // Default to STRING if no mapping found
                return StandardSQLTypeName.STRING;
        }
    }
}
--------------------------------------------------------------------------------
/samples-beam/src/main/java/com/datastax/astra/beam/genai/GenAI_02_CreateEmbeddings.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.beam.genai;
2 |
3 | import com.datastax.astra.beam.fable.Fable;
4 | import com.datastax.astra.beam.fable.FableDaoMapperFactoryFn;
5 | import com.datastax.astra.beam.fable.FableDto;
6 | import com.datastax.astra.beam.fable.SimpleFableDbMapper;
7 | import com.theokanning.openai.embedding.EmbeddingRequest;
8 | import com.theokanning.openai.service.OpenAiService;
9 | import lombok.extern.slf4j.Slf4j;
10 | import org.apache.beam.sdk.Pipeline;
11 | import org.apache.beam.sdk.coders.SerializableCoder;
12 | import org.apache.beam.sdk.io.astra.db.AstraDbIO;
13 | import org.apache.beam.sdk.io.astra.db.options.AstraDbReadOptions;
14 | import org.apache.beam.sdk.io.astra.db.transforms.RunCqlQueryFn;
15 | import org.apache.beam.sdk.io.astra.db.utils.AstraSecureConnectBundleUtils;
16 | import org.apache.beam.sdk.options.Description;
17 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
18 | import org.apache.beam.sdk.options.Validation;
19 | import org.apache.beam.sdk.transforms.DoFn;
20 | import org.apache.beam.sdk.transforms.ParDo;
21 |
22 | import java.util.Arrays;
23 | import java.util.stream.Collectors;
24 |
25 | /**
26 | * Create Embeddings
27 | */
28 | @Slf4j
29 | public class GenAI_02_CreateEmbeddings {
30 |
31 | /**
32 | * Flow Interface
33 | */
34 | public interface CreateEmbeddingsOptions extends AstraDbReadOptions {
35 |
36 | @Validation.Required
37 | @Description("Path of file to read from")
38 | String getOpenAiKey();
39 |
40 | @SuppressWarnings("unused")
41 | void setOpenAiKey(String key);
42 | }
43 |
44 | /**
45 | * Main execution
46 | */
47 | public static void main(String[] args) {
48 | // Parse and Validate Parameters
49 | CreateEmbeddingsOptions options = PipelineOptionsFactory
50 | .fromArgs(args).withValidation()
51 | .as(CreateEmbeddingsOptions.class);
52 |
53 | // Load Secure Bundle from Local File System
54 | byte[] scbZip = AstraSecureConnectBundleUtils
55 | .loadFromFilePath(options.getAstraSecureConnectBundle());
56 |
57 | long top = System.currentTimeMillis();
58 | try {
59 | log.info("Parameters validations is successful, launching pipeline");
60 | Pipeline genaiPipeline = Pipeline.create(options);
61 |
62 | // Read the table AS-IS
63 | genaiPipeline.apply("Read Table", AstraDbIO.read()
64 | .withToken(options.getAstraToken())
65 | .withKeyspace(options.getAstraKeyspace())
66 | .withSecureConnectBundle(scbZip)
67 | .withTable(options.getTable())
68 | .withCoder(SerializableCoder.of(FableDto.class))
69 | .withMapperFactoryFn(new SimpleFableDbMapper())
70 | .withEntity(FableDto.class))
71 |
72 | // Alter table to add Vector
73 | .apply("Alter Table to add the vector capability",
74 | new RunCqlQueryFn<>(options.getAstraToken(),
75 | scbZip, options.getAstraKeyspace(),
76 | Fable.cqlAlterTableForVectorSearch1(options.getAstraKeyspace(),
77 | "fable", 1536)))
78 |
79 | .apply("Alter Table to add the vector capability",
80 | new RunCqlQueryFn<>(options.getAstraToken(),
81 | scbZip, options.getAstraKeyspace(),
82 | Fable.cqlAlterTableForVectorSearch2(options.getAstraKeyspace(),
83 | "fable")))
84 |
85 | // Create Index on the table
86 | .apply("Alter Table to add the vector capability",
87 | new RunCqlQueryFn<>(options.getAstraToken(),
88 | scbZip, options.getAstraKeyspace(),
89 | Fable.cqlCreateIndexForVectorSearch(options.getAstraKeyspace(), "fable")))
90 |
91 | // Open AI Enrichment
92 | .apply("Embeddings transform embeddings",
93 | ParDo.of(new TransformEmbeddingFn(options.getOpenAiKey())))
94 |
95 | // Insert Results Into Astra
96 | .apply("Write Into Astra", AstraDbIO.write()
97 | .withToken(options.getAstraToken())
98 | .withSecureConnectBundle(scbZip)
99 | .withKeyspace(options.getAstraKeyspace())
100 | .withMapperFactoryFn(new FableDaoMapperFactoryFn())
101 | .withEntity(FableDto.class));
102 |
103 | genaiPipeline.run().waitUntilFinish();
104 |
105 | } finally {
106 | log.info("Pipeline finished in {} millis", System.currentTimeMillis()-top);
107 | AstraDbIO.close();
108 | }
109 | }
110 |
111 | private static class TransformEmbeddingFn extends DoFn {
112 |
113 | private String openApiKey;
114 |
115 | public TransformEmbeddingFn(String openApiKey) {
116 | this.openApiKey = openApiKey;
117 | }
118 | @ProcessElement
119 | public void processElement(@Element FableDto row, OutputReceiver receiver) {
120 | FableDto f = new FableDto();
121 | f.setDocument(row.getDocument());
122 | f.setDocumentId(row.getDocumentId());
123 | f.setTitle(row.getTitle());
124 | // Metadata
125 | f.setMetadata("{ \"processing\": \"openai\" }");
126 | // Random List
127 | /*
128 | List floatList = IntStream.range(0, 1536)
129 | .mapToObj(i -> new Random().nextFloat())
130 | .collect(Collectors.toList());
131 |
132 | */
133 | EmbeddingRequest request = EmbeddingRequest.builder()
134 | .model("text-embedding-ada-002")
135 | .input(Arrays.asList(row.getDocument()))
136 | .build();
137 | // only one request sent
138 | f.setVector(new OpenAiService(openApiKey).createEmbeddings(request)
139 | .getData().get(0)
140 | .getEmbedding().stream()
141 | .map(e -> e.floatValue())
142 | .collect(Collectors.toList()));
143 | log.info("Vector {}", f.getVector());
144 | receiver.output(f);
145 | }
146 | }
147 |
148 | }
149 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 | com.datastax.astra
7 | astra-beam-starter
8 | Workshop Beam
9 | 1.0-SNAPSHOT
10 | pom
11 |
12 |
13 | samples-beam
14 | samples-dataflow
15 |
16 |
17 |
18 | UTF-8
19 | 11
20 | 11
21 |
22 | 2.48.0
23 |
24 | 4.16.3
25 | 2.0.7
26 | 1.4.8
27 |
28 |
29 | 3.7.0
30 | 1.6.0
31 | 3.3.0
32 | 3.4.1
33 |
34 |
35 |
36 |
37 | apache.snapshots
38 | Apache Development Snapshot Repository
39 | https://repository.apache.org/content/repositories/snapshots/
40 |
41 | false
42 |
43 |
44 | true
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 | org.apache.beam
55 | beam-sdks-java-google-cloud-platform-bom
56 | ${beam.version}
57 | pom
58 | import
59 |
60 |
61 |
73 |
74 |
86 |
87 |
88 |
89 | com.datastax.astra
90 | beam-sdks-java-io-astra
91 | ${astra.beam-sdk.version}
92 |
93 |
94 |
95 | org.apache.beam
96 | beam-sdks-java-io-cassandra
97 | ${beam.version}
98 |
99 |
100 |
101 |
102 | org.slf4j
103 | slf4j-api
104 | ${slf4j.version}
105 |
106 |
107 | ch.qos.logback
108 | logback-classic
109 | ${logback.version}
110 | runtime
111 |
112 |
113 | ch.qos.logback
114 | logback-core
115 | ${logback.version}
116 | runtime
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 | org.apache.maven.plugins
127 | maven-compiler-plugin
128 | ${maven-compiler-plugin.version}
129 |
130 | ${maven.plugin.compiler.source}
131 | ${maven.plugin.compiler.target}
132 |
133 |
134 |
135 |
136 | org.apache.maven.plugins
137 | maven-enforcer-plugin
138 | ${maven-enforcer-plugin.version}
139 |
140 |
141 | enforce-maven
142 |
143 | enforce
144 |
145 |
146 |
147 |
148 | 3.0.5
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 | org.apache.maven.plugins
158 | maven-shade-plugin
159 | ${maven-shade-plugin.version}
160 |
161 |
162 | package
163 |
164 | shade
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 | *:*
173 |
174 | META-INF/*.SF
175 | META-INF/*.DSA
176 | META-INF/*.RSA
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 | org.codehaus.mojo
191 | exec-maven-plugin
192 | ${maven-exec-plugin.version}
193 |
194 | false
195 |
196 |
197 |
198 |
199 |
200 |
201 | https://github.com/datastax-examples/astra-dataflow-starter
202 |
203 |
204 | DataStax
205 | http://datastax.com
206 |
207 |
208 |
209 |
210 | Apache 2
211 | http://www.apache.org/licenses/LICENSE-2.0.html
212 | repo
213 |
214 |
215 |
216 |
217 |
--------------------------------------------------------------------------------
/samples-dataflow/src/main/java/com/datastax/astra/dataflow/AstraDb_To_BigQuery_Dynamic.java:
--------------------------------------------------------------------------------
1 | package com.datastax.astra.dataflow;
2 |
3 | import com.datastax.astra.dataflow.transforms.CassandraToBigQuerySchemaMapperFn;
4 | import com.datastax.astra.dataflow.utils.GoogleSecretManagerUtils;
5 | import com.datastax.oss.driver.api.core.CqlSession;
6 | import com.google.api.services.bigquery.model.TableReference;
7 | import com.google.api.services.bigquery.model.TableSchema;
8 | import com.google.cloud.bigquery.BigQuery;
9 | import com.google.cloud.bigquery.BigQueryOptions;
10 | import com.google.cloud.bigquery.DatasetId;
11 | import com.google.cloud.bigquery.DatasetInfo;
12 | import org.apache.beam.sdk.Pipeline;
13 | import org.apache.beam.sdk.coders.SerializableCoder;
14 | import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
15 | import org.apache.beam.sdk.io.astra.db.AstraDbIO;
16 | import org.apache.beam.sdk.io.astra.db.CqlSessionHolder;
17 | import org.apache.beam.sdk.io.astra.db.mapping.AstraDbMapper;
18 | import org.apache.beam.sdk.io.astra.db.mapping.BeamRowDbMapperFactoryFn;
19 | import org.apache.beam.sdk.io.astra.db.options.AstraDbReadOptions;
20 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
21 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils;
22 | import org.apache.beam.sdk.options.Description;
23 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
24 | import org.apache.beam.sdk.transforms.DoFn;
25 | import org.apache.beam.sdk.transforms.ParDo;
26 | import org.apache.beam.sdk.transforms.SerializableFunction;
27 | import org.apache.beam.sdk.values.Row;
28 | import org.slf4j.Logger;
29 | import org.slf4j.LoggerFactory;
30 |
31 | import java.io.Serializable;
32 |
33 | /**
34 | * Copy a Cassandra Table in BiQuery.
35 |
36 | export ASTRA_SECRET_TOKEN=projects/747469159044/secrets/astra-token/versions/2
37 | export ASTRA_SECRET_SECURE_BUNDLE=projects/747469159044/secrets/secure-connect-bundle-demo/versions/2
38 | export ASTRA_KEYSPACE=samples_dataflow
39 | export ASTRA_TABLE=languages
40 |
41 | export GCP_PROJECT_ID=integrations-379317
42 | export GCP_BIGQUERY_DATASET=dataflow_input_us
43 | export GCP_BIGQUERY_TABLE=fable
44 |
45 | mvn compile exec:java \
46 | -Dexec.mainClass=com.datastax.astra.dataflow.AstraDb_To_BigQuery_Dynamic \
47 | -Dexec.args="\
48 | --astraToken=${ASTRA_SECRET_TOKEN} \
49 | --astraSecureConnectBundle=${ASTRA_SECRET_SECURE_BUNDLE} \
50 | --astraKeyspace=samples_dataflow \
51 | --table=${ASTRA_TABLE} \
52 | --runner=DataflowRunner \
53 | --project=${GCP_PROJECT_ID} \
54 | --region=us-central1"
55 |
56 | */
57 | public class AstraDb_To_BigQuery_Dynamic {
58 |
59 | /**
60 | * Logger for the class.
61 | */
62 | private static final Logger LOGGER = LoggerFactory.getLogger(AstraDb_To_BigQuery_Dynamic.class);
63 |
64 | /**
65 | * Flow Interface
66 | * RefTableSet = :.
67 | */
68 | public interface AstraDbToBigQueryOptions extends AstraDbReadOptions, GcpOptions {
69 |
70 | @Description("BigQuery dataset name, if not provided will be set to Keyspace")
71 | String getBigQueryDataset();
72 |
73 | @SuppressWarnings("unused")
74 | void setBigQueryDataset(String dataset);
75 |
76 | @Description("BigQuery table name, if not provided will be set to Cassandra Table name")
77 | String getBigQueryTable();
78 |
79 | @SuppressWarnings("unused")
80 | void setBigQueryTable(String table);
81 | }
82 |
83 | /**
84 | * Main.
85 | */
86 | public static void main(String[] args) {
87 |
88 | try {
89 |
90 | /*
91 | * Pipeline Parsing.
92 | * BigQuery Dataset and Table have been made optional and
93 | * will be set to Keyspace and Table if not provided.
94 | */
95 | AstraDbToBigQueryOptions options = PipelineOptionsFactory
96 | .fromArgs(args).withValidation()
97 | .as(AstraDbToBigQueryOptions.class);
98 | LOGGER.info("Pipeline Validated");
99 | String bigQueryDataset = options.getBigQueryDataset();
100 | if (bigQueryDataset == null || "".equals(bigQueryDataset)) {
101 | bigQueryDataset = options.getAstraKeyspace();
102 | }
103 | LOGGER.info("Big Query dataset set to {}", bigQueryDataset);
104 | String bigQueryTable = options.getBigQueryTable();
105 | if (bigQueryTable == null || "".equals(bigQueryTable)) {
106 | bigQueryTable = options.getTable();
107 | }
108 | LOGGER.info("Big Query table set to {}", bigQueryTable);
109 |
110 | /*
111 | * Astra Credentials as stored in Google Secrets.
112 | * - astraToken could be provided as a String in the template (simpler ?)
113 | * - astraSecureBundle is a file (binary), could it be provided as template input ?
114 | */
115 | String astraToken = GoogleSecretManagerUtils.
116 | readTokenSecret(options.getAstraToken());
117 | byte[] astraSecureBundle = GoogleSecretManagerUtils.
118 | readSecureBundleSecret(options.getAstraSecureConnectBundle());
119 | LOGGER.info("Astra Credentials retrieved from Google Secrets");
120 |
121 | /*
122 | * If DataSet does not exist, creating DataSet
123 | * in same region as the worker.
124 | */
125 | BigQuery bigquery = BigQueryOptions.newBuilder()
126 | .setProjectId(options.getProject()).build().getService();
127 | if (bigquery.getDataset(DatasetId.of(options.getProject(), bigQueryDataset)) == null) {
128 | LOGGER.info("Dataset was not found: creating DataSet {} in region {}",
129 | bigQueryDataset, options.getWorkerRegion());
130 | bigquery.create(DatasetInfo.newBuilder(bigQueryDataset)
131 | .setLocation(options.getWorkerRegion())
132 | .build());
133 | LOGGER.info("Dataset Creation [OK]");
134 | }
135 |
136 | /*
137 | * Generic Serializer Cassandra Record => Beam Row.
138 | * - Most types are supported except UDT and Tuple.
139 | * - The mapper is created dynamically based on the table schema.
140 | * - List, Set are converted to ARRAY
141 | * - Columns part of the primary are not REQUIRED.
142 | */
143 | SerializableFunction> beamRowMapperFactory =
144 | new BeamRowDbMapperFactoryFn(options.getAstraKeyspace(), options.getTable());
145 |
146 | // Mapper Cassandra Table => BigQuery Schema
147 | SerializableFunction, TableSchema> bigQuerySchemaFactory =
148 | new CassandraToBigQuerySchemaMapperFn(options.getAstraKeyspace(), options.getTable());
149 | LOGGER.info("Serializer initializations [OK]");
150 |
151 | // Source: AstraDb
152 | AstraDbIO.Read astraSource = AstraDbIO.read()
153 | .withToken(astraToken)
154 | .withSecureConnectBundle(astraSecureBundle)
155 | .withKeyspace(options.getAstraKeyspace())
156 | .withTable(options.getTable())
157 | .withMinNumberOfSplits(5)
158 | .withMapperFactoryFn(beamRowMapperFactory)
159 | .withCoder(SerializableCoder.of(Row.class))
160 | .withEntity(Row.class);
161 | LOGGER.info("AstraDb Source initialization [OK]");
162 |
163 | // Sink: BigQuery
164 | BigQueryIO.Write bigQuerySink = BigQueryIO.write()
165 | .to(new TableReference()
166 | .setProjectId(options.getProject())
167 | .setDatasetId(bigQueryDataset)
168 | .setTableId(bigQueryTable))
169 | // Specialized function reading cassandra source table and mapping to BigQuery Schema
170 | .withSchema(bigQuerySchemaFactory.apply(astraSource))
171 | // Provided by google, convert a Beam Row to a BigQuery TableRow
172 | .withFormatFunction(BigQueryUtils::toTableRow)
173 | // Table Will be created if not exist
174 | .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
175 | .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND);
176 | LOGGER.info("BigQuery Sink initialization [OK]");
177 |
178 | // Run Pipeline
179 | Pipeline astraDbToBigQueryPipeline = Pipeline.create(options);
180 | astraDbToBigQueryPipeline
181 | .apply("Read From Astra", astraSource)
182 | .apply("Show", ParDo.of(new Slf4jBeamRowLoggerFn(LOGGER))).setCoder(SerializableCoder.of(Row.class)) // DEBUG
183 | .apply("Write To BigQuery", bigQuerySink);
184 | astraDbToBigQueryPipeline.run().waitUntilFinish();
185 |
186 | } finally {
187 | CqlSessionHolder.cleanup();
188 | }
189 | }
190 |
191 | /**
192 | * Was relevant to debug the pipeline and mapping
193 | */
194 | public static class Slf4jBeamRowLoggerFn extends DoFn implements Serializable {
195 |
196 | /** Logger of current Pipeline. */
197 | private Logger logger;
198 |
199 | /**
200 | * Constructor.
201 | *
202 | * @param log
203 | * current logger
204 | */
205 | public Slf4jBeamRowLoggerFn(Logger log) {
206 | this.logger = log;
207 | }
208 |
209 | @ProcessElement
210 | public void processElement(ProcessContext c) {
211 | if (LOGGER.isDebugEnabled()) {
212 | LOGGER.debug("Row: {}", c.element().toString());
213 | }
214 | c.output(c.element());
215 | }
216 | }
217 |
218 |
219 | }
220 |
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Workshop Apache Beam and Google DataFlow
3 |
4 | [](https://gitpod.io/#https://github.com/datastaxdevs/workshop-beam)
5 | [](http://www.apache.org/licenses/LICENSE-2.0)
6 | [](https://discord.com/widget?id=685554030159593522&theme=dark)
7 |
8 |
9 | ## 📋 Table of content
10 |
11 |
12 |
13 | [**HouseKeeping**](#housekeeping)
14 | - [Objectives](#objectives)
15 | - [Frequently asked questions](#frequently-asked-questions)
16 | - [Materials for the Session](#materials-for-the-session)
17 |
18 | [**LAB**](#1-database-initialization)
19 | - [01. Create Astra Account](#-1---create-your-datastax-astra-account)
20 | - [02. Create Astra Token](#-2---create-an-astra-token)
21 | - [03. Copy the token](#-3---copy-the-token-value-in-your-clipboard)
22 | - [04. Open Gitpod](#-4---open-gitpod)
23 | - [05. Setup CLI](#-5---set-up-the-cli-with-your-token)
24 | - [06. Create Database](#-6---create-destination-database-and-a-keyspace)
25 | - [07. Create Destination Table](#-7---create-destination-table)
26 | - [08. Setup env variables](#-8---setup-env-variables)
27 | - [09. Setup Project](#-9---setup-project)
28 | - [10. Run Importing Flow](#-10---run-importing-flow)
29 | - [11. Validate Data](#-11---validate-data)
30 |
31 | [**WalkThrough**](#walkthrough)
32 | - [01. Compute Embeddings](#-1-run-flow-compute)
33 | - [02. Show results](#-2-validate-output)
34 | - [03. Create Google Project](#-3-create-google-project)
35 | - [04. Enable project Billing](#-4-enable-billing)
36 | - [05. Save Project Id](#-5-save-project-id)
37 | - [06. Install gcloud CLI](#-6-download-and-install-gcoud-cli)
38 | - [07. Authenticate Against Google Cloud](#-7-authenticate-with-google-cloud)
39 | - [08. Select your project](#-8-set-your-project-)
40 | - [09. Enable Needed Apis](#-9-enable-needed-api)
41 | - [10. Setup Dataflow user](#-10-add-roles-to-dataflow-users)
42 | - [11. Create Secret](#11----create-secrets-for-the-project-in-secret-manager)
43 | - [12. Move in proper folder](#-12-make-sure-you-are-in-samples-dataflow-folder)
44 | - [13. Setup env var](#13--make-sure-you-have-those-variables-initialized)
45 | - [14. Run the pipeline](#14----run-the-pipeline)
46 | - [15. show Content of Table](#15----show-the-content-of-the-table)
47 |
48 | ----
49 | ## HouseKeeping
50 |
51 | ### Objectives
52 |
53 | * Introduce AstraDB and Vector Search capability
54 | * Give you a first understanding of Apache Beam and Google DataFlow
55 | * Discover NoSQL distributed databases and especially Apache Cassandra™.
56 | * Getting familiar with a few Google Cloud Platform services
57 |
58 | ### Frequently asked questions
59 |
60 |
61 |
62 | 1️⃣ Can I run this workshop on my computer?
63 |
64 | There is nothing preventing you from running the workshop on your own machine, If you do so, you will need the following
65 |
66 | git installed on your local system
67 | Java installed on your local system
68 | Maven installed on your local system
69 |
70 |
71 | In this readme, we try to provide instructions for local development as well - but keep in mind that the main focus is development on Gitpod, hence We can't guarantee live support about local development in order to keep on track with the schedule. However, we will do our best to give you the info you need to succeed.
72 |
73 |
74 |
75 | 2️⃣ What other prerequisites are required?
76 |
77 |
78 | You will need enough *real estate* on screen, we will ask you to open a few windows and it does not fit mobiles (tablets should be OK)
79 | You will need a GitHub account and possibly a Google account for the Google Authentication (optional)
80 | You will need an Astra account: don't worry, we'll work through that in the following
81 | As Intermediate level we expect you to know what java and maven are
82 |
83 |
84 |
85 |
86 |
87 | 3️⃣ Do I need to pay for anything for this workshop?
88 |
89 | No. All tools and services we provide here are FREE. FREE not only during the session but also after.
90 |
91 |
92 |
93 | 4️⃣ Will I get a certificate if I attend this workshop?
94 |
95 | Attending the session is not enough. You need to complete the homeworks detailed below and you will get a nice badge that you can share on linkedin or anywhere else *(open api badge)*
96 |
97 |
98 |
99 | ### Materials for the Session
100 |
101 | It doesn't matter if you join our workshop live or you prefer to work at your own pace,
102 | we have you covered. In this repository, you'll find everything you need for this workshop:
103 |
104 | - [Slide deck](/slides/slides.pdf)
105 | - [Discord chat](https://dtsx.io/discord)
106 |
107 | ----
108 |
109 | ## LAB
110 |
111 | #### ✅ `1` - Create your DataStax Astra account
112 |
113 | > ℹ️ Account creation tutorial is available in [awesome astra](https://awesome-astra.github.io/docs/pages/astra/create-account/)
114 |
115 |
116 | _click the image below or go to [https://astra.datastax.com](https://bit.ly/3QxhO6t)_
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 | #### ✅ `2` - Create an Astra Token
125 |
126 | > ℹ️ Token creation tutorial is available in [awesome astra](https://awesome-astra.github.io/docs/pages/astra/create-token/#c-procedure)
127 |
128 | - Locate `Settings` (#1) in the menu on the left, then `Token Management` (#2)
129 |
130 | - Select the role `Organization Administrator` before clicking `[Generate Token]`
131 |
132 | 
133 |
134 | The Token is in fact three separate strings: a `Client ID`, a `Client Secret` and the `token` proper. You will need some of these strings to access the database, depending on the type of access you plan. Although the Client ID, strictly speaking, is not a secret, you should regard this whole object as a secret and make sure not to share it inadvertently (e.g. committing it to a Git repository) as it grants access to your databases.
135 |
136 | ```json
137 | {
138 | "ClientId": "ROkiiDZdvPOvHRSgoZtyAapp",
139 | "ClientSecret": "fakedfaked",
140 | "Token":"AstraCS:fake"
141 | }
142 | ```
143 |
144 | #### ✅ `3` - Copy the token value in your clipboard
145 |
146 | You can also leave the window open to copy the value in a second.
147 |
148 | #### ✅ `4` - Open Gitpod
149 |
150 | >
151 | > ↗️ _Right Click and select open as a new Tab..._
152 | >
153 | > [](https://gitpod.io/#https://github.com/datastaxdevs/workshop-beam)
154 | >
155 |
156 |
157 | 
158 |
159 |
160 | #### ✅ `5` - Set up the CLI with your token
161 |
162 | _In gitpod, in a terminal window:_
163 |
164 | - Login
165 |
166 | ```bash
167 | astra login --token AstraCS:fake
168 | ```
169 |
170 | - Validate you are set up
171 |
172 | ```bash
173 | astra org
174 | ```
175 |
176 | > **Output**
177 | > ```
178 | > gitpod /workspace/workshop-beam (main) $ astra org
179 | > +----------------+-----------------------------------------+
180 | > | Attribute | Value |
181 | > +----------------+-----------------------------------------+
182 | > | Name | cedrick.lunven@datastax.com |
183 | > | id | f9460f14-9879-4ebe-83f2-48d3f3dce13c |
184 | > +----------------+-----------------------------------------+
185 | > ```
186 |
187 |
188 | #### ✅ `6` - Create destination Database and a keyspace
189 |
190 | > ℹ️ You can notice we enabled the Vector Search capability
191 |
192 | - Create db `workshop_beam` and wait for the DB to become active
193 |
194 | ```
195 | astra db create workshop_beam -k beam --vector --if-not-exists
196 | ```
197 |
198 | > 💻 Output
199 | >
200 | > 
201 |
202 | - List databases
203 |
204 | ```
205 | astra db list
206 | ```
207 |
208 | > 💻 Output
209 | >
210 | > 
211 |
212 | - Describe your db
213 |
214 | ```
215 | astra db describe workshop_beam
216 | ```
217 |
218 | > 💻 Output
219 | >
220 | > 
221 |
222 | #### ✅ `7` - Create Destination table
223 |
224 | - Create Table:
225 |
226 | ```bash
227 | astra db cqlsh workshop_beam -k beam \
228 | -e "CREATE TABLE IF NOT EXISTS fable(document_id TEXT PRIMARY KEY, title TEXT, document TEXT)"
229 | ```
230 |
231 | - Show Table:
232 |
233 | ```bash
234 | astra db cqlsh workshop_beam -k beam -e "SELECT * FROM fable"
235 | ```
236 |
237 | #### ✅ `8` - Setup env variables
238 |
239 | - Create `.env` file with variables
240 |
241 | ```bash
242 | astra db create-dotenv workshop_beam
243 | ```
244 |
245 | - Display the file
246 |
247 | ```bash
248 | cat .env
249 | ```
250 |
251 | - Load env variables
252 |
253 | ```
254 | set -a
255 | source .env
256 | set +a
257 | env | grep ASTRA
258 | ```
259 |
260 | #### ✅ `9` - Setup project
261 |
262 | This command allows you to validate that Java, Maven and Lombok are working as expected
263 |
264 | ```
265 | mvn clean compile
266 | ```
267 |
268 | #### ✅ `10` - Run Importing flow
269 |
270 | - Open the CSV. It is very short and simple for demo purposes (and to keep OpenAI API costs low later :) ).
271 |
272 | ```bash
273 | /workspace/workshop-beam/samples-beam/src/main/resources/fables_of_fontaine.csv
274 | ```
275 |
276 | - Open the Java file with the code
277 |
278 | ```bash
279 | gp open /workspace/workshop-beam/samples-beam/src/main/java/com/datastax/astra/beam/genai/GenAI_01_ImportData.java
280 | ```
281 |
282 | 
283 |
284 |
285 | - Run the Flow
286 |
287 | ```
288 | cd samples-beam
289 | mvn clean compile exec:java \
290 | -Dexec.mainClass=com.datastax.astra.beam.genai.GenAI_01_ImportData \
291 | -Dexec.args="\
292 | --astraToken=${ASTRA_DB_APPLICATION_TOKEN} \
293 | --astraSecureConnectBundle=${ASTRA_DB_SECURE_BUNDLE_PATH} \
294 | --astraKeyspace=${ASTRA_DB_KEYSPACE} \
295 | --csvInput=`pwd`/src/main/resources/fables_of_fontaine.csv"
296 | ```
297 |
298 | #### ✅ `11` - Validate Data
299 |
300 | ```bash
301 | astra db cqlsh workshop_beam -k beam -e "SELECT * FROM fable"
302 | ```
303 |
304 | ----
305 |
306 | ## WalkThrough
307 |
308 | 
309 |
310 | We will now compute the embeddings leveraging OpenAI. It is not free, you need to provide your credit card to access the API. This part is a walkthrough. If you have an OpenAI key, follow along with me!
311 |
312 |
313 | - [Access OpenAI interface and create a key](https://platform.openai.com/account/api-keys)
314 |
315 | - [Learn more about the Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
316 |
317 | - [Learn more about the third party library in use](https://platform.openai.com/docs/libraries/community-libraries)
318 |
319 |
320 | #### ✅ `1` Run Flow Compute
321 |
322 | - Setup Open AI
323 |
324 | ```
325 | export OPENAI_API_KEY=""
326 | ```
327 |
328 | - Open the Java file with the code
329 |
330 | ```bash
331 | gp open /workspace/workshop-beam/samples-beam/src/main/java/com/datastax/astra/beam/genai/GenAI_02_CreateEmbeddings.java
332 | ```
333 |
334 | - Run the flow
335 | ```
336 | mvn clean compile exec:java \
337 | -Dexec.mainClass=com.datastax.astra.beam.genai.GenAI_02_CreateEmbeddings \
338 | -Dexec.args="\
339 | --astraToken=${ASTRA_DB_APPLICATION_TOKEN} \
340 | --astraSecureConnectBundle=${ASTRA_DB_SECURE_BUNDLE_PATH} \
341 | --astraKeyspace=${ASTRA_DB_KEYSPACE} \
342 | --openAiKey=${OPENAI_API_KEY} \
343 | --table=fable"
344 | ```
345 |
346 | #### ✅ `2` Validate Output
347 |
348 | ```bash
349 | astra db cqlsh workshop_beam -k beam -e "SELECT * FROM fable"
350 | ```
351 |
352 | #### ✅ `3` Create Google Project
353 |
354 | 
355 |
356 | - Create GCP Project
357 |
358 | > Note: If you don't plan to keep the resources that you create in this guide, create a project instead of selecting an existing project. After you finish these steps, you can delete the project, removing all resources associated with the project. Create a new Project in Google Cloud Console or select an existing one.
359 |
360 | In the Google Cloud console, on the project selector page, select or [create a Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects)
361 |
362 | #### ✅ `4` Enable Billing
363 |
364 | Make sure that billing is enabled for your Cloud project. Learn how to [check if billing is enabled on a project](https://cloud.google.com/billing/docs/how-to/verify-billing-enabled)
365 |
366 | #### ✅ `5` Save project ID:
367 |
368 | _The project identifier is available in the column `ID`. We will need it so let's save it as an environment variable_
369 |
370 | ```bash
371 | export GCP_PROJECT_ID=integrations-379317
372 | export GCP_PROJECT_CODE=747469159044
373 | export GCP_USER=cedrick.lunven@datastax.com
374 | export GCP_COMPUTE_ENGINE=747469159044-compute@developer.gserviceaccount.com
375 | ```
376 |
377 | #### ✅ `6` Download and install gCoud CLI
378 |
379 | ```
380 | curl https://sdk.cloud.google.com | bash
381 | ```
382 |
383 | Do not forget to open a new Tab.
384 |
385 | #### ✅ `7` Authenticate with Google Cloud
386 |
387 | Run the following command to authenticate with Google Cloud:
388 |
389 | - Execute:
390 |
391 | ```
392 | gcloud auth login
393 | ```
394 |
395 | - Authenticate as your google Account
396 |
397 |
398 | #### ✅ `8` Set your project
399 |
400 | If you haven't set your project yet, use the following command to set your project ID:
401 |
402 | ```
403 | gcloud config set project ${GCP_PROJECT_ID}
404 | gcloud projects describe ${GCP_PROJECT_ID}
405 | ```
406 |
407 | #### ✅ `9` Enable needed API
408 |
409 | ```
410 | gcloud services enable dataflow compute_component \
411 | logging storage_component storage_api \
412 | bigquery pubsub datastore.googleapis.com \
413 | cloudresourcemanager.googleapis.com
414 | ```
415 |
416 | #### ✅ `10` Add Roles to `dataflow` users
417 |
418 | To complete the steps, your user account must have the Dataflow Admin role and the Service Account User role. The Compute Engine default service account must have the Dataflow Worker role. To add the required roles in the Google Cloud console:
419 |
420 | ```
421 | gcloud projects add-iam-policy-binding ${GCP_PROJECT_ID} \
422 | --member="user:${GCP_USER}" \
423 | --role=roles/iam.serviceAccountUser
424 | gcloud projects add-iam-policy-binding ${GCP_PROJECT_ID} \
425 | --member="serviceAccount:${GCP_COMPUTE_ENGINE}" \
426 | --role=roles/dataflow.admin
427 | gcloud projects add-iam-policy-binding ${GCP_PROJECT_ID} \
428 | --member="serviceAccount:${GCP_COMPUTE_ENGINE}" \
429 | --role=roles/dataflow.worker
430 | gcloud projects add-iam-policy-binding ${GCP_PROJECT_ID} \
431 | --member="serviceAccount:${GCP_COMPUTE_ENGINE}" \
432 | --role=roles/storage.objectAdmin
433 | ```
434 |
435 | #### `11` - ✅ [Create secrets for the project in secret manager](https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets#secretmanager-create-secret-gcloud).
436 |
437 | To connect to `AstraDB` you need a token (credentials) and a zip used to secure the transport. Those two inputs should be defined as _secrets_.
438 |
439 | ```
440 | gcloud secrets create astra-token \
441 | --data-file <(echo -n "${ASTRA_TOKEN}") \
442 | --replication-policy="automatic"
443 |
444 | gcloud secrets create cedrick-demo-scb \
445 | --data-file ${ASTRA_SCB_PATH} \
446 | --replication-policy="automatic"
447 |
448 | gcloud secrets add-iam-policy-binding cedrick-demo-scb \
449 | --member="serviceAccount:${GCP_COMPUTE_ENGINE}" \
450 | --role='roles/secretmanager.secretAccessor'
451 |
452 | gcloud secrets add-iam-policy-binding astra-token \
453 | --member="serviceAccount:${GCP_COMPUTE_ENGINE}" \
454 | --role='roles/secretmanager.secretAccessor'
455 |
456 | gcloud secrets list
457 | ```
458 |
459 | #### ✅ `12` Make sure you are in `samples-dataflow` folder
460 |
461 | ```bash
462 | cd samples-dataflow
463 | pwd
464 | ```
465 |
466 | #### `13` ✅ Make sure you have those variables initialized
467 |
468 | We assume the table `languages` exists and has been populated in `3.1`
469 |
470 | ```bash
471 | export ASTRA_SECRET_TOKEN=projects/747469159044/secrets/astra-token/versions/2
472 | export ASTRA_SECRET_SECURE_BUNDLE=projects/747469159044/secrets/secure-connect-bundle-demo/versions/1
473 | ```
474 |
475 | #### `14` - ✅ Run the pipeline
476 |
477 | ```bash
478 | mvn compile exec:java \
479 | -Dexec.mainClass=com.datastax.astra.dataflow.AstraDb_To_BigQuery_Dynamic \
480 | -Dexec.args="\
481 | --astraToken=${ASTRA_SECRET_TOKEN} \
482 | --astraSecureConnectBundle=${ASTRA_SECRET_SECURE_BUNDLE} \
483 | --astraKeyspace=${ASTRA_KEYSPACE} \
484 | --table=fable \
485 | --runner=DataflowRunner \
486 | --project=${GCP_PROJECT_ID} \
487 | --region=us-central1"
488 | ```
489 |
490 | #### `15` - ✅ Show the Content of the Table
491 |
492 | A dataset with the keyspace name and a table
493 | with the table name have been created in BigQuery.
494 |
495 | ```bash
496 | bq head -n 10 ${ASTRA_KEYSPACE}.${ASTRA_TABLE}
497 | ```
498 |
499 |
500 | ----
501 | The END
502 |
--------------------------------------------------------------------------------