├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle └── src ├── main └── java │ └── com │ └── google │ └── cloud │ └── healthcare │ ├── ImportPipeline.java │ ├── PipelineRunner.java │ ├── config │ ├── CsvConfiguration.java │ └── GcpConfiguration.java │ ├── decompress │ ├── BaseHandler.java │ ├── CompressionAlgorithm.java │ ├── DecompressHandler.java │ ├── Decompressor.java │ ├── GZipHandler.java │ ├── LZ4Handler.java │ ├── TarHandler.java │ └── ZipHandler.java │ ├── io │ ├── ByteReader.java │ ├── GcsInputReader.java │ ├── GcsOutputWriter.java │ ├── GcsOutputWriterFactory.java │ ├── InputReader.java │ ├── OutputWriter.java │ └── OutputWriterFactory.java │ ├── process │ ├── pipeline │ │ ├── BigQueryDestinations.java │ │ ├── FillTableRowFn.java │ │ ├── GcsReadChunksFn.java │ │ └── csv │ │ │ ├── CsvDetectSchemaFn.java │ │ │ ├── CsvExtractHeadersFn.java │ │ │ ├── CsvMergeSchemaFn.java │ │ │ ├── CsvParseDataFn.java │ │ │ ├── CsvSplitFn.java │ │ │ ├── GcsSplitCsvFn.java │ │ │ ├── QuoteType.java │ │ │ └── advance │ │ │ ├── CsvParseDataAdvanceFn.java │ │ │ └── GcsSplitCsvAdvanceFn.java │ └── schema │ │ ├── FieldType.java │ │ ├── GcpUtil.java │ │ └── SchemaUtil.java │ └── util │ ├── PrettyPrinter.java │ └── StringUtil.java └── test ├── java └── com │ └── google │ └── cloud │ └── healthcare │ ├── config │ └── CsvConfigurationTest.java │ ├── decompress │ ├── DecompressorTest.java │ ├── LZ4HandlerTest.java │ ├── TarHandlerTest.java │ └── ZipHandlerTest.java │ ├── io │ ├── ByteReaderTest.java │ └── GcsOutputWriterTest.java │ ├── process │ ├── pipeline │ │ ├── GcsReadChunksFnTest.java │ │ └── csv │ │ │ ├── CsvDetectSchemaFnTest.java │ │ │ ├── CsvExtractHeadersFnTest.java │ │ │ ├── CsvMergeSchemaFnTest.java │ │ │ ├── CsvParseDataFnTest.java │ │ │ ├── FillTableRowFnTest.java │ │ │ ├── GcsSplitCsvFnTest.java │ │ │ └── advance │ │ │ ├── CsvParseDataAdvanceFnTest.java │ │ │ └── GcsSplitCsvAdvanceFnTest.java │ └── schema │ │ ├── FieldTypeTest.java │ │ └── SchemaUtilTest.java │ └── util │ └── StringUtilTest.java └── resources ├── test_input_advance_split.csv ├── test_input_all_lines_have_new_lines.csv ├── test_input_invalid_headers.csv ├── test_input_mixed_quotes.csv ├── test_input_no_split.csv ├── test_input_no_split2.csv ├── test_input_parse.csv ├── test_input_parse2.csv ├── test_input_valid_headers1.csv ├── test_input_valid_headers2.csv ├── test_input_with_quotes.csv └── test_input_without_quotes.csv /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .gradle/ 3 | out/ 4 | **/*.iml 5 | build/ 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 
13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows 28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigQuery Data Importer 2 | 3 | The purpose of this tool is to import raw CSV (or CSV-like) data in 4 | [GCS](https://cloud.google.com/storage/) to 5 | [BigQuery](https://cloud.google.com/bigquery/). 6 | 7 | At times the autodetect mode in BigQuery fails to detect the expected schema of 8 | the source data, in which case the tool has to iterate over all the data to 9 | determine the correct one. 10 | 11 | This tool first decompresses the source file if necessary, then attempts 12 | to split (see details below) the source file into multiple small chunks, 13 | infers a schema for each chunk, and finally merges all the schemas into one. With 14 | the help of the schema, the source data is converted to 15 | [AVRO](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro) 16 | format to accelerate importing. 17 | 18 | [Dataflow](https://cloud.google.com/dataflow/) is used to parallelize the import 19 | process. 20 | 21 | ### File Split 22 | 23 | For large files, a series of preliminary split points is chosen by calculating 24 | the number of chunks to split into (based on an estimated chunk size of 2MB). 25 | From each preliminary split point, a search is initiated for a point which 26 | doesn't lie in the middle of a logical row (rows can span multiple lines). 27 | 28 | To tell whether a point is a valid split point, quotes on each line are 29 | categorized as opening (starting a quoted field), closing (finishing a 30 | quoted field) or unknown. The split points can then be determined by looking 31 | at the types of quotes on each line. For lines without quotes, the headers are 32 | used to assist. Each chunk is processed independently going forward. 33 | 34 | ### Advanced Mode 35 | 36 | Sometimes the files to import are not standard CSVs. For these cases an advanced mode is 37 | provided, in which two regular expressions describe how to split 38 | records and fields respectively; the tool uses these regular expressions to 39 | break the files into records and fields. 40 | 41 | Please see the Usage section for how to use the advanced mode. 42 | 43 | ## Usage 44 | 45 | ### Prerequisites 46 | 47 | * A GCP (Google Cloud Platform) project. 48 | * The GCS, BigQuery and Dataflow APIs are enabled. 
49 | * The runner (either an end user or, as recommended below, a service account) 50 | needs to have the following roles at the project level: 51 | - `roles/bigquery.dataViewer` 52 | - `roles/bigquery.jobUser` 53 | - `roles/bigquery.user` 54 | - `roles/compute.viewer` 55 | - `roles/dataflow.developer` 56 | * The Dataflow 57 | [controller service account](https://cloud.google.com/dataflow/docs/concepts/security-and-permissions#controller_service_account) 58 | needs `roles/storage.admin` on the temporary bucket (provided to the 59 | pipeline by flag `--temp_bucket`, see below). In addition, it needs 60 | `roles/bigquery.dataEditor` on the target BigQuery dataset. 61 | * Alternatively, you could use a customized controller service account 62 | `--dataflow_controller_service_account` (which has to be granted 63 | `roles/dataflow.worker`). In this case you only have to manage one 64 | service account. 65 | * [Google Cloud SDK](https://cloud.google.com/sdk/) is installed. 66 | * JDK 8+ is installed. 67 | * Gradle is installed. 68 | 69 | ### Import 70 | 71 | For security reasons, it is recommended to run this tool with a 72 | [service account](https://cloud.google.com/iam/docs/understanding-service-accounts). 73 | It is assumed that you have a service account configured and its JSON key 74 | downloaded to your disk; for how to do that, please follow the tutorial 75 | [here](https://cloud.google.com/iam/docs/creating-managing-service-accounts). 76 | 77 | All of the following commands should run in a console unless otherwise specified. 78 | 79 | * Switch the default project. Note you need to replace the project name with 80 | yours. You can skip this step if you only have one project. 81 | 82 | `gcloud config set project [PROJECT_ID]` 83 | 84 | * Run the import command. Note you need to replace the GCS URIs and BQ dataset 85 | with yours. 86 | 87 | ```shell 88 | ./gradlew run -PappArgs="[\ 89 | '--gcp_project_id', 'my-project',\ 90 | '--gcs_uri', 'gs://my-bucket/my-file.gz',\ 91 | '--bq_dataset', 'my-dataset',\ 92 | '--temp_bucket', 'my-temp-bucket',\ 93 | '--gcp_credentials', 'my-project-some-hash.json',\ 94 | '--dataflow_controller_service_account', 'my-dataflow@my-project-gserviceaccount.com',\ 95 | '--verbose', 'true' 96 | ]" 97 | ``` 98 | 99 | * Leave the command running. Now you can track the import progress on the 100 | [Dataflow tab](https://console.cloud.google.com/dataflow). 101 | 102 | #### Explanation of Arguments: 103 | 104 | * `--gcp_project_id`: The GCP project in which the pipeline will be running. 105 | * `--gcs_uri`: The URI of the input to import; it has to start with `gs://` 106 | since it is expected to be a GCS URI. 107 | * `--bq_dataset`: The BigQuery dataset to import the data into; the BigQuery 108 | tables are created automatically, named after the input files. 109 | * `--temp_bucket`: A GCS bucket to store temporary artifacts, for example 110 | decompressed data, compiled Cloud Dataflow pipeline code etc. The data will 111 | be removed after the pipeline finishes. 112 | * `--gcp_credentials`: Credentials of the service account; this can be 113 | downloaded from the console. Note that using a service account is strongly 114 | recommended. Please find the link on how to set up a service account at the 115 | beginning of this README. 116 | * `--dataflow_controller_service_account`: Optional. Sets the Cloud Dataflow 117 | service account which is used by the workers to access resources. Usually 118 | this is set to be the same as the service account created to run this 119 | pipeline. 
You don't have to set it if using the default GCE service account 120 | is desired, but make sure the default service account has access to the 121 | resources required to run the pipeline. 122 | * `--verbose`: Print verbose error messages for debugging; can be omitted. 123 | 124 | #### Custom CSV Options 125 | 126 | * `--csv_delimiter`: Character used to separate columns, typically ','. For 127 | multi-character delimiters (non-standard), use advanced mode. 128 | * `--csv_quote`: Quote character, typically '"' in standard CSV. 129 | * `--csv_record_separator`: Single or multiple characters to separate rows, 130 | typically CR, LF, or CRLF depending on the platform. 131 | 132 | #### Advanced Mode 133 | 134 | All regular expressions should conform to the 135 | [Java spec](https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html). 136 | 137 | * `--csv_delimiter_regex`: A regular expression used to separate columns. 138 | Usually written with lookahead and lookbehind groups (but this is not mandatory). For 139 | example, with "(?<=\\\\d{5,10}),(?=\\\\w{2})" the tool will break each row at 140 | matches of this regular expression, i.e. the comma itself will be stripped. 141 | * `--csv_record_separator_regex`: A regular expression used to separate 142 | records. Usually written with lookahead and lookbehind groups (but this is not 143 | mandatory). For example, with "\\r\\n(?=\\\\d{5})" the tool will set the split 144 | point after the newline characters. 145 | 146 | Note that in terms of splitting the files, advanced mode is typically slower 147 | than normal mode, and these options are not compatible with those of normal 148 | mode. A hypothetical invocation combining both regex flags is shown at the end of this README. 149 | 150 | ## Limitations 151 | 152 | * Right now this tool processes standard CSVs (i.e. files that follow 153 | [RFC4180](https://tools.ietf.org/html/rfc4180)) and CSV-like files which 154 | have meaningful record and field separators (meaning they can be written as 155 | regular expressions in advanced mode). 156 | * This tool takes one file at a time, but you can zip or tar multiple files 157 | for the tool to process. 158 | * All files are required to have headers, which will be used as column names 159 | in BigQuery; headers will be transformed into a format accepted by 160 | BigQuery if necessary. 161 | * The base names of the files are used as the table names in BigQuery, so make 162 | sure no files share the same name. 
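As referenced in the Advanced Mode section above, the sketch below shows a hypothetical advanced-mode invocation combining both regular expression flags with the standard arguments. All values (project, buckets, dataset, credentials file and the two patterns) are placeholders, and the exact backslash escaping of the patterns may vary with your shell:

```shell
./gradlew run -PappArgs="[\
'--gcp_project_id', 'my-project',\
'--gcs_uri', 'gs://my-bucket/my-file.csv',\
'--bq_dataset', 'my-dataset',\
'--temp_bucket', 'my-temp-bucket',\
'--gcp_credentials', 'my-project-some-hash.json',\
'--csv_delimiter_regex', '(?<=\\\\d{5}),(?=\\\\w{2})',\
'--csv_record_separator_regex', '\\r\\n(?=\\\\d{5})'\
]"
```

Note that `--csv_delimiter`, `--csv_quote` and `--csv_record_separator` are omitted here: the two regex flags replace them and, as noted above, the options of the two modes are not compatible.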
163 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | group 'com.google.cloud.healthcare' 2 | version '0.1.0' 3 | 4 | apply plugin: 'java' 5 | apply plugin: 'application' 6 | 7 | sourceCompatibility = 1.8 8 | 9 | repositories { 10 | mavenCentral() 11 | } 12 | 13 | run { 14 | if (project.hasProperty("appArgs")) { 15 | args Eval.me(appArgs) 16 | } 17 | } 18 | 19 | mainClassName = "com.google.cloud.healthcare.ImportPipeline" 20 | 21 | task copyTestResources(type: Copy) { 22 | from "${projectDir}/src/test/resources" 23 | into "${buildDir}/classes/test" 24 | } 25 | 26 | configurations.all { 27 | resolutionStrategy { 28 | force 'org.mockito:mockito-core:2.8.9' 29 | } 30 | } 31 | 32 | dependencies { 33 | implementation 'com.google.cloud:google-cloud-storage:1.51.0' 34 | implementation 'com.google.cloud:google-cloud-bigquery:1.51.0' 35 | implementation 'com.google.guava:guava:23.5-jre' 36 | implementation 'org.apache.commons:commons-compress:1.18' 37 | implementation 'org.apache.beam:beam-sdks-java-core:2.8.0' 38 | implementation 'org.apache.beam:beam-sdks-java-extensions-google-cloud-platform-core:2.8.0' 39 | implementation 'org.apache.avro:avro:1.8.2' 40 | implementation 'com.univocity:univocity-parsers:2.7.6' 41 | implementation 'org.apache.beam:beam-runners-google-cloud-dataflow-java:2.8.0' 42 | implementation 'org.apache.beam:beam-runners-direct-java:2.8.0' 43 | implementation 'commons-io:commons-io:2.6' 44 | implementation 'com.github.pcj:google-options:1.0.0' 45 | implementation 'org.slf4j:slf4j-jdk14:1.7.25' 46 | 47 | testImplementation 'junit:junit:4.12' 48 | testImplementation 'org.powermock:powermock-api-mockito2:1.7.4' 49 | testImplementation 'org.powermock:powermock-module-junit4:1.7.4' 50 | testImplementation 'org.hamcrest:hamcrest-all:1.3' 51 | testImplementation 'org.apache.beam:beam-runners-direct-java:2.8.0' 52 | testImplementation 'org.apache.commons:commons-compress:1.18' 53 | } 54 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bigquery-data-importer/ea3e5b02ccbb57d6a7f850b1cc840fc0dff898c1/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-5.4.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | # Determine the Java command to use to start the JVM. 86 | if [ -n "$JAVA_HOME" ] ; then 87 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 88 | # IBM's JDK on AIX uses strange locations for the executables 89 | JAVACMD="$JAVA_HOME/jre/sh/java" 90 | else 91 | JAVACMD="$JAVA_HOME/bin/java" 92 | fi 93 | if [ ! -x "$JAVACMD" ] ; then 94 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 95 | 96 | Please set the JAVA_HOME variable in your environment to match the 97 | location of your Java installation." 98 | fi 99 | else 100 | JAVACMD="java" 101 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 102 | 103 | Please set the JAVA_HOME variable in your environment to match the 104 | location of your Java installation." 105 | fi 106 | 107 | # Increase the maximum file descriptors if we can. 108 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 109 | MAX_FD_LIMIT=`ulimit -H -n` 110 | if [ $? -eq 0 ] ; then 111 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 112 | MAX_FD="$MAX_FD_LIMIT" 113 | fi 114 | ulimit -n $MAX_FD 115 | if [ $? 
-ne 0 ] ; then 116 | warn "Could not set maximum file descriptor limit: $MAX_FD" 117 | fi 118 | else 119 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 120 | fi 121 | fi 122 | 123 | # For Darwin, add options to specify how the application appears in the dock 124 | if $darwin; then 125 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 126 | fi 127 | 128 | # For Cygwin, switch paths to Windows format before running java 129 | if $cygwin ; then 130 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 131 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 132 | JAVACMD=`cygpath --unix "$JAVACMD"` 133 | 134 | # We build the pattern for arguments to be converted via cygpath 135 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 136 | SEP="" 137 | for dir in $ROOTDIRSRAW ; do 138 | ROOTDIRS="$ROOTDIRS$SEP$dir" 139 | SEP="|" 140 | done 141 | OURCYGPATTERN="(^($ROOTDIRS))" 142 | # Add a user-defined pattern to the cygpath arguments 143 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 144 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 145 | fi 146 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 147 | i=0 148 | for arg in "$@" ; do 149 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 150 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 151 | 152 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 153 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 154 | else 155 | eval `echo args$i`="\"$arg\"" 156 | fi 157 | i=$((i+1)) 158 | done 159 | case $i in 160 | (0) set -- ;; 161 | (1) set -- "$args0" ;; 162 | (2) set -- "$args0" "$args1" ;; 163 | (3) set -- "$args0" "$args1" "$args2" ;; 164 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 165 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 166 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 167 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 168 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 169 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 170 | esac 171 | fi 172 | 173 | # Escape application args 174 | save () { 175 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 176 | echo " " 177 | } 178 | APP_ARGS=$(save "$@") 179 | 180 | # Collect all arguments for the java command, following the shell quoting and substitution rules 181 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 182 | 183 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 184 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 185 | cd "$(dirname "$0")" 186 | fi 187 | 188 | exec "$JAVACMD" "$@" 189 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem http://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 33 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 34 | 35 | @rem Find java.exe 36 | if defined JAVA_HOME goto findJavaFromJavaHome 37 | 38 | set JAVA_EXE=java.exe 39 | %JAVA_EXE% -version >NUL 2>&1 40 | if "%ERRORLEVEL%" == "0" goto init 41 | 42 | echo. 43 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 44 | echo. 45 | echo Please set the JAVA_HOME variable in your environment to match the 46 | echo location of your Java installation. 47 | 48 | goto fail 49 | 50 | :findJavaFromJavaHome 51 | set JAVA_HOME=%JAVA_HOME:"=% 52 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 53 | 54 | if exist "%JAVA_EXE%" goto init 55 | 56 | echo. 57 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 58 | echo. 59 | echo Please set the JAVA_HOME variable in your environment to match the 60 | echo location of your Java installation. 61 | 62 | goto fail 63 | 64 | :init 65 | @rem Get command-line arguments, handling Windows variants 66 | 67 | if not "%OS%" == "Windows_NT" goto win9xME_args 68 | 69 | :win9xME_args 70 | @rem Slurp the command line arguments. 71 | set CMD_LINE_ARGS= 72 | set _SKIP=2 73 | 74 | :win9xME_args_slurp 75 | if "x%~1" == "x" goto execute 76 | 77 | set CMD_LINE_ARGS=%* 78 | 79 | :execute 80 | @rem Setup the command line 81 | 82 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 83 | 84 | @rem Execute Gradle 85 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 86 | 87 | :end 88 | @rem End local scope for the variables with windows NT shell 89 | if "%ERRORLEVEL%"=="0" goto mainEnd 90 | 91 | :fail 92 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 93 | rem the _cmd.exe /c_ return code! 
94 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 95 | exit /b 1 96 | 97 | :mainEnd 98 | if "%OS%"=="Windows_NT" endlocal 99 | 100 | :omega 101 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'import' 2 | 3 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/ImportPipeline.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.cloud.healthcare.config.GcpConfiguration; 19 | import com.google.common.base.Preconditions; 20 | import com.google.common.base.Strings; 21 | import com.google.devtools.common.options.Converter; 22 | import com.google.devtools.common.options.Option; 23 | import com.google.devtools.common.options.OptionsBase; 24 | import com.google.devtools.common.options.OptionsParser; 25 | import com.google.devtools.common.options.OptionsParsingException; 26 | import java.io.IOException; 27 | import java.util.Collections; 28 | 29 | /** 30 | * This is the entry point class for the import pipeline. 31 | */ 32 | public class ImportPipeline { 33 | 34 | /** Converts a single-character string input to a {@code Character}. */ 35 | public static class CharOptionsConverter implements Converter<Character> { 36 | @Override 37 | public Character convert(String input) throws OptionsParsingException { 38 | if (Strings.isNullOrEmpty(input) || input.length() > 1) { 39 | throw new OptionsParsingException("Not a character."); 40 | } 41 | 42 | return input.charAt(0); 43 | } 44 | 45 | @Override 46 | public String getTypeDescription() { 47 | return "single character"; 48 | } 49 | } 50 | 51 | /** Customized option parser. 
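 * Each field annotated with {@code @Option} below maps to a command-line flag of the same name, e.g. {@code --csv_delimiter}.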
*/ 52 | public static class PipelineOptions extends OptionsBase { 53 | 54 | @Option( 55 | name = "help", 56 | abbrev = 'h', 57 | help = "Print usage information.", 58 | defaultValue = "false") 59 | public boolean help; 60 | 61 | @Option( 62 | name = "csv_delimiter", 63 | help = "The delimiter used to separate CSV columns.", 64 | category = "csv", 65 | converter = CharOptionsConverter.class, 66 | defaultValue = ",") 67 | public char csvDelimiter; 68 | 69 | @Option( 70 | name = "csv_quote", 71 | help = "The quote character used in the file.", 72 | category = "csv", 73 | converter = CharOptionsConverter.class, 74 | defaultValue = "\"") 75 | public char csvQuoteChar; 76 | 77 | @Option( 78 | name = "csv_record_separator", 79 | help = "Row separator, typically this is either '\n' or '\r\n'.", 80 | category = "csv", 81 | defaultValue = "\n") 82 | public String csvRecordSeparator; 83 | 84 | @Option( 85 | name = "csv_delimiter_regex", 86 | help = "Optional: A regular expression used to separate fields in a record. Lookahead and " 87 | + "lookbehind can be used. This has to be used in combination with " 88 | + "csv_record_separator_regex.", 89 | category = "csv", 90 | defaultValue = "") 91 | public String csvDelimiterRegex; 92 | 93 | @Option( 94 | name = "csv_record_separator_regex", 95 | help = "Optional: A regular expression used to separate records. This has to be used in " 96 | + "combination with csv_delimiter_regex.", 97 | category = "csv", 98 | defaultValue = "") 99 | public String csvRecordSeparatorRegex; 100 | 101 | // TODO(b/120795556): update this to take a list of URIs (file or directory, can have 102 | // wildcards). 103 | @Option( 104 | name = "gcs_uri", 105 | help = "The URI of the source file on GCS.", 106 | defaultValue = "") 107 | public String gcsUri; 108 | 109 | @Option( 110 | name = "bq_dataset", 111 | help = "The BigQuery dataset to import the data into.", 112 | defaultValue = "") 113 | public String bigQueryDataset; 114 | 115 | @Option( 116 | name = "temp_bucket", 117 | help = "Used to store temporary files.", 118 | defaultValue = "") 119 | public String tempBucket; 120 | 121 | @Option( 122 | name = "gcp_project_id", 123 | help = "The project id used to run the pipeline.", 124 | defaultValue = "") 125 | public String projectId; 126 | 127 | @Option( 128 | name = "gcp_credentials", 129 | help = "Path to the credentials (usually a .json file) of a service account used to access " 130 | + "resources (GCS, Dataflow, BigQuery); the current user's credentials will be used if not " 131 | + "specified.", 132 | defaultValue = "") 133 | public String gcpCredentials; 134 | 135 | @Option( 136 | name = "dataflow_controller_service_account", 137 | help = "Customized Dataflow controller service account, see " 138 | + "https://cloud.google.com/dataflow/docs/concepts/security-and-permissions" 139 | + "#controller_service_account. 
The default will be used if not specified.", 140 | defaultValue = "") 141 | public String dataflowControllerServiceAccount; 142 | 143 | @Option( 144 | name = "verbose", 145 | abbrev = 'v', 146 | help = "Whether to output verbose messages.", 147 | defaultValue = "false") 148 | public boolean verbose; 149 | 150 | public void printUsage(OptionsParser parser) { 151 | System.out.println("Usage: java -jar import.jar OPTIONS"); 152 | System.out.println( 153 | parser.describeOptions( 154 | Collections.emptyMap(), OptionsParser.HelpVerbosity.LONG)); 155 | } 156 | } 157 | 158 | private static void validateAndConstructOptions(PipelineOptions options) throws IOException { 159 | Preconditions.checkArgument(!Strings.isNullOrEmpty(options.gcsUri), 160 | "GCS URI is required to provide the source file."); 161 | Preconditions.checkArgument(!Strings.isNullOrEmpty(options.bigQueryDataset), 162 | "BigQuery dataset is required."); 163 | Preconditions.checkArgument(!Strings.isNullOrEmpty(options.tempBucket), 164 | "Temporary bucket is required."); 165 | boolean isCsvRecordSeparatorRegexSet = !Strings.isNullOrEmpty(options.csvRecordSeparatorRegex); 166 | boolean isCsvDelimiterRegexSet = !Strings.isNullOrEmpty(options.csvDelimiterRegex); 167 | Preconditions.checkArgument(!(isCsvDelimiterRegexSet ^ isCsvRecordSeparatorRegexSet), 168 | "csv_delimiter_regex and csv_record_separator_regex need to be specified together."); 169 | 170 | CsvConfiguration.getInstance() 171 | .withRecordSeparator(options.csvRecordSeparator) 172 | .withDelimiter(options.csvDelimiter) 173 | .withQuote(options.csvQuoteChar); 174 | 175 | if (isCsvDelimiterRegexSet && isCsvRecordSeparatorRegexSet) { 176 | CsvConfiguration.getInstance() 177 | .withDelimiterRegex(options.csvDelimiterRegex) 178 | .withRecordSeparatorRegex(options.csvRecordSeparatorRegex); 179 | } 180 | 181 | GcpConfiguration.getInstance().withCredentials(options.gcpCredentials); 182 | } 183 | 184 | public static void main(String[] argv) { 185 | OptionsParser parser = OptionsParser.newOptionsParser(PipelineOptions.class); 186 | parser.parseAndExitUponError(argv); 187 | PipelineOptions options = parser.getOptions(PipelineOptions.class); 188 | 189 | if (options.help) { 190 | options.printUsage(parser); 191 | return; 192 | } 193 | 194 | try { 195 | validateAndConstructOptions(options); 196 | PipelineRunner.run(options.projectId, options.dataflowControllerServiceAccount, 197 | options.bigQueryDataset, options.tempBucket, options.gcsUri); 198 | } catch (Exception e) { 199 | if (options.verbose) { 200 | throw new RuntimeException(e); 201 | } else { 202 | System.out.println(e.getMessage()); 203 | } 204 | } 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/config/CsvConfiguration.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.config; 16 | 17 | import com.google.common.base.Preconditions; 18 | import com.google.common.base.Strings; 19 | import java.io.Serializable; 20 | import java.util.regex.Pattern; 21 | import java.util.regex.PatternSyntaxException; 22 | 23 | /** 24 | * This singleton class stores all CSV configurations at run-time. 25 | */ 26 | public class CsvConfiguration implements Serializable { 27 | 28 | private static CsvConfiguration INSTANCE = new CsvConfiguration(); 29 | 30 | // Configurations for CSV-like inputs. 31 | // The character used to separate fields, typically comma for standard CSV. 32 | private Character delimiter = ','; 33 | 34 | // In standard CSV files, quotes are added around fields with special characters such as newlines 35 | // and delimiters. This is required. 36 | private Character quote = '"'; 37 | 38 | // Character(s) used to separate records. 39 | private String recordSeparator = "\n"; 40 | 41 | // Regular expression for the field delimiter. This takes precedence over delimiter if 42 | // specified. 43 | private Pattern delimiterRegex; 44 | 45 | // Regular expression for the record separator. This takes precedence over recordSeparator if 46 | // specified. 47 | private Pattern recordSeparatorRegex; 48 | 49 | // Whether to ignore surrounding spaces. 50 | private boolean ignoreSurroundingSpaces = true; 51 | 52 | private CsvConfiguration() { 53 | } 54 | 55 | public static CsvConfiguration getInstance() { 56 | return INSTANCE; 57 | } 58 | 59 | public CsvConfiguration withDelimiter(Character delimiter) { 60 | this.delimiter = delimiter; 61 | return this; 62 | } 63 | 64 | public Character getDelimiter() { 65 | return delimiter; 66 | } 67 | 68 | public CsvConfiguration withQuote(Character quote) { 69 | Preconditions.checkArgument(quote != null, "Quote is required."); 70 | this.quote = quote; 71 | return this; 72 | } 73 | 74 | /** 75 | * Returns the character used in the input to enclose fields that contain special characters. 
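 * For example, with the default '"' quote character, a field containing a comma must appear in the input as "a,b".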
76 | */ 77 | public Character getQuote() { 78 | return quote; 79 | } 80 | 81 | public CsvConfiguration withRecordSeparator(String recordSeparator) { 82 | Preconditions.checkArgument(!Strings.isNullOrEmpty(recordSeparator), 83 | "Record separator cannot be null or empty."); 84 | this.recordSeparator = recordSeparator; 85 | return this; 86 | } 87 | 88 | public String getRecordSeparator() { 89 | return recordSeparator; 90 | } 91 | 92 | public CsvConfiguration withIgnoreSurroundingSpaces(boolean ignoreSurroundingSpaces) { 93 | this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; 94 | return this; 95 | } 96 | 97 | public boolean isIgnoreSurroundingSpaces() { 98 | return ignoreSurroundingSpaces; 99 | } 100 | 101 | public Pattern getDelimiterRegex() { 102 | return delimiterRegex; 103 | } 104 | 105 | public CsvConfiguration withDelimiterRegex(String delimiterRegex) { 106 | try { 107 | this.delimiterRegex = Pattern.compile(delimiterRegex); 108 | } catch (PatternSyntaxException e) { 109 | throw new IllegalArgumentException(e); 110 | } 111 | return this; 112 | } 113 | 114 | public Pattern getRecordSeparatorRegex() { 115 | return recordSeparatorRegex; 116 | } 117 | 118 | public CsvConfiguration withRecordSeparatorRegex(String recordSeparatorRegex) { 119 | try { 120 | this.recordSeparatorRegex = Pattern.compile(recordSeparatorRegex); 121 | } catch (PatternSyntaxException e) { 122 | throw new IllegalArgumentException(e); 123 | } 124 | return this; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/config/GcpConfiguration.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.config; 16 | 17 | import com.google.auth.oauth2.GoogleCredentials; 18 | import com.google.common.base.Strings; 19 | import com.google.common.collect.ImmutableSet; 20 | import java.io.FileInputStream; 21 | import java.io.IOException; 22 | import java.io.Serializable; 23 | import java.util.Collection; 24 | 25 | /** 26 | * Singleton that stores all GCP related configurations. 27 | */ 28 | public class GcpConfiguration implements Serializable { 29 | 30 | // See https://developers.google.com/identity/protocols/googlescopes. 31 | private static final Collection<String> SCOPES = ImmutableSet.of( 32 | // General. 33 | "https://www.googleapis.com/auth/cloud-platform", 34 | // Dataflow. 35 | "https://www.googleapis.com/auth/compute", 36 | "https://www.googleapis.com/auth/userinfo.email", 37 | // Cloud Storage. 38 | "https://www.googleapis.com/auth/devstorage.full_control", 39 | // BigQuery. 
40 | "https://www.googleapis.com/auth/bigquery" 41 | ); 42 | 43 | private static GcpConfiguration INSTANCE = new GcpConfiguration(); 44 | 45 | private GcpConfiguration() { 46 | } 47 | 48 | public static GcpConfiguration getInstance() { 49 | return INSTANCE; 50 | } 51 | 52 | private GoogleCredentials credentials; 53 | 54 | public GoogleCredentials getCredentials() { 55 | return credentials; 56 | } 57 | 58 | public GcpConfiguration withCredentials(String credentials) throws IOException { 59 | if (!Strings.isNullOrEmpty(credentials)) { 60 | this.credentials = GoogleCredentials.fromStream(new FileInputStream(credentials)) 61 | .createScoped(SCOPES); 62 | } 63 | return this; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/BaseHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import com.google.cloud.healthcare.io.OutputWriterFactory; 18 | 19 | /** 20 | * A base decompression handler which keeps an {@link OutputWriterFactory} for producing writers 21 | * to destination. 22 | */ 23 | abstract class BaseHandler implements DecompressHandler { 24 | protected OutputWriterFactory factory; 25 | 26 | BaseHandler(OutputWriterFactory factory) { 27 | this.factory = factory; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/CompressionAlgorithm.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import java.util.Arrays; 18 | import java.util.HashSet; 19 | import java.util.Optional; 20 | import java.util.Set; 21 | 22 | /** 23 | * Here we are using the MIME type from GCS to determine if the file is compressed. 
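 * For example, an object uploaded with content type "application/gzip" is treated as GZIP compressed.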
24 | * 25 | * Those mime types are extracted from IANA and Wikipedia: 26 | * https://www.iana.org/assignments/media-types/media-types.xhtml and 27 | * https://en.wikipedia.org/wiki/List_of_archive_formats 28 | */ 29 | enum CompressionAlgorithm { 30 | // Supported. 31 | GZIP(true, "application/gzip", "application/x-gzip"), 32 | ZIP(true, "application/zip"), 33 | TAR(true, "application/x-tar"), 34 | LZ4(true, "application/x-lz4"), 35 | // Unsupported. 36 | RAR(false, "application/vnd.rar", "application/x-rar-compressed"), 37 | SEVEN_Z(false, "application/x-7z-compressed"), 38 | BZIP(false, "application/x-bzip"), 39 | BZIP2(false, "application/x-bzip2"), 40 | LZ(false, "application/x-lzip"), 41 | LZMA(false, "application/x-lzma"), 42 | LZO(false, "application/x-lzop"), 43 | XZ(false, "application/x-xz"), 44 | Z(false, "application/x-compress"); 45 | 46 | private final String[] mimeTypes; 47 | private final boolean supported; 48 | 49 | CompressionAlgorithm(boolean supported, String... mimeTypes) { 50 | this.mimeTypes = mimeTypes; 51 | this.supported = supported; 52 | } 53 | 54 | boolean isSupported() { 55 | return supported; 56 | } 57 | 58 | Set<String> getMimeTypes() { 59 | return new HashSet<>(Arrays.asList(mimeTypes)); 60 | } 61 | 62 | static Optional<CompressionAlgorithm> valueOfMimeType(String mimeType) { 63 | return Arrays.stream(values()) 64 | .filter(a -> a.getMimeTypes().contains(mimeType)) 65 | .findAny(); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/DecompressHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import com.google.cloud.healthcare.io.InputReader; 18 | import java.io.IOException; 19 | import java.util.List; 20 | 21 | /** 22 | * Common interface for handling compressed data. 23 | */ 24 | public interface DecompressHandler { 25 | 26 | /** 27 | * Handles compressed streams, and returns all the destinations for extracted files. 28 | */ 29 | List<String> handle(InputReader reader) throws IOException; 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/Decompressor.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import com.google.cloud.healthcare.io.InputReader; 18 | import com.google.cloud.healthcare.io.OutputWriter; 19 | import com.google.cloud.healthcare.io.OutputWriterFactory; 20 | import com.google.common.collect.Lists; 21 | import java.io.IOException; 22 | import java.util.List; 23 | import java.util.Optional; 24 | 25 | /** 26 | * Decompressor tries to decompress content from an {@link InputReader} and writes decompressed 27 | * data to a different location. 28 | */ 29 | public class Decompressor { 30 | 31 | private OutputWriterFactory factory; 32 | 33 | public Decompressor(OutputWriterFactory factory) { 34 | this.factory = factory; 35 | } 36 | 37 | /** 38 | * Decompresses the content provided by the {@link InputReader} and writes the decompressed data 39 | * to a location specified by the {@link OutputWriter}. 40 | * 41 | * @return the names of the extracted files, or the name of the original input if it is not compressed. 42 | * @throws IOException if reading or writing fails. 43 | */ 44 | public List<String> decompress(InputReader reader) throws IOException { 45 | String contentType = reader.getContentType(); 46 | Optional<CompressionAlgorithm> algoOpt = CompressionAlgorithm.valueOfMimeType(contentType); 47 | // Check if the file is compressed. 48 | if (!algoOpt.isPresent()) { 49 | return Lists.newArrayList(reader.getName()); 50 | } 51 | 52 | CompressionAlgorithm algo = algoOpt.get(); 53 | if (!algo.isSupported()) { 54 | throw new RuntimeException(contentType + " is not supported."); 55 | } 56 | 57 | switch (algo) { 58 | case GZIP: 59 | return new GZipHandler(factory).handle(reader); 60 | case ZIP: 61 | return new ZipHandler(factory).handle(reader); 62 | case TAR: 63 | return new TarHandler(factory).handle(reader); 64 | case LZ4: 65 | return new LZ4Handler(factory).handle(reader); 66 | default: 67 | // Should not reach here. 68 | throw new RuntimeException( 69 | String.format("Reached a branch %s which should not be visited.", algo)); 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/GZipHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import com.google.cloud.healthcare.io.InputReader; 18 | import com.google.cloud.healthcare.io.OutputWriter; 19 | import com.google.cloud.healthcare.io.OutputWriterFactory; 20 | import com.google.common.base.Strings; 21 | import com.google.common.collect.Lists; 22 | import com.google.common.io.ByteStreams; 23 | import java.io.IOException; 24 | import java.nio.channels.Channels; 25 | import java.nio.channels.WritableByteChannel; 26 | import java.nio.file.Paths; 27 | import java.util.List; 28 | import java.util.zip.GZIPInputStream; 29 | import org.apache.commons.io.FilenameUtils; 30 | 31 | class GZipHandler extends BaseHandler { 32 | 33 | GZipHandler(OutputWriterFactory factory) { 34 | super(factory); 35 | } 36 | 37 | @Override 38 | public List<String> handle(InputReader reader) throws IOException { 39 | String origName = reader.getName(); 40 | if (Strings.isNullOrEmpty(origName)) { 41 | throw new IllegalStateException("Cannot get the name of the input file."); 42 | } 43 | 44 | String outputName = FilenameUtils.removeExtension(Paths.get(origName).getFileName().toString()); 45 | OutputWriter writer = factory.getOutputWriter(outputName); 46 | try (GZIPInputStream gis = 47 | new GZIPInputStream(Channels.newInputStream(reader.getReadChannel())); 48 | WritableByteChannel writeChannel = writer.getWriteChannel()) { 49 | ByteStreams.copy(Channels.newChannel(gis), writeChannel); 50 | return Lists.newArrayList(writer.getName()); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/LZ4Handler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import com.google.cloud.healthcare.io.InputReader; 18 | import com.google.cloud.healthcare.io.OutputWriter; 19 | import com.google.cloud.healthcare.io.OutputWriterFactory; 20 | import com.google.common.base.Strings; 21 | import com.google.common.collect.Lists; 22 | import com.google.common.io.ByteStreams; 23 | import java.io.IOException; 24 | import java.nio.channels.Channels; 25 | import java.nio.channels.WritableByteChannel; 26 | import java.nio.file.Paths; 27 | import java.util.List; 28 | import org.apache.commons.compress.compressors.lz4.FramedLZ4CompressorInputStream; 29 | import org.apache.commons.io.FilenameUtils; 30 | 31 | /** 32 | * Handles LZ4 compressed files. We only support the LZ4 framed format. 
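 * Input in the raw LZ4 block format (without frame headers) is not supported and will be rejected.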
33 | * 34 | * LZ4 frame format: https://github.com/lz4/lz4/blob/master/doc/lz4_Frame_format.md 35 | * LZ4 block format: https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md 36 | */ 37 | class LZ4Handler extends BaseHandler { 38 | 39 | LZ4Handler(OutputWriterFactory factory) { 40 | super(factory); 41 | } 42 | 43 | @Override 44 | public List<String> handle(InputReader reader) throws IOException { 45 | String origName = reader.getName(); 46 | if (Strings.isNullOrEmpty(origName)) { 47 | throw new IllegalStateException("Cannot get the name of the input file."); 48 | } 49 | String outputName = FilenameUtils.removeExtension(Paths.get(origName).getFileName().toString()); 50 | OutputWriter writer = factory.getOutputWriter(outputName); 51 | try (FramedLZ4CompressorInputStream flz4is = new FramedLZ4CompressorInputStream( 52 | Channels.newInputStream(reader.getReadChannel())); 53 | WritableByteChannel writeChannel = writer.getWriteChannel()) { 54 | ByteStreams.copy(Channels.newChannel(flz4is), writeChannel); 55 | return Lists.newArrayList(writer.getName()); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/TarHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import com.google.cloud.healthcare.io.InputReader; 18 | import com.google.cloud.healthcare.io.OutputWriter; 19 | import com.google.cloud.healthcare.io.OutputWriterFactory; 20 | import com.google.common.collect.Lists; 21 | import com.google.common.io.ByteStreams; 22 | import java.io.IOException; 23 | import java.nio.channels.Channels; 24 | import java.nio.channels.WritableByteChannel; 25 | import java.nio.file.Paths; 26 | import java.util.List; 27 | import org.apache.commons.compress.archivers.tar.TarArchiveEntry; 28 | import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; 29 | import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; 30 | 31 | class TarHandler extends BaseHandler { 32 | 33 | TarHandler(OutputWriterFactory factory) { 34 | super(factory); 35 | } 36 | 37 | @Override 38 | public List<String> handle(InputReader reader) throws IOException { 39 | List<String> files = Lists.newArrayList(); 40 | 41 | // TODO(b/121029418): Not all TAR archives are compressed, we need to check the filenames to be 42 | // sure. Also consider refactoring the handlers to adopt the decorator pattern. 
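// For now the input is assumed to be a gzip-compressed TAR (e.g. .tar.gz), hence the unconditional gzip wrapper below.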
43 | try (TarArchiveInputStream tis = new TarArchiveInputStream( 44 | new GzipCompressorInputStream(Channels.newInputStream(reader.getReadChannel())))) { 45 | 46 | TarArchiveEntry entry; 47 | while ((entry = tis.getNextTarEntry()) != null) { 48 | if (!entry.isDirectory()) { 49 | OutputWriter writer = factory.getOutputWriter(Paths.get(entry.getName()).toString()); 50 | try (WritableByteChannel writeChannel = writer.getWriteChannel()) { 51 | ByteStreams.copy(Channels.newChannel(tis), writeChannel); 52 | files.add(writer.getName()); 53 | } 54 | } 55 | } 56 | } 57 | 58 | return files; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/decompress/ZipHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import com.google.cloud.healthcare.io.InputReader; 18 | import com.google.cloud.healthcare.io.OutputWriter; 19 | import com.google.cloud.healthcare.io.OutputWriterFactory; 20 | import com.google.common.collect.Lists; 21 | import com.google.common.io.ByteStreams; 22 | import java.io.IOException; 23 | import java.nio.channels.Channels; 24 | import java.nio.channels.WritableByteChannel; 25 | import java.nio.file.Paths; 26 | import java.util.List; 27 | import java.util.zip.ZipEntry; 28 | import java.util.zip.ZipInputStream; 29 | import org.slf4j.Logger; 30 | import org.slf4j.LoggerFactory; 31 | 32 | /** 33 | * Handles ZIP compressed files. ZIP is special because it is also an archive format. 
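 * Each archive entry is extracted to its own output destination, so a single ZIP file may yield multiple outputs.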
34 | */ 35 | class ZipHandler extends BaseHandler { 36 | private static final Logger LOG = LoggerFactory.getLogger(ZipHandler.class); 37 | 38 | ZipHandler(OutputWriterFactory factory) { 39 | super(factory); 40 | } 41 | 42 | @Override 43 | public List handle(InputReader reader) throws IOException { 44 | ZipInputStream zis = new ZipInputStream(Channels.newInputStream(reader.getReadChannel())); 45 | 46 | List files = Lists.newArrayList(); 47 | ZipEntry entry = zis.getNextEntry(); 48 | while (entry != null) { 49 | if (!entry.isDirectory()) { 50 | OutputWriter writer = factory.getOutputWriter(Paths.get(entry.getName()).toString()); 51 | try (WritableByteChannel writeChannel = writer.getWriteChannel()) { 52 | ByteStreams.copy(Channels.newChannel(zis), writeChannel); 53 | files.add(writer.getName()); 54 | } 55 | } 56 | 57 | entry = zis.getNextEntry(); 58 | } 59 | 60 | zis.closeEntry(); 61 | zis.close(); 62 | return files; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/io/ByteReader.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import com.google.common.collect.Lists; 18 | import java.io.IOException; 19 | import java.nio.ByteBuffer; 20 | import java.nio.channels.SeekableByteChannel; 21 | import java.util.List; 22 | 23 | /** 24 | * Wrapper around a {@link SeekableByteChannel} with some useful methods. 25 | */ 26 | public class ByteReader { 27 | 28 | private static final int BUFFER_SIZE = 8192; 29 | 30 | public static final byte NEWLINE_FEED = 0xA; 31 | public static final byte CARRIAGE_RETURN = 0XD; 32 | 33 | private final SeekableByteChannel channel; 34 | 35 | public ByteReader(SeekableByteChannel ch) { 36 | this.channel = ch; 37 | } 38 | 39 | /** 40 | * Reads a line from the channel, the read byte array doesn't include line feed or carriage 41 | * return, the position of the channel is set to after the line terminators. 42 | */ 43 | public byte[] readLine() throws IOException { 44 | long currPos = channel.position(); 45 | 46 | ByteBuffer byteBuffer = ByteBuffer.allocate(BUFFER_SIZE); 47 | List readBytes = Lists.newArrayList(); 48 | 49 | boolean carriageReturn = false; 50 | while (channel.read(byteBuffer) > 0) { 51 | byteBuffer.flip(); 52 | 53 | while (byteBuffer.hasRemaining()) { 54 | byte b = byteBuffer.get(); 55 | 56 | if (b == NEWLINE_FEED || carriageReturn) { 57 | long nextPos = currPos + readBytes.size(); 58 | // Exclude the line feed and previous carriage return. 59 | if (b == NEWLINE_FEED) { 60 | nextPos++; 61 | } 62 | if (carriageReturn) { 63 | nextPos++; 64 | } 65 | 66 | channel.position(nextPos); 67 | return toArray(readBytes); 68 | } else if (b == CARRIAGE_RETURN) { 69 | // Mark carriage return, check if next byte is line feed. 
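// A lone CR (not followed by LF) also terminates the line: on the next
// iteration carriageReturn is true, so we return with the channel positioned
// right after the CR, and the current byte is re-read by the next call.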
70 | carriageReturn = true; 71 | } else { 72 | readBytes.add(b); 73 | } 74 | } 75 | 76 | byteBuffer.clear(); 77 | } 78 | 79 | return toArray(readBytes); 80 | } 81 | 82 | /** 83 | * Reads all the bytes until end (exclusive). 84 | */ 85 | public byte[] readUntil(long end) throws IOException { 86 | long currPos = channel.position(); 87 | 88 | ByteBuffer byteBuffer = ByteBuffer.allocate(BUFFER_SIZE); 89 | List readBytes = Lists.newArrayList(); 90 | 91 | while (channel.read(byteBuffer) > 0) { 92 | byteBuffer.flip(); 93 | 94 | while (byteBuffer.hasRemaining()) { 95 | if (readBytes.size() >= end - currPos) { 96 | return toArray(readBytes); 97 | } 98 | 99 | readBytes.add(byteBuffer.get()); 100 | } 101 | 102 | byteBuffer.clear(); 103 | } 104 | 105 | return toArray(readBytes); 106 | } 107 | 108 | private static byte[] toArray(List bytes) { 109 | byte[] result = new byte[bytes.size()]; 110 | for (int i = 0; i < bytes.size(); i++) { 111 | result[i] = bytes.get(i); 112 | } 113 | return result; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/io/GcsInputReader.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import com.google.auth.oauth2.GoogleCredentials; 18 | import com.google.cloud.ReadChannel; 19 | import com.google.cloud.healthcare.process.schema.GcpUtil; 20 | import com.google.cloud.healthcare.util.StringUtil; 21 | import com.google.cloud.storage.Blob; 22 | import com.google.cloud.storage.BlobId; 23 | import com.google.cloud.storage.Storage; 24 | import java.io.IOException; 25 | import java.io.ObjectInputStream; 26 | import java.io.Serializable; 27 | import java.nio.channels.ReadableByteChannel; 28 | 29 | /** 30 | * GcsInputReader is a wrapper of a Google Cloud Storage (GCS) object that makes it easier to read 31 | * the content. 
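 * <p>A minimal usage sketch (the URI is illustrative):
 * <pre>{@code
 * InputReader reader = new GcsInputReader(credentials, "gs://my-bucket/data.csv");
 * try (ReadableByteChannel ch = reader.getReadChannel()) {
 *   // Consume the channel, e.g. feed it to a decompress handler.
 * }
 * }</pre>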
32 | */ 33 | public class GcsInputReader implements InputReader, Serializable { 34 | 35 | private String bucket; 36 | private String path; 37 | private GoogleCredentials credentials; 38 | private transient Blob blob; 39 | 40 | public GcsInputReader(GoogleCredentials credentials, String bucket, String path) { 41 | this.bucket = bucket; 42 | this.path = path; 43 | this.credentials = credentials; 44 | initBlob(); 45 | } 46 | 47 | public GcsInputReader(GoogleCredentials credentials, String gcsUri) { 48 | String[] parts = StringUtil.splitGcsUri(gcsUri); 49 | this.bucket = parts[0]; 50 | this.path = parts[1]; 51 | this.credentials = credentials; 52 | initBlob(); 53 | } 54 | 55 | @Override 56 | public String getContentType() { 57 | return blob.getContentType(); 58 | } 59 | 60 | @Override 61 | public ReadableByteChannel getReadChannel() { 62 | ReadChannel readCh = blob.reader(); 63 | readCh.setChunkSize(CHUNK_SIZE); 64 | return readCh; 65 | } 66 | 67 | @Override 68 | public String getName() { 69 | return StringUtil.generateGcsUri(bucket, path); 70 | } 71 | 72 | @Override 73 | public long getSize() { 74 | return blob.getSize(); 75 | } 76 | 77 | public String getBucket() { 78 | return bucket; 79 | } 80 | 81 | public String getPath() { 82 | return path; 83 | } 84 | 85 | private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { 86 | in.defaultReadObject(); 87 | initBlob(); 88 | } 89 | 90 | private void initBlob() { 91 | Storage storage = GcpUtil.getGcsClient(credentials); 92 | blob = storage.get(BlobId.of(bucket, path)); 93 | } 94 | 95 | @Override 96 | public boolean equals(Object obj) { 97 | if (obj == null) { 98 | return false; 99 | } 100 | 101 | if (!(obj instanceof GcsInputReader)) { 102 | return false; 103 | } 104 | 105 | GcsInputReader anotherReader = (GcsInputReader) obj; 106 | return bucket.equals(anotherReader.getBucket()) && path.equals(anotherReader.getPath()); 107 | } 108 | 109 | @Override 110 | public int hashCode() { 111 | return 31 * bucket.hashCode() + path.hashCode(); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/io/GcsOutputWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import com.google.auth.oauth2.GoogleCredentials; 18 | import com.google.cloud.healthcare.process.schema.GcpUtil; 19 | import com.google.cloud.healthcare.util.StringUtil; 20 | import com.google.cloud.storage.Blob; 21 | import com.google.cloud.storage.BlobInfo; 22 | import com.google.cloud.storage.Storage; 23 | import java.nio.channels.WritableByteChannel; 24 | 25 | /** 26 | * GcsOutputWriter is a wrapper of a GCS object that makes it easier to write. 
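 * <p>A minimal usage sketch (the URI is illustrative):
 * <pre>{@code
 * OutputWriter writer = new GcsOutputWriter(credentials, "gs://my-bucket/out/part-0");
 * try (WritableByteChannel ch = writer.getWriteChannel()) {
 *   ch.write(ByteBuffer.wrap(data));
 * }
 * }</pre>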
27 | */ 28 | public class GcsOutputWriter implements OutputWriter { 29 | 30 | private final String bucket; 31 | private final String path; 32 | private final GoogleCredentials credentials; 33 | private Storage storage; 34 | 35 | public GcsOutputWriter(GoogleCredentials credentials, String bucket, String path) { 36 | this.bucket = bucket; 37 | this.path = path; 38 | this.credentials = credentials; 39 | initStorage(); 40 | } 41 | 42 | public GcsOutputWriter(GoogleCredentials credentials, String gcsUri) { 43 | String[] parts = StringUtil.splitGcsUri(gcsUri); 44 | bucket = parts[0]; 45 | path = parts[1]; 46 | this.credentials = credentials; 47 | initStorage(); 48 | } 49 | 50 | @Override 51 | public WritableByteChannel getWriteChannel() { 52 | Blob blob = storage.create(BlobInfo.newBuilder(bucket, path).build()); 53 | return blob.writer(); 54 | } 55 | 56 | @Override 57 | public String getName() { 58 | return StringUtil.generateGcsUri(bucket, path); 59 | } 60 | 61 | private void initStorage() { 62 | storage = GcpUtil.getGcsClient(credentials); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/io/GcsOutputWriterFactory.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import com.google.auth.oauth2.GoogleCredentials; 18 | import com.google.cloud.healthcare.util.StringUtil; 19 | import com.google.common.base.Strings; 20 | 21 | /** 22 | * Produces {@link GcsOutputWriter}s to store data on GCS. 23 | */ 24 | public class GcsOutputWriterFactory implements OutputWriterFactory { 25 | 26 | private final String bucket; 27 | private final String path; 28 | private final GoogleCredentials credentials; 29 | 30 | /** 31 | * Note here the path represents a logical folder, all {@link OutputWriter} created will point to 32 | * a file within the folder. 
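 * <p>For example, a factory built with bucket {@code b} and path {@code out} maps
 * {@code getOutputWriter("x.csv")} to {@code gs://b/out/x.csv} (names illustrative).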
33 | */ 34 | public GcsOutputWriterFactory(GoogleCredentials credentials, String bucket, String path) { 35 | this.bucket = bucket; 36 | this.path = path; 37 | this.credentials = credentials; 38 | } 39 | 40 | public GcsOutputWriterFactory(GoogleCredentials credentials, String gcsUri) { 41 | String[] parts = StringUtil.splitGcsUri(gcsUri); 42 | bucket = parts[0]; 43 | path = parts[1]; 44 | this.credentials = credentials; 45 | } 46 | 47 | @Override 48 | public OutputWriter getOutputWriter(String name) { 49 | String completePath = String.format("%s/%s", path, name).replaceAll("^/+", ""); 50 | if (Strings.isNullOrEmpty(completePath)) { 51 | throw new IllegalArgumentException("Complete path cannot be empty."); 52 | } 53 | return new GcsOutputWriter(credentials, bucket, completePath); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/io/InputReader.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import java.nio.channels.ReadableByteChannel; 18 | import javax.annotation.Nullable; 19 | 20 | /** 21 | * InputReader abstracts the input and provide useful methods for reading the content. 22 | */ 23 | public interface InputReader { 24 | 25 | int CHUNK_SIZE = 4 * 1024 * 1024; // 4MB. 26 | 27 | /** 28 | * @return the content type of the input. 29 | */ 30 | String getContentType(); 31 | 32 | /** 33 | * @return an {@link ReadableByteChannel} to read the content of the input. 34 | */ 35 | ReadableByteChannel getReadChannel(); 36 | 37 | /** 38 | * @return the size of the input. -1 is returned if the concrete size is unknown. 39 | */ 40 | long getSize(); 41 | 42 | /** 43 | * @return the original name of the input, null if not available. 44 | */ 45 | @Nullable String getName(); 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/io/OutputWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import java.nio.channels.WritableByteChannel; 18 | 19 | /** 20 | * A common interface for writing to a location. 21 | */ 22 | public interface OutputWriter { 23 | 24 | /** 25 | * Fetches an {@link WritableByteChannel} which points to a file specified as output. 26 | */ 27 | WritableByteChannel getWriteChannel(); 28 | 29 | /** 30 | * @return the name of the object this writer writes to, null if not available. 31 | */ 32 | String getName(); 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/io/OutputWriterFactory.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | /** 18 | * Creates {@link OutputWriter}s which point to the designated destinations. 19 | */ 20 | public interface OutputWriterFactory { 21 | 22 | /** 23 | * @return an {@link OutputWriter} that points to an object specified by the name. 24 | */ 25 | OutputWriter getOutputWriter(String name); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/BigQueryDestinations.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Google LLC. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.healthcare.process.pipeline; 18 | 19 | import com.google.api.services.bigquery.model.TableReference; 20 | import com.google.api.services.bigquery.model.TableRow; 21 | import com.google.api.services.bigquery.model.TableSchema; 22 | import com.google.cloud.healthcare.process.schema.FieldType; 23 | import com.google.cloud.healthcare.process.schema.GcpUtil; 24 | import com.google.cloud.healthcare.process.schema.SchemaUtil; 25 | import com.google.common.collect.ImmutableList; 26 | import java.util.List; 27 | import java.util.Map; 28 | import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations; 29 | import org.apache.beam.sdk.io.gcp.bigquery.TableDestination; 30 | import org.apache.beam.sdk.values.KV; 31 | import org.apache.beam.sdk.values.PCollectionView; 32 | import org.apache.beam.sdk.values.ValueInSingleWindow; 33 | 34 | /** 35 | * A {@link DynamicDestinations} used by {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO} 36 | * to indicate the destination BigQuery table for each record. Each record is routed based on its 37 | * source URI. The schemas and headers are passed in to construct {@link TableSchema} from {@code 38 | * FieldType[]}. Note that {@link TableSchema} is not serializable. 39 | */ 40 | public class BigQueryDestinations extends DynamicDestinations<KV<String, TableRow>, String> { 41 | 42 | private final PCollectionView<Map<String, FieldType[]>> schemasView; 43 | private final PCollectionView<Map<String, String[]>> headersView; 44 | private final String projectId; 45 | private final String datasetId; 46 | 47 | public BigQueryDestinations( 48 | PCollectionView<Map<String, FieldType[]>> schemasView, 49 | PCollectionView<Map<String, String[]>> headersView, 50 | String projectId, 51 | String datasetId) { 52 | this.schemasView = schemasView; 53 | this.headersView = headersView; 54 | this.projectId = projectId; 55 | this.datasetId = datasetId; 56 | } 57 | 58 | @Override 59 | public List<PCollectionView<?>> getSideInputs() { 60 | return ImmutableList.of(schemasView, headersView); 61 | } 62 | 63 | @Override 64 | public String getDestination(ValueInSingleWindow<KV<String, TableRow>> element) { 65 | return element.getValue().getKey(); 66 | } 67 | 68 | @Override 69 | public TableDestination getTable(String destination) { 70 | TableReference tableReference = 71 | GcpUtil.getBigQueryTableReference(projectId, datasetId, destination); 72 | 73 | return new TableDestination(tableReference, null); 74 | } 75 | 76 | @Override 77 | public TableSchema getSchema(String destination) { 78 | // TODO(b/135939392): Avoid reconstructing TableSchema for each record. 79 | FieldType[] schema = sideInput(schemasView).get(destination); 80 | String[] header = sideInput(headersView).get(destination); 81 | return SchemaUtil.generateBigQueryTableSchema(header, schema); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/FillTableRowFn.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Google LLC. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.healthcare.process.pipeline; 18 | 19 | import com.google.api.services.bigquery.model.TableRow; 20 | import com.google.cloud.healthcare.process.schema.FieldType; 21 | import com.google.cloud.healthcare.process.schema.SchemaUtil; 22 | import com.google.common.base.Strings; 23 | import java.util.Map; 24 | import org.apache.beam.sdk.transforms.DoFn; 25 | import org.apache.beam.sdk.values.KV; 26 | import org.apache.beam.sdk.values.PCollectionView; 27 | 28 | /** 29 | * A {@link DoFn} that fills data into {@link TableRow}s accepted by {@link 30 | * org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}. The schemas and headers are passed in as side 31 | * inputs. 32 | */ 33 | public class FillTableRowFn extends DoFn<KV<String, String[]>, KV<String, TableRow>> { 34 | 35 | private final PCollectionView<Map<String, FieldType[]>> schemasView; 36 | private final PCollectionView<Map<String, String[]>> headersView; 37 | 38 | public FillTableRowFn( 39 | PCollectionView<Map<String, FieldType[]>> schemasView, 40 | PCollectionView<Map<String, String[]>> headersView) { 41 | this.schemasView = schemasView; 42 | this.headersView = headersView; 43 | } 44 | 45 | @ProcessElement 46 | public void fill( 47 | @Element KV<String, String[]> in, 48 | OutputReceiver<KV<String, TableRow>> out, 49 | ProcessContext ctx) { 50 | FieldType[] schema = ctx.sideInput(schemasView).get(in.getKey()); 51 | String[] header = ctx.sideInput(headersView).get(in.getKey()); 52 | String[] record = in.getValue(); 53 | 54 | TableRow tableRow = new TableRow(); 55 | for (int i = 0; i < record.length; i++) { 56 | String field = record[i]; 57 | if (!Strings.isNullOrEmpty(field)) { 58 | String key = header[i]; 59 | FieldType type = schema[i]; 60 | 61 | switch (type) { 62 | case INT: 63 | tableRow.put(key, SchemaUtil.convertToInteger(field)); 64 | break; 65 | case LONG: 66 | tableRow.put(key, SchemaUtil.convertToLong(field)); 67 | break; 68 | case DOUBLE: 69 | tableRow.put(key, SchemaUtil.convertToDouble(field)); 70 | break; 71 | // TODO(b/120794993): Skip date/time conversion based on a flag supplied by users. 72 | case TIME: 73 | tableRow.put(key, SchemaUtil.convertToTime(field)); 74 | break; 75 | case DATE: 76 | tableRow.put(key, SchemaUtil.convertToDate(field)); 77 | break; 78 | case DATETIME: 79 | tableRow.put(key, SchemaUtil.convertToDateTime(field)); 80 | break; 81 | case BOOLEAN: 82 | tableRow.put(key, SchemaUtil.isTrue(field)); 83 | break; 84 | default: 85 | tableRow.put(key, field); 86 | break; 87 | } 88 | } 89 | } 90 | 91 | out.output(KV.of(in.getKey(), tableRow)); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/GcsReadChunksFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | package com.google.cloud.healthcare.process.pipeline; 16 | 17 | import com.google.cloud.healthcare.config.GcpConfiguration; 18 | import com.google.cloud.healthcare.process.schema.GcpUtil; 19 | import com.google.common.collect.Lists; 20 | import com.google.common.io.ByteStreams; 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.nio.channels.Channels; 24 | import java.nio.channels.ReadableByteChannel; 25 | import java.util.Collections; 26 | import java.util.List; 27 | import java.util.Set; 28 | import org.apache.beam.sdk.transforms.DoFn; 29 | import org.apache.beam.sdk.values.KV; 30 | 31 | /** A Beam {@link DoFn} that reads chunked data from GCS according to split points. */ 32 | public class GcsReadChunksFn extends DoFn<KV<String, Set<Long>>, KV<String, byte[]>> { 33 | 34 | private final GcpConfiguration config; 35 | 36 | public GcsReadChunksFn(GcpConfiguration config) { 37 | this.config = config; 38 | } 39 | 40 | @ProcessElement 41 | public void generate(ProcessContext ctx) { 42 | KV<String, Set<Long>> input = ctx.element(); 43 | 44 | List<Long> splitPoints = Lists.newArrayList(input.getValue()); 45 | Collections.sort(splitPoints); 46 | 47 | String name = input.getKey(); 48 | 49 | try (ReadableByteChannel channel = GcpUtil.openGcsFile(config.getCredentials(), name); 50 | InputStream is = Channels.newInputStream(channel)) { 51 | ByteStreams.skipFully(is, splitPoints.get(0)); 52 | for (int i = 0; i < splitPoints.size() - 1; i++) { 53 | int len = (int) (splitPoints.get(i + 1) - splitPoints.get(i)); 54 | byte[] content = new byte[len]; 55 | ByteStreams.readFully(is, content, 0, len); 56 | ctx.output(KV.of(name, content)); 57 | } 58 | } catch (IOException e) { 59 | throw new RuntimeException(e); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/CsvDetectSchemaFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.process.schema.FieldType; 18 | import com.google.cloud.healthcare.process.schema.SchemaUtil; 19 | import java.util.Arrays; 20 | import java.util.List; 21 | import org.apache.beam.sdk.transforms.DoFn; 22 | import org.apache.beam.sdk.values.KV; 23 | 24 | /** 25 | * A {@link DoFn} that detects the schema for a chunk of data.
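 * <p>For example, a parsed row {@code ["1", "2.5", "abc"]} yields the chunk schema
 * {@code [INT, DOUBLE, STRING]} via {@link SchemaUtil#infer}.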
26 | */ 27 | public class CsvDetectSchemaFn extends 28 | DoFn<KV<String, String[]>, KV<String, List<FieldType>>> { 29 | 30 | @ProcessElement 31 | public void detect(ProcessContext ctx) { 32 | KV<String, String[]> input = ctx.element(); 33 | 34 | ctx.output(KV.of(input.getKey(), SchemaUtil.infer(Arrays.asList(input.getValue())))); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/CsvExtractHeadersFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.cloud.healthcare.process.schema.FieldType; 19 | import com.google.cloud.healthcare.process.schema.SchemaUtil; 20 | import com.google.common.base.Strings; 21 | import com.google.common.collect.Sets; 22 | import java.io.BufferedReader; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | import java.nio.channels.Channels; 26 | import java.util.Arrays; 27 | import java.util.List; 28 | import org.apache.beam.sdk.io.FileIO.ReadableFile; 29 | import org.apache.beam.sdk.transforms.DoFn; 30 | import org.apache.beam.sdk.values.KV; 31 | 32 | /** 33 | * A {@link DoFn} that extracts the first line of a {@link ReadableFile} as headers and applies 34 | * simple validations. 35 | */ 36 | public class CsvExtractHeadersFn extends DoFn<ReadableFile, KV<String, String[]>> { 37 | 38 | @ProcessElement 39 | public void extract(ProcessContext ctx) throws IOException { 40 | ReadableFile file = ctx.element(); 41 | 42 | try (BufferedReader reader = new BufferedReader( 43 | new InputStreamReader(Channels.newInputStream(file.open())))) { 44 | String line = reader.readLine(); 45 | 46 | String delimiter = CsvConfiguration.getInstance().getDelimiter().toString(); 47 | String[] headers = normalizeHeaders(line.split(delimiter)); 48 | if (!validateHeaders(headers)) { 49 | throw new IllegalArgumentException(String.format( 50 | "%s of file %s contains invalid headers. Headers are required for every file.", 51 | line, file.getMetadata().resourceId().toString())); 52 | } 53 | 54 | ctx.output(KV.of(file.getMetadata().resourceId().toString(), headers)); 55 | } 56 | } 57 | 58 | /** 59 | * Normalizes headers by removing quotes (only those at the beginning and at the end), and 60 | * replacing any non-word characters with underscores to meet the requirements on column names 61 | * from both AVRO and BigQuery. 62 | * 63 | * Headers might collide after the normalization, in which case we'll abort. 64 | */ 65 | private static String[] normalizeHeaders(String[] headers) { 66 | for (int i = 0; i < headers.length; i++) { 67 | headers[i] = headers[i] 68 | // Surrounding quotes. 69 | .replaceAll("^\"|\"$", "") 70 | // Non-word chars.
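// e.g. "First Name" -> "First_Name" and "price($)" -> "price___".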
71 | .replaceAll("\\W", "_"); 72 | } 73 | return headers; 74 | } 75 | 76 | /** 77 | * Checks whether the first line really contains headers. 78 | * 79 | * The heuristics deployed here are: 80 | * 81 | * 1. There are no duplicate values 82 | * 2. There are no non-string values 83 | * 3. There are no empty values 84 | * 85 | * @return false if we know for sure that these are not valid headers; true means that, based on 86 | * our heuristics, these look like valid headers. 87 | */ 88 | private static boolean validateHeaders(String[] headers) { 89 | // Empty values. 90 | for (String header : headers) { 91 | if (Strings.isNullOrEmpty(header) || Strings.isNullOrEmpty(header.trim())) { 92 | return false; 93 | } 94 | } 95 | 96 | // Non-string values. 97 | List<FieldType> types = SchemaUtil.infer(Arrays.asList(headers)); 98 | if (types.stream().anyMatch(t -> t != FieldType.STRING)) { 99 | return false; 100 | } 101 | 102 | // Duplicate values. 103 | return Sets.newHashSet(headers).size() == headers.length; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/CsvMergeSchemaFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.process.schema.FieldType; 18 | import com.google.cloud.healthcare.process.schema.SchemaUtil; 19 | import com.google.common.collect.Lists; 20 | import com.google.common.collect.Streams; 21 | import java.util.List; 22 | import org.apache.beam.sdk.transforms.Combine.CombineFn; 23 | 24 | /** 25 | * A {@link CombineFn} which merges all pieces of schemas and generates a complete schema for 26 | * importing to BigQuery later. 27 | */ 28 | public class CsvMergeSchemaFn extends CombineFn<List<FieldType>, List<FieldType>, FieldType[]> { 29 | 30 | @Override 31 | public List<FieldType> createAccumulator() { 32 | return Lists.newArrayList(); 33 | } 34 | 35 | @Override 36 | public List<FieldType> addInput(List<FieldType> accumulator, List<FieldType> input) { 37 | return SchemaUtil.merge(accumulator, input); 38 | } 39 | 40 | @Override 41 | public List<FieldType> mergeAccumulators(Iterable<List<FieldType>> accumulators) { 42 | return Streams.stream(accumulators).reduce(SchemaUtil::merge).get(); 43 | } 44 | 45 | @Override 46 | public FieldType[] extractOutput(List<FieldType> accumulator) { 47 | return accumulator.toArray(new FieldType[0]); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/CsvParseDataFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.univocity.parsers.csv.CsvFormat; 19 | import com.univocity.parsers.csv.CsvParser; 20 | import com.univocity.parsers.csv.CsvParserSettings; 21 | import java.io.ByteArrayInputStream; 22 | import org.apache.beam.sdk.transforms.DoFn; 23 | import org.apache.beam.sdk.values.KV; 24 | 25 | /** A {@link DoFn} which parses a chunk of data into CSV. */ 26 | public class CsvParseDataFn extends DoFn<KV<String, byte[]>, KV<String, String[]>> { 27 | // The maximum number of columns in a CSV row; increasing this number will cause the job to 28 | // use more memory. 29 | private static final int MAX_COLUMNS = 8192; 30 | 31 | private final CsvConfiguration config; 32 | private CsvParser parser; 33 | 34 | public CsvParseDataFn(CsvConfiguration config) { 35 | this.config = config; 36 | } 37 | 38 | @Setup 39 | public void setUp() { 40 | CsvFormat format = new CsvFormat(); 41 | format.setDelimiter(config.getDelimiter()); 42 | format.setQuote(config.getQuote()); 43 | format.setQuoteEscape(config.getQuote()); 44 | format.setLineSeparator(config.getRecordSeparator()); 45 | CsvParserSettings settings = new CsvParserSettings(); 46 | settings.setFormat(format); 47 | settings.setMaxColumns(MAX_COLUMNS); 48 | settings.setMaxCharsPerColumn(-1); 49 | settings.setNormalizeLineEndingsWithinQuotes(false); 50 | 51 | parser = new CsvParser(settings); 52 | } 53 | 54 | @ProcessElement 55 | public void parse(ProcessContext ctx) { 56 | KV<String, byte[]> input = ctx.element(); 57 | parser 58 | .parseAll(new ByteArrayInputStream(input.getValue())) 59 | .forEach(row -> ctx.output(KV.of(input.getKey(), row))); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/CsvSplitFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.cloud.healthcare.io.ByteReader; 19 | import com.google.common.collect.Sets; 20 | import java.io.IOException; 21 | import java.nio.channels.SeekableByteChannel; 22 | import java.util.Map; 23 | import java.util.Set; 24 | import javax.annotation.Nullable; 25 | import org.apache.beam.sdk.io.FileIO.ReadableFile; 26 | import org.apache.beam.sdk.transforms.DoFn; 27 | import org.apache.beam.sdk.values.KV; 28 | import org.apache.beam.sdk.values.PCollectionView; 29 | 30 | /** 31 | * Base class for splitting CSV files. 32 | */ 33 | public abstract class CsvSplitFn extends DoFn<ReadableFile, KV<String, Set<Long>>> { 34 | protected static int CHUNK_SIZE = 2 * 1024 * 1024; // 2MB. 35 | 36 | private PCollectionView<Map<String, String[]>> headersView; 37 | protected final CsvConfiguration config; 38 | protected String[] headers; 39 | 40 | public CsvSplitFn(CsvConfiguration config, 41 | PCollectionView<Map<String, String[]>> headersView) { 42 | this.config = config; 43 | this.headersView = headersView; 44 | } 45 | 46 | // TODO(b/122103201): Parallelize the split process. 47 | @ProcessElement 48 | public void split(ProcessContext ctx) throws IOException { 49 | ReadableFile file = ctx.element(); 50 | String name = file.getMetadata().resourceId().toString(); 51 | headers = ctx.sideInput(headersView).get(name); 52 | 53 | SeekableByteChannel ch = file.openSeekable(); 54 | 55 | long size = file.getMetadata().sizeBytes(); 56 | int splitPoints = (int) (size / CHUNK_SIZE); 57 | 58 | Set<Long> points = Sets.newHashSet(); 59 | seekStartPoint(ch); 60 | points.add(ch.position()); 61 | 62 | for (int i = 0; i < splitPoints; i++) { 63 | Long splitPoint = calcSplitPoint(ch, 64 | startSplitCheckPosition(i), 65 | Math.min(startSplitCheckPosition(i + 1), size)); 66 | if (splitPoint != null) { 67 | points.add(splitPoint); 68 | } 69 | } 70 | 71 | points.add(size); 72 | ctx.output(KV.of(name, points)); 73 | } 74 | 75 | // Calculates the split point in the range specified as parameters (start inclusive, end 76 | // exclusive). If no valid split point is found, null will be returned. 77 | @Nullable 78 | protected abstract Long calcSplitPoint(SeekableByteChannel ch, long start, long end) 79 | throws IOException; 80 | 81 | /** 82 | * Positions the channel at the first byte of the second row, since we require the first row to be headers. 83 | */ 84 | private void seekStartPoint(SeekableByteChannel ch) throws IOException { 85 | if (ch.position() != 0L) { 86 | throw new IllegalStateException("Position of the cursor has to be at the beginning of file"); 87 | } 88 | new ByteReader(ch).readLine(); 89 | } 90 | 91 | /** Calculates the starting search position for each split point. */ 92 | private long startSplitCheckPosition(int i) { 93 | return (i + 1) * (long) CHUNK_SIZE; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/GcsSplitCsvFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.cloud.healthcare.io.ByteReader; 19 | import com.google.common.base.Strings; 20 | import com.google.common.collect.Lists; 21 | import java.io.IOException; 22 | import java.nio.channels.SeekableByteChannel; 23 | import java.util.List; 24 | import java.util.Map; 25 | import javax.annotation.Nullable; 26 | import org.apache.beam.sdk.transforms.DoFn; 27 | import org.apache.beam.sdk.values.PCollectionView; 28 | 29 | /** 30 | * An Apache Beam {@link DoFn} which splits a large file into smaller chunks. This is done by first 31 | * choosing a few positions to start the splitting process, then looking for valid split points in 32 | * each block. The rule for determining a valid split point is: 33 | * 34 | * 1. If we know for sure based on the categorization of quotes (see {@link #detectQuotes(byte[])} 35 | * and {@link #splitOffset(List)}), we record it; 36 | * 2. Otherwise, e.g. when there is no quote on a line, we count the number of columns across each 37 | * pair of adjacent lines; if the total exceeds the header count plus one, we record it. 38 | * 39 | * Note that in rare cases a valid split point cannot be found in the block, in which case we will 40 | * not split. 41 | */ 42 | public class GcsSplitCsvFn extends CsvSplitFn { 43 | 44 | public GcsSplitCsvFn(CsvConfiguration config, 45 | PCollectionView<Map<String, String[]>> headersView) { 46 | super(config, headersView); 47 | } 48 | 49 | @Nullable 50 | @Override 51 | protected Long calcSplitPoint(SeekableByteChannel ch, long start, long end) throws IOException { 52 | ch.position(start); 53 | 54 | ByteReader reader = new ByteReader(ch); 55 | // Discard the first line since the start byte likely lies in the middle of a complete line. 56 | reader.readLine(); 57 | 58 | long currPos = ch.position(); 59 | String prevLine = null; 60 | while (currPos < end) { 61 | byte[] line = reader.readLine(); 62 | List<QuoteType> quotes = detectQuotes(line); 63 | // No quotes in this line. 64 | if (quotes.isEmpty()) { 65 | String currLine = new String(line); 66 | if (!Strings.isNullOrEmpty(prevLine) && validSplitPoint(prevLine, currLine)) { 67 | // The fields in this and the previous line determine a valid split point. 68 | return currPos; 69 | } else { 70 | prevLine = currLine; 71 | } 72 | } else { 73 | // Set the prevLine mark as null since we have quotes. 74 | prevLine = null; 75 | } 76 | 77 | Offset offset = splitOffset(quotes); 78 | // If the first quote in the line is open. 79 | if (offset == Offset.START) { 80 | return currPos; 81 | } else if (offset == Offset.END) { // If the last quote in the line is closed. 82 | return ch.position(); 83 | } 84 | currPos = ch.position(); 85 | } 86 | 87 | return null; 88 | } 89 | 90 | /** 91 | * Checks whether the point between two lines is a valid split point. This is only done if neither 92 | * line has quote characters.
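 * <p>Sketch: with 2 headers, adjacent lines {@code "a,b"} and {@code "c,d"} contain
 * 2 + 2 = 4 fields, which exceeds 2 + 1, so they cannot belong to one logical row
 * and the boundary between them is a safe split point.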
93 | */ 94 | private boolean validSplitPoint(String prevLine, String currLine) { 95 | String delimiter = String.valueOf(config.getDelimiter()); 96 | return prevLine.split(delimiter).length + currLine.split(delimiter).length 97 | > headers.length + 1; 98 | } 99 | 100 | /** 101 | * Detects the type of quotes in a line. 102 | * 103 | * The rules are: 104 | * - Any quote that is directly preceded or directly followed by a quote is potentially an escaped 105 | * quote; 106 | * - Any quote that is directly preceded by a separator character, and not directly followed by a 107 | * separator character is an open quote; 108 | * - Any quote that is not directly preceded by a separator character, but that is directly 109 | * followed by a separator character is a close quote; 110 | * - All rest is unknown. 111 | * 112 | * Example: 113 | * 1999,Chevy,"Venture ""Extended Edition""" 114 | * 1997,Ford,E350,"Super, luxurious truck" 115 | * 116 | * The first quote on first line is an opening quote since it follows a delimiter (,) and not 117 | * followed by another separator. The second and third quotes are of type unknown, since they are 118 | * escaping and escaped quotes. Next three are the same (Note the last quote should have been a 119 | * closing quote, but we are unable to detect it). 120 | * 121 | * On the second line, the second quote is a closing quote because it doesn't follow another quote 122 | * or a separator, but is followed by a separator (new line or EOF). 123 | */ 124 | private List detectQuotes(byte[] bytes) { 125 | char quote = CsvConfiguration.getInstance().getQuote(); 126 | 127 | List quotes = Lists.newArrayList(); 128 | for (int i = 0; i < bytes.length; i++) { 129 | if (bytes[i] != quote) { 130 | continue; 131 | } 132 | 133 | byte prev; 134 | if (i == 0) { 135 | // For first byte, treat the previous byte as separator. 136 | prev = '\n'; 137 | } else { 138 | prev = bytes[i - 1]; 139 | } 140 | 141 | byte next; 142 | if (i == bytes.length - 1) { 143 | // For last byte, treat the next byte as separator. 144 | next = '\n'; 145 | } else { 146 | next = bytes[i + 1]; 147 | } 148 | 149 | if (prev == quote || next == quote) { 150 | quotes.add(QuoteType.UNKNOWN); 151 | } else if (isSeparator(prev) && !isSeparator(next)) { 152 | quotes.add(QuoteType.OPEN); 153 | } else if (!isSeparator(prev) && isSeparator(next)) { 154 | quotes.add(QuoteType.CLOSED); 155 | } else { 156 | quotes.add(QuoteType.UNKNOWN); 157 | } 158 | } 159 | 160 | return quotes; 161 | } 162 | 163 | /** 164 | * Determines what offset relative to current line is a split point. This is based on the fact 165 | * that if the last quote on a line is a close quote, the end of this line should be valid split 166 | * point, for same reason, if the first quote is an open quote, the start of this line should be 167 | * a valid split point. In all rest cases, we are not sure if the start or end of line is valid. 168 | */ 169 | private Offset splitOffset(List quotes) { 170 | int size = quotes.size(); 171 | 172 | if (size == 0) { 173 | return Offset.UNKNOWN; 174 | } 175 | 176 | if (quotes.get(0) == QuoteType.OPEN) { 177 | return Offset.START; 178 | } 179 | 180 | if (quotes.get(size - 1) == QuoteType.CLOSED) { 181 | return Offset.END; 182 | } 183 | 184 | return Offset.UNKNOWN; 185 | } 186 | 187 | // TODO(b/123358409): handle multiple bytes as delimiter. 
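// Note the single-byte assumption: a hypothetical multi-byte delimiter such as
// "||" would match each '|' independently here, hence the TODO above.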
188 | private boolean isSeparator(byte b) { 189 | return b == config.getDelimiter() 190 | || b == ByteReader.NEWLINE_FEED 191 | || b == ByteReader.CARRIAGE_RETURN; 192 | } 193 | 194 | private enum Offset { 195 | START, 196 | END, 197 | UNKNOWN; 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/QuoteType.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | /** 18 | * The type for quote characters (not necessarily literal quotes). 19 | */ 20 | public enum QuoteType { 21 | OPEN, // Marks the start of a quoted field in CSV. 22 | CLOSED, // Marks the end of a quoted field in CSV. 23 | UNKNOWN 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/advance/CsvParseDataAdvanceFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv.advance; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import java.util.Arrays; 19 | import java.util.regex.Pattern; 20 | import org.apache.beam.sdk.transforms.DoFn; 21 | import org.apache.beam.sdk.values.KV; 22 | 23 | /** 24 | * Parses a chunk of data with the user-provided regular expressions. Only the default JVM 25 | * encoding (UTF-8) is supported. 26 | */ 27 | public class CsvParseDataAdvanceFn extends DoFn<KV<String, byte[]>, KV<String, String[]>> { 28 | private final CsvConfiguration config; 29 | 30 | public CsvParseDataAdvanceFn(CsvConfiguration config) { 31 | this.config = config; 32 | } 33 | 34 | @ProcessElement 35 | public void parse(ProcessContext ctx) { 36 | Pattern recordSplitPattern = config.getRecordSeparatorRegex(); 37 | Pattern fieldSplitPattern = config.getDelimiterRegex(); 38 | 39 | KV<String, byte[]> input = ctx.element(); 40 | String name = input.getKey(); 41 | byte[] bytes = input.getValue(); 42 | 43 | // TODO(b/123357928): Support other encodings.
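// new String(bytes) decodes with the JVM default charset; an explicit charset,
// e.g. new String(bytes, StandardCharsets.UTF_8), would pin down the UTF-8
// assumption stated in the class Javadoc (a sketch, not wired in here).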
44 | String content = new String(bytes); 45 | String[] records = recordSplitPattern.split(content); 46 | Arrays.stream(records) 47 | .map(fieldSplitPattern::split).forEach(fields -> ctx.output(KV.of(name, fields))); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/pipeline/csv/advance/GcsSplitCsvAdvanceFn.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv.advance; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.cloud.healthcare.io.ByteReader; 19 | import com.google.cloud.healthcare.process.pipeline.csv.CsvSplitFn; 20 | import java.io.IOException; 21 | import java.nio.channels.SeekableByteChannel; 22 | import java.util.Map; 23 | import java.util.regex.Matcher; 24 | import java.util.regex.Pattern; 25 | import javax.annotation.Nullable; 26 | import org.apache.beam.sdk.values.PCollectionView; 27 | 28 | /** 29 | * Splits a non-standard CSV file into chunks. Users of the pipeline need to provide two 30 | * regular expressions, for the record separator and the field delimiter respectively. 31 | */ 32 | public class GcsSplitCsvAdvanceFn extends CsvSplitFn { 33 | 34 | public GcsSplitCsvAdvanceFn(CsvConfiguration config, 35 | PCollectionView<Map<String, String[]>> headersView) { 36 | super(config, headersView); 37 | } 38 | 39 | @Nullable 40 | @Override 41 | protected Long calcSplitPoint(SeekableByteChannel ch, long start, long end) throws IOException { 42 | Pattern pattern = config.getRecordSeparatorRegex(); 43 | ch.position(start); 44 | ByteReader reader = new ByteReader(ch); 45 | 46 | byte[] bytes = reader.readUntil(end); 47 | // TODO(b/123357928): Support encodings other than UTF-8. 48 | String content = new String(bytes); 49 | Matcher matcher = pattern.matcher(content); 50 | if (matcher.find()) { 51 | return start + content.substring(0, matcher.end()).getBytes().length; 52 | } 53 | 54 | return null; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/schema/FieldType.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.schema; 16 | 17 | /** 18 | * Definition of possible types. Note that FLOAT is not listed here because we always try to parse 19 | * to DOUBLE for better precision. 20 | */ 21 | public enum FieldType { 22 | INT, 23 | LONG, 24 | DOUBLE, 25 | BOOLEAN, 26 | DATE, 27 | TIME, 28 | DATETIME, 29 | STRING, 30 | UNKNOWN; 31 | 32 | // TODO(b/123357900): Use bitwise for calculation. 33 | /** Calculates the common supertype of two types. */ 34 | static FieldType getCommonType(FieldType s, FieldType t) { 35 | if (s == t) { 36 | return s; 37 | } 38 | 39 | if (s == UNKNOWN) { 40 | return t; 41 | } 42 | if (t == UNKNOWN) { 43 | return s; 44 | } 45 | 46 | if ((s == DATETIME && t == DATE) 47 | || (s == DATE && t == DATETIME) 48 | || (s == TIME && t == DATETIME) 49 | || (s == DATETIME && t == TIME)) { 50 | return DATETIME; 51 | } 52 | 53 | if ((s == DOUBLE && (t == LONG || t == INT)) 54 | || (t == DOUBLE && (s == LONG || s == INT))) { 55 | return DOUBLE; 56 | } 57 | if ((s == LONG && t == INT) || (t == LONG && s == INT)) { 58 | return LONG; 59 | } 60 | 61 | return STRING; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/schema/GcpUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.schema; 16 | 17 | import com.google.api.services.bigquery.model.TableReference; 18 | import com.google.auth.oauth2.GoogleCredentials; 19 | import com.google.cloud.bigquery.BigQuery; 20 | import com.google.cloud.bigquery.BigQueryOptions; 21 | import com.google.cloud.healthcare.io.GcsInputReader; 22 | import com.google.cloud.healthcare.util.StringUtil; 23 | import com.google.cloud.storage.Storage; 24 | import com.google.cloud.storage.StorageOptions; 25 | import java.nio.channels.ReadableByteChannel; 26 | import javax.annotation.Nullable; 27 | 28 | /** Utility methods for GCP related operations. */ 29 | public class GcpUtil { 30 | 31 | /** Instantiates a GCS client. If the credentials are null, then the default one is used. */ 32 | public static Storage getGcsClient(@Nullable GoogleCredentials credentials) { 33 | if (credentials != null) { 34 | return StorageOptions.newBuilder().setCredentials(credentials).build().getService(); 35 | } else { 36 | return StorageOptions.getDefaultInstance().getService(); 37 | } 38 | } 39 | 40 | /** Instantiates a BigQuery client. If the credentials are null, then the default one is used.
-------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/schema/GcpUtil.java: --------------------------------------------------------------------------------
1 | // Copyright 2019 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | package com.google.cloud.healthcare.process.schema;
16 | 
17 | import com.google.api.services.bigquery.model.TableReference;
18 | import com.google.auth.oauth2.GoogleCredentials;
19 | import com.google.cloud.bigquery.BigQuery;
20 | import com.google.cloud.bigquery.BigQueryOptions;
21 | import com.google.cloud.healthcare.io.GcsInputReader;
22 | import com.google.cloud.healthcare.util.StringUtil;
23 | import com.google.cloud.storage.Storage;
24 | import com.google.cloud.storage.StorageOptions;
25 | import java.nio.channels.ReadableByteChannel;
26 | import javax.annotation.Nullable;
27 | 
28 | /** Utility methods for GCP related operations. */
29 | public class GcpUtil {
30 | 
31 | /** Instantiates a GCS client. If the credentials are null, the default credentials are used. */
32 | public static Storage getGcsClient(@Nullable GoogleCredentials credentials) {
33 | if (credentials != null) {
34 | return StorageOptions.newBuilder().setCredentials(credentials).build().getService();
35 | } else {
36 | return StorageOptions.getDefaultInstance().getService();
37 | }
38 | }
39 | 
40 | /** Instantiates a BigQuery client. If the credentials are null, the default credentials are used. */
41 | public static BigQuery getBqClient(@Nullable GoogleCredentials credentials) {
42 | if (credentials != null) {
43 | return BigQueryOptions.newBuilder().setCredentials(credentials).build().getService();
44 | } else {
45 | return BigQueryOptions.getDefaultInstance().getService();
46 | }
47 | }
48 | 
49 | /** Opens a file on GCS. If the credentials are null, the default credentials are used. */
50 | public static ReadableByteChannel openGcsFile(
51 | @Nullable GoogleCredentials credentials, String uri) {
52 | return new GcsInputReader(credentials, uri).getReadChannel();
53 | }
54 | 
55 | /** Builds a {@link TableReference} containing metadata of a BigQuery table. */
56 | public static TableReference getBigQueryTableReference(
57 | String projectId, String datasetId, String uri) {
58 | // TODO(b/134162118): Check and fix BigQuery table ID if it is invalid.
59 | String tableId = StringUtil.getGcsBaseName(uri);
60 | return new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId);
61 | }
62 | }
63 | 
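A short sketch of how these helpers compose; the project ID, dataset, and gs:// URI below are placeholders:

```java
import com.google.api.services.bigquery.model.TableReference;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.healthcare.process.schema.GcpUtil;
import com.google.cloud.storage.Storage;

public class GcpUtilDemo {
  public static void main(String[] args) {
    // Passing null falls back to the application default credentials.
    Storage gcs = GcpUtil.getGcsClient(null);
    BigQuery bq = GcpUtil.getBqClient(null);

    // The table ID is derived from the base name of the source file, so
    // gs://my-bucket/data/patients.csv targets a table named "patients".
    TableReference ref = GcpUtil.getBigQueryTableReference(
        "my-project", "my_dataset", "gs://my-bucket/data/patients.csv");
    System.out.println(ref.getTableId()); // prints "patients"
  }
}
```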
-------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/process/schema/SchemaUtil.java: --------------------------------------------------------------------------------
1 | // Copyright 2019 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | package com.google.cloud.healthcare.process.schema;
16 | 
17 | import com.google.api.services.bigquery.model.TableFieldSchema;
18 | import com.google.api.services.bigquery.model.TableSchema;
19 | import com.google.cloud.healthcare.util.PrettyPrinter;
20 | import com.google.common.base.Preconditions;
21 | import com.google.common.base.Strings;
22 | import com.google.common.collect.Streams;
23 | import java.time.LocalDate;
24 | import java.time.LocalDateTime;
25 | import java.time.LocalTime;
26 | import java.time.OffsetDateTime;
27 | import java.time.ZonedDateTime;
28 | import java.time.format.DateTimeFormatter;
29 | import java.time.format.DateTimeParseException;
30 | import java.util.ArrayList;
31 | import java.util.Arrays;
32 | import java.util.List;
33 | import java.util.stream.Collectors;
34 | import java.util.stream.IntStream;
35 | 
36 | /** Utility class for handling schemas. */
37 | public class SchemaUtil {
38 | 
39 | /**
40 | * Infers the schema from a list of values. The typical input is a row in a CSV file.
41 | * 
42 | * @param input the values to infer schema from
43 | * @return inferred schema for a record
44 | */
45 | public static List<FieldType> infer(List<String> input) {
46 | return input.stream().map(SchemaUtil::infer).collect(Collectors.toList());
47 | }
48 | 
49 | private static FieldType infer(String value) {
50 | // Note that the order of conversion attempts matters here.
51 | if (Strings.isNullOrEmpty(value)) {
52 | return FieldType.UNKNOWN;
53 | }
54 | 
55 | if (convertToBoolean(value)) {
56 | return FieldType.BOOLEAN;
57 | }
58 | 
59 | if (convertToInteger(value) != null) {
60 | return FieldType.INT;
61 | }
62 | 
63 | if (convertToLong(value) != null) {
64 | return FieldType.LONG;
65 | }
66 | 
67 | if (convertToDouble(value) != null) {
68 | return FieldType.DOUBLE;
69 | }
70 | 
71 | if (convertToDate(value) != null) {
72 | return FieldType.DATE;
73 | }
74 | 
75 | if (convertToTime(value) != null) {
76 | return FieldType.TIME;
77 | }
78 | 
79 | if (convertToDateTime(value) != null) {
80 | return FieldType.DATETIME;
81 | }
82 | 
83 | return FieldType.STRING;
84 | }
85 | 
86 | public static TableSchema generateBigQueryTableSchema(String[] headers, FieldType[] types) {
87 | // TODO(b/121042931): We should support ignoring up to some number of bad rows.
88 | if (headers.length != types.length) {
89 | throw new IllegalArgumentException(
90 | String.format(
91 | "Encountered invalid input:\nheaders: %s\ntypes: %s",
92 | PrettyPrinter.print(Arrays.asList(headers)),
93 | PrettyPrinter.print(Arrays.asList(types))));
94 | }
95 | List<TableFieldSchema> fields = new ArrayList<>();
96 | for (int i = 0; i < types.length; i++) {
97 | TableFieldSchema field = new TableFieldSchema().setName(headers[i]).setMode("NULLABLE");
98 | switch (types[i]) {
99 | case BOOLEAN:
100 | field.setType("BOOL");
101 | break;
102 | case INT:
103 | case LONG:
104 | field.setType("INT64");
105 | break;
106 | case DOUBLE:
107 | field.setType("FLOAT64");
108 | break;
109 | case DATE:
110 | field.setType("DATE");
111 | break;
112 | case TIME:
113 | field.setType("TIME");
114 | break;
115 | case DATETIME:
116 | field.setType("DATETIME");
117 | break;
118 | default:
119 | field.setType("STRING");
120 | break;
121 | }
122 | fields.add(field);
123 | }
124 | return new TableSchema().setFields(fields);
125 | }
126 | 
127 | /**
128 | * Merges a series of AVRO schemas. The rule is simple: we choose the most generic type, e.g.
129 | * between int and string we choose string.
130 | * 
131 | * @param types a list of schemas to merge
132 | * @return the merged schema
133 | */
134 | public static List<FieldType> merge(Iterable<List<FieldType>> types) {
135 | // The result must always be present, so we skip the isPresent() check.
136 | return Streams.stream(types).reduce(SchemaUtil::merge).get();
137 | }
138 | 
139 | public static List<FieldType> merge(List<FieldType> s, List<FieldType> t) {
140 | if (s.isEmpty()) {
141 | return t;
142 | }
143 | if (t.isEmpty()) {
144 | return s;
145 | }
146 | Preconditions.checkArgument(
147 | s.size() == t.size(), "Number of fields in both schemas should match.");
148 | return IntStream.range(0, s.size())
149 | .mapToObj(i -> FieldType.getCommonType(s.get(i), t.get(i)))
150 | .collect(Collectors.toList());
151 | }
152 | 
153 | public static Integer convertToInteger(String value) {
154 | try {
155 | return Integer.parseInt(value);
156 | } catch (NumberFormatException e) {
157 | return null;
158 | }
159 | }
160 | 
161 | public static Long convertToLong(String value) {
162 | try {
163 | return Long.parseLong(value);
164 | } catch (NumberFormatException e) {
165 | return null;
166 | }
167 | }
168 | 
169 | public static Double convertToDouble(String value) {
170 | try {
171 | return Double.parseDouble(value);
172 | } catch (NumberFormatException e) {
173 | return null;
174 | }
175 | }
176 | 
177 | /**
178 | * Boolean is different because {@link Boolean#parseBoolean(String)} always returns true or false.
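* <p>For instance, "True", "no", "Y" and "f" are all recognized as boolean literals by the
* checks below, while "1" and "0" are not.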
179 | */ 180 | private static boolean convertToBoolean(String value) { 181 | return "True".equalsIgnoreCase(value) 182 | || "False".equalsIgnoreCase(value) 183 | || "Yes".equalsIgnoreCase(value) 184 | || "No".equalsIgnoreCase(value) 185 | || "Y".equalsIgnoreCase(value) 186 | || "N".equalsIgnoreCase(value) 187 | || "T".equalsIgnoreCase(value) 188 | || "F".equalsIgnoreCase(value); 189 | } 190 | 191 | public static boolean isTrue(String value) { 192 | return "True".equalsIgnoreCase(value) 193 | || "Yes".equalsIgnoreCase(value) 194 | || "Y".equalsIgnoreCase(value) 195 | || "T".equalsIgnoreCase(value); 196 | } 197 | 198 | public static String convertToDate(String value) { 199 | try { 200 | return LocalDate.parse(value, DateTimeFormatter.ISO_DATE) 201 | .format(DateTimeFormatter.ISO_LOCAL_DATE); 202 | } catch (DateTimeParseException e) { 203 | return null; 204 | } 205 | } 206 | 207 | public static String convertToTime(String value) { 208 | try { 209 | return LocalTime.parse(value, DateTimeFormatter.ISO_TIME) 210 | .format(DateTimeFormatter.ISO_LOCAL_TIME); 211 | } catch (DateTimeParseException e) { 212 | return null; 213 | } 214 | } 215 | 216 | // TODO(b/121042936): Support customized format. 217 | public static String convertToDateTime(String value) { 218 | LocalDateTime localDateTime = convertToLocalDateTime(value); 219 | if (localDateTime != null) { 220 | return localDateTime.format(DateTimeFormatter.ISO_LOCAL_DATE_TIME); 221 | } 222 | 223 | OffsetDateTime offsetDateTime = convertToOffsetDateTime(value); 224 | if (offsetDateTime != null) { 225 | return offsetDateTime.format(DateTimeFormatter.ISO_LOCAL_DATE_TIME); 226 | } 227 | 228 | ZonedDateTime zonedDateTime = convertToZonedDateTime(value); 229 | if (zonedDateTime != null) { 230 | return zonedDateTime.format(DateTimeFormatter.ISO_LOCAL_DATE_TIME); 231 | } 232 | 233 | return null; 234 | } 235 | 236 | private static LocalDateTime convertToLocalDateTime(String value) { 237 | try { 238 | return LocalDateTime.parse(value, DateTimeFormatter.ISO_LOCAL_DATE_TIME); 239 | } catch (DateTimeParseException e) { 240 | return null; 241 | } 242 | } 243 | 244 | private static OffsetDateTime convertToOffsetDateTime(String value) { 245 | try { 246 | return OffsetDateTime.parse(value, DateTimeFormatter.ISO_OFFSET_DATE_TIME); 247 | } catch (DateTimeParseException e) { 248 | return null; 249 | } 250 | } 251 | 252 | private static ZonedDateTime convertToZonedDateTime(String value) { 253 | try { 254 | return ZonedDateTime.parse(value, DateTimeFormatter.ISO_ZONED_DATE_TIME); 255 | } catch (DateTimeParseException e) { 256 | return null; 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/util/PrettyPrinter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.healthcare.util; 16 | 17 | import com.google.common.base.Joiner; 18 | 19 | /** 20 | * Utility class that helps print objects nicely. 21 | */ 22 | public class PrettyPrinter { 23 | 24 | /** Concatenates elements from an {@link Iterable} with comma for output. */ 25 | public static String print(Iterable list) { 26 | return Joiner.on(", ").join(list); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/healthcare/util/StringUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.util; 16 | 17 | import com.google.common.base.Preconditions; 18 | import com.google.common.base.Strings; 19 | import org.apache.commons.io.FilenameUtils; 20 | 21 | /** Utility class for processing strings. */ 22 | public class StringUtil { 23 | 24 | private static final String GCS_URI_PREFIX = "gs://"; 25 | 26 | public static String[] splitGcsUri(String gcsUri) { 27 | Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsUri), 28 | "gcsUri cannot be null or empty."); 29 | Preconditions.checkArgument(gcsUri.startsWith(GCS_URI_PREFIX), 30 | "gcsUri has to start with gs://"); 31 | String trimmedUri = gcsUri.replaceFirst(GCS_URI_PREFIX, ""); 32 | String[] parts = trimmedUri.split("/", 2); 33 | if (parts.length != 2) { 34 | throw new IllegalArgumentException("Invalid GCS URI, should contain both bucket and path."); 35 | } 36 | return parts; 37 | } 38 | 39 | public static String getGcsBaseName(String gcsUri) { 40 | String[] parts = splitGcsUri(gcsUri); 41 | return getGcsBaseNameByPath(parts[1]); 42 | } 43 | 44 | private static String getGcsBaseNameByPath(String path) { 45 | return FilenameUtils.getBaseName(path); 46 | } 47 | 48 | public static String getGcsDecompressUri(String bucket) { 49 | return String.format("%s/decompress", getGcsTempDir(bucket)); 50 | } 51 | 52 | public static String getGcsTempDir(String bucket) { 53 | Preconditions.checkArgument(!Strings.isNullOrEmpty(bucket), "Bucket cannot be null or empty."); 54 | return String.format("%s%s/temp", GCS_URI_PREFIX, bucket); 55 | } 56 | 57 | public static String generateGcsUri(String bucket, String path) { 58 | Preconditions.checkArgument(!Strings.isNullOrEmpty(bucket), 59 | "Bucket cannot be null or empty"); 60 | Preconditions.checkArgument(!Strings.isNullOrEmpty(path), 61 | "Path cannot be null or empty"); 62 | return String.format("%s%s/%s", GCS_URI_PREFIX, bucket, path); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/config/CsvConfigurationTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you 
may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.config; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | import static org.junit.Assert.assertTrue; 19 | 20 | import org.junit.Test; 21 | 22 | public class CsvConfigurationTest { 23 | 24 | @Test(expected = IllegalArgumentException.class) 25 | public void build_noQuote_throwException() { 26 | CsvConfiguration.getInstance().withQuote(null); 27 | } 28 | 29 | @Test 30 | public void build_fieldsMatch() { 31 | Character delimiter = ','; 32 | String[] headers = new String[] {"name"}; 33 | Character quote = '"'; 34 | String recordSeparator = "\r\n"; 35 | boolean ignoreSurroundingSpaces = true; 36 | CsvConfiguration conf = CsvConfiguration.getInstance() 37 | .withDelimiter(delimiter) 38 | .withQuote(quote) 39 | .withRecordSeparator(recordSeparator) 40 | .withIgnoreSurroundingSpaces(ignoreSurroundingSpaces); 41 | 42 | assertEquals("Fields should match original values.", delimiter, conf.getDelimiter()); 43 | assertEquals("Fields should match original values.", quote, conf.getQuote()); 44 | assertEquals("Fields should match original values.", recordSeparator, conf.getRecordSeparator()); 45 | assertTrue("Fields should match original values.", ignoreSurroundingSpaces); 46 | } 47 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/decompress/DecompressorTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | import static org.mockito.Matchers.anyString; 19 | import static org.mockito.Mockito.when; 20 | import static org.mockito.MockitoAnnotations.initMocks; 21 | 22 | import com.google.cloud.healthcare.io.GcsOutputWriterFactory; 23 | import com.google.cloud.healthcare.io.InputReader; 24 | import com.google.cloud.healthcare.io.OutputWriter; 25 | import com.google.common.collect.Lists; 26 | import java.io.ByteArrayInputStream; 27 | import java.io.ByteArrayOutputStream; 28 | import java.io.IOException; 29 | import java.nio.channels.Channels; 30 | import java.util.List; 31 | import java.util.zip.GZIPOutputStream; 32 | import java.util.zip.ZipEntry; 33 | import java.util.zip.ZipOutputStream; 34 | import org.apache.commons.compress.compressors.lz4.FramedLZ4CompressorOutputStream; 35 | import org.junit.After; 36 | import org.junit.Before; 37 | import org.junit.Test; 38 | import org.mockito.Mock; 39 | 40 | /** Test for decompressor. */ 41 | public class DecompressorTest { 42 | 43 | private static final String TEXT = "Google Cloud Platform, offered by Google, is a suite of " 44 | + "cloud computing services that runs on the same infrastructure that Google uses internally " 45 | + "for its end-user products, such as Google Search and YouTube."; 46 | 47 | private static final String TXT_MIME_TYPE = "application/txt"; 48 | private static final String RAR_MIME_TYPE = 49 | CompressionAlgorithm.RAR.getMimeTypes().iterator().next(); 50 | private static final String GCS_NAME = "gs://bucket/path/to/file/compressed.file"; 51 | private static final String DECOMPRESSED_NAME = "gs://bucket/path/to/file/decompressed.file"; 52 | 53 | @Mock 54 | private GcsOutputWriterFactory factory; 55 | 56 | @Mock 57 | private InputReader reader; 58 | 59 | @Mock 60 | private OutputWriter writer; 61 | 62 | private ByteArrayOutputStream outputStream; 63 | 64 | private byte[] gzipCompressedData; 65 | private byte[] zipCompressedData; 66 | private byte[] lz4CompressedData; 67 | 68 | @Before 69 | public void setUp() throws Exception { 70 | initMocks(this); 71 | 72 | outputStream = new ByteArrayOutputStream(); 73 | 74 | ByteArrayOutputStream os = new ByteArrayOutputStream(); 75 | GZIPOutputStream gos = new GZIPOutputStream(os); 76 | gos.write(TEXT.getBytes()); 77 | gos.flush(); 78 | gos.close(); 79 | 80 | gzipCompressedData = os.toByteArray(); 81 | 82 | os = new ByteArrayOutputStream(); 83 | ZipOutputStream zos = new ZipOutputStream(os); 84 | zos.putNextEntry(new ZipEntry("test.txt")); 85 | zos.write(TEXT.getBytes()); 86 | zos.closeEntry(); 87 | zos.flush(); 88 | zos.close(); 89 | 90 | zipCompressedData = os.toByteArray(); 91 | 92 | os = new ByteArrayOutputStream(); 93 | 94 | FramedLZ4CompressorOutputStream flz4os = new FramedLZ4CompressorOutputStream(os); 95 | flz4os.write(TEXT.getBytes()); 96 | flz4os.flush(); 97 | flz4os.close(); 98 | 99 | lz4CompressedData = os.toByteArray(); 100 | } 101 | 102 | @After 103 | public void tearDown() throws Exception { 104 | outputStream.close(); 105 | } 106 | 107 | @Test 108 | public void decompressGZip_contentMatch() throws IOException { 109 | when(reader.getContentType()).thenReturn( 110 | CompressionAlgorithm.GZIP.getMimeTypes().iterator().next()); 111 | when(reader.getReadChannel()).thenReturn( 112 | Channels.newChannel(new ByteArrayInputStream(gzipCompressedData))); 113 | when(reader.getName()).thenReturn(GCS_NAME); 114 | 
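// The factory and writer mocks below capture the decompressed bytes in outputStream,
// so the assertions can compare them against the original TEXT.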
when(factory.getOutputWriter(anyString())).thenReturn(writer);
115 | when(writer.getWriteChannel()).thenReturn(Channels.newChannel(outputStream));
116 | when(writer.getName()).thenReturn(DECOMPRESSED_NAME);
117 | 
118 | List<String> files = new Decompressor(factory).decompress(reader);
119 | assertEquals(
120 | "File paths should match.",
121 | Lists.newArrayList(DECOMPRESSED_NAME),
122 | files);
123 | assertEquals(
124 | "Decompressed content should match the original.",
125 | TEXT,
126 | new String(outputStream.toByteArray()));
127 | }
128 | 
129 | @Test
130 | public void decompressZip_contentMatch() throws IOException {
131 | when(reader.getContentType()).thenReturn(
132 | CompressionAlgorithm.ZIP.getMimeTypes().iterator().next());
133 | when(reader.getReadChannel()).thenReturn(
134 | Channels.newChannel(new ByteArrayInputStream(zipCompressedData)));
135 | when(reader.getName()).thenReturn(GCS_NAME);
136 | when(factory.getOutputWriter(anyString())).thenReturn(writer);
137 | when(writer.getWriteChannel()).thenReturn(Channels.newChannel(outputStream));
138 | when(writer.getName()).thenReturn(DECOMPRESSED_NAME);
139 | 
140 | List<String> files = new Decompressor(factory).decompress(reader);
141 | assertEquals(
142 | "File paths should match.",
143 | Lists.newArrayList(DECOMPRESSED_NAME),
144 | files);
145 | assertEquals(
146 | "Decompressed content should match the original.",
147 | TEXT,
148 | new String(outputStream.toByteArray()));
149 | }
150 | 
151 | @Test
152 | public void decompressLZ4_contentMatch() throws IOException {
153 | when(reader.getContentType()).thenReturn(
154 | CompressionAlgorithm.LZ4.getMimeTypes().iterator().next());
155 | when(reader.getReadChannel()).thenReturn(
156 | Channels.newChannel(new ByteArrayInputStream(lz4CompressedData)));
157 | when(reader.getName()).thenReturn(GCS_NAME);
158 | when(factory.getOutputWriter(anyString())).thenReturn(writer);
159 | when(writer.getWriteChannel()).thenReturn(Channels.newChannel(outputStream));
160 | when(writer.getName()).thenReturn(DECOMPRESSED_NAME);
161 | 
162 | List<String> files = new Decompressor(factory).decompress(reader);
163 | assertEquals(
164 | "File paths should match.",
165 | Lists.newArrayList(DECOMPRESSED_NAME),
166 | files);
167 | assertEquals(
168 | "Decompressed content should match the original.",
169 | TEXT,
170 | new String(outputStream.toByteArray()));
171 | }
172 | 
173 | @Test
174 | public void notCompressed_notDecompressed() throws IOException {
175 | when(reader.getContentType()).thenReturn(TXT_MIME_TYPE);
176 | when(reader.getName()).thenReturn(GCS_NAME);
177 | assertEquals(
178 | "An uncompressed file should be passed through with its original path.",
179 | Lists.newArrayList(GCS_NAME),
180 | new Decompressor(new GcsOutputWriterFactory(null, "bucket", "path")).decompress(reader));
181 | }
182 | 
183 | @Test(expected = RuntimeException.class)
184 | public void notSupportCompression_notDecompressed() throws IOException {
185 | when(reader.getContentType()).thenReturn(RAR_MIME_TYPE);
186 | new Decompressor(new GcsOutputWriterFactory(null, "bucket", "path")).decompress(reader);
187 | }
188 | }
-------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/decompress/LZ4HandlerTest.java: --------------------------------------------------------------------------------
1 | package com.google.cloud.healthcare.decompress;
2 | 
3 | import static org.junit.Assert.assertEquals;
4 | import static org.mockito.Mockito.mock;
5 | import static org.mockito.MockitoAnnotations.initMocks;
6 | import static
org.powermock.api.mockito.PowerMockito.when; 7 | 8 | import com.google.cloud.healthcare.io.InputReader; 9 | import com.google.cloud.healthcare.io.OutputWriter; 10 | import com.google.cloud.healthcare.io.OutputWriterFactory; 11 | import java.io.ByteArrayInputStream; 12 | import java.io.ByteArrayOutputStream; 13 | import java.io.IOException; 14 | import java.nio.channels.Channels; 15 | import org.apache.commons.compress.compressors.lz4.FramedLZ4CompressorOutputStream; 16 | import org.junit.Before; 17 | import org.junit.Test; 18 | import org.mockito.Mock; 19 | 20 | /** Test for LZ4Handler. */ 21 | public class LZ4HandlerTest { 22 | 23 | private static final String TEXT = "LZ4 is lossless compression algorithm, providing " 24 | + "compression speed > 500 MB/s per core, scalable with multi-cores CPU. It features " 25 | + "an extremely fast decoder, with speed in multiple GB/s per core, typically reaching RAM " 26 | + "speed limits on multi-core systems."; 27 | 28 | @Mock 29 | private OutputWriterFactory factory; 30 | 31 | @Mock 32 | private InputReader reader; 33 | 34 | private byte[] lz4CompressedData; 35 | 36 | @Before 37 | public void setUp() throws Exception { 38 | initMocks(this); 39 | 40 | ByteArrayOutputStream os = new ByteArrayOutputStream(); 41 | 42 | FramedLZ4CompressorOutputStream flz4os = new FramedLZ4CompressorOutputStream(os); 43 | flz4os.write(TEXT.getBytes()); 44 | flz4os.flush(); 45 | flz4os.close(); 46 | 47 | lz4CompressedData = os.toByteArray(); 48 | } 49 | 50 | @Test 51 | public void handleLZ4_contentMatch() throws IOException { 52 | when(reader.getReadChannel()).thenReturn( 53 | Channels.newChannel(new ByteArrayInputStream(lz4CompressedData))); 54 | when(reader.getName()).thenReturn("data"); 55 | 56 | ByteArrayOutputStream os = new ByteArrayOutputStream(); 57 | 58 | OutputWriter ow = mock(OutputWriter.class); 59 | when(ow.getWriteChannel()).thenReturn(Channels.newChannel(os)); 60 | when(factory.getOutputWriter("data")).thenReturn(ow); 61 | 62 | new LZ4Handler(factory).handle(reader); 63 | 64 | assertEquals("Content should match.", TEXT, new String(os.toByteArray())); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/decompress/TarHandlerTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
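// Note on the fixture below: entries ending in "/" are directories; TarHandler is
// expected to skip them, so only the four regular files show up in the returned paths.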
14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | import static org.mockito.Mockito.mock; 19 | import static org.mockito.Mockito.when; 20 | import static org.mockito.MockitoAnnotations.initMocks; 21 | 22 | import com.google.cloud.healthcare.io.InputReader; 23 | import com.google.cloud.healthcare.io.OutputWriter; 24 | import com.google.cloud.healthcare.io.OutputWriterFactory; 25 | import com.google.common.collect.Lists; 26 | import java.io.ByteArrayInputStream; 27 | import java.io.ByteArrayOutputStream; 28 | import java.io.IOException; 29 | import java.nio.channels.Channels; 30 | import java.util.List; 31 | import org.apache.commons.compress.archivers.tar.TarArchiveEntry; 32 | import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; 33 | import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; 34 | import org.junit.Before; 35 | import org.junit.Test; 36 | import org.mockito.Mock; 37 | 38 | /** Test for TarHandler. */ 39 | public class TarHandlerTest { 40 | 41 | private static final String[] CONTENT = new String[] { 42 | "first_file", "second_file", "folder/", "folder/first_file", "folder/second_file" 43 | }; 44 | 45 | @Mock 46 | private OutputWriterFactory factory; 47 | 48 | @Mock 49 | private InputReader reader; 50 | 51 | private byte[] archivedData; 52 | 53 | @Before 54 | public void setUp() throws IOException { 55 | initMocks(this); 56 | ByteArrayOutputStream os = new ByteArrayOutputStream(); 57 | TarArchiveOutputStream tos = new TarArchiveOutputStream(new GzipCompressorOutputStream(os)); 58 | for (int i = 0; i < CONTENT.length; i++) { 59 | TarArchiveEntry entry = new TarArchiveEntry(CONTENT[i]); 60 | entry.setSize(!CONTENT[i].endsWith("/") ? 
CONTENT[i].length() : 0); 61 | tos.putArchiveEntry(entry); 62 | if (!CONTENT[i].endsWith("/")) { 63 | tos.write(CONTENT[i].getBytes()); 64 | } 65 | tos.closeArchiveEntry(); 66 | } 67 | tos.flush(); 68 | tos.close(); 69 | archivedData = os.toByteArray(); 70 | } 71 | 72 | @Test 73 | public void handleTarEntries_contentMatch() throws IOException { 74 | when(reader.getReadChannel()).thenReturn( 75 | Channels.newChannel(new ByteArrayInputStream(archivedData))); 76 | 77 | ByteArrayOutputStream[] oss = new ByteArrayOutputStream[] { 78 | new ByteArrayOutputStream(), 79 | new ByteArrayOutputStream(), 80 | new ByteArrayOutputStream(), 81 | new ByteArrayOutputStream(), 82 | new ByteArrayOutputStream() 83 | }; 84 | 85 | for (int i = 0; i < CONTENT.length; i++) { 86 | OutputWriter ow = mock(OutputWriter.class); 87 | when(ow.getName()).thenReturn(CONTENT[i]); 88 | when(ow.getWriteChannel()).thenReturn(Channels.newChannel(oss[i])); 89 | when(factory.getOutputWriter(CONTENT[i])).thenReturn(ow); 90 | } 91 | 92 | List files = new TarHandler(factory).handle(reader); 93 | assertEquals("Paths should match", 94 | Lists.newArrayList("first_file", "second_file", "folder/first_file", "folder/second_file"), 95 | files); 96 | 97 | for (int i = 0; i < CONTENT.length; i++) { 98 | if (!CONTENT[i].endsWith("/")) { 99 | assertEquals("Individual entry content should match", 100 | CONTENT[i], new String(oss[i].toByteArray())); 101 | } 102 | } 103 | } 104 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/decompress/ZipHandlerTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.healthcare.decompress; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | import static org.mockito.Mockito.mock; 19 | import static org.mockito.Mockito.when; 20 | import static org.mockito.MockitoAnnotations.initMocks; 21 | 22 | import com.google.cloud.healthcare.io.InputReader; 23 | import com.google.cloud.healthcare.io.OutputWriter; 24 | import com.google.cloud.healthcare.io.OutputWriterFactory; 25 | import java.io.ByteArrayInputStream; 26 | import java.io.ByteArrayOutputStream; 27 | import java.io.IOException; 28 | import java.nio.channels.Channels; 29 | import java.nio.channels.WritableByteChannel; 30 | import java.util.zip.ZipEntry; 31 | import java.util.zip.ZipOutputStream; 32 | import org.junit.Before; 33 | import org.junit.Test; 34 | import org.mockito.Mock; 35 | 36 | public class ZipHandlerTest { 37 | 38 | private static final String[] CONTENT = new String[] { 39 | "first_entry", "second_entry", "third_entry" 40 | }; 41 | 42 | @Mock 43 | private OutputWriterFactory factory; 44 | 45 | @Mock 46 | private InputReader reader; 47 | 48 | private byte[] zipCompressedData; 49 | 50 | @Before 51 | public void setUp() throws Exception { 52 | initMocks(this); 53 | 54 | ByteArrayOutputStream os = new ByteArrayOutputStream(); 55 | ZipOutputStream zos = new ZipOutputStream(os); 56 | 57 | for (String entry : CONTENT) { 58 | zos.putNextEntry(new ZipEntry(entry)); 59 | zos.write(entry.getBytes()); 60 | } 61 | zos.closeEntry(); 62 | zos.flush(); 63 | zos.close(); 64 | 65 | zipCompressedData = os.toByteArray(); 66 | } 67 | 68 | @Test 69 | public void handleZipEntries_contentMatch() throws IOException { 70 | when(reader.getReadChannel()).thenReturn( 71 | Channels.newChannel(new ByteArrayInputStream(zipCompressedData))); 72 | 73 | ByteArrayOutputStream[] oss = new ByteArrayOutputStream[] { 74 | new ByteArrayOutputStream(), new ByteArrayOutputStream(), new ByteArrayOutputStream() 75 | }; 76 | 77 | WritableByteChannel[] wbc = new WritableByteChannel[] { 78 | Channels.newChannel(oss[0]), 79 | Channels.newChannel(oss[1]), 80 | Channels.newChannel(oss[2]) 81 | }; 82 | 83 | for (int i = 0; i < CONTENT.length; i++) { 84 | OutputWriter ow = mock(OutputWriter.class); 85 | when(ow.getWriteChannel()).thenReturn(wbc[i]); 86 | when(factory.getOutputWriter(CONTENT[i])).thenReturn(ow); 87 | } 88 | 89 | new ZipHandler(factory).handle(reader); 90 | 91 | for (int i = 0; i < CONTENT.length; i++) { 92 | assertEquals("Individual entry content should match", 93 | CONTENT[i], new String(oss[i].toByteArray())); 94 | } 95 | } 96 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/io/ByteReaderTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
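// The fixtures below cover every line-ending convention ByteReader.readLine() must
// handle: bare LF, bare CR, CRLF, a mix of all three, and runs of empty lines.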
14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import static org.junit.Assert.assertArrayEquals; 18 | import static org.junit.Assert.assertEquals; 19 | 20 | import java.io.IOException; 21 | import java.nio.channels.SeekableByteChannel; 22 | import org.apache.commons.compress.utils.SeekableInMemoryByteChannel; 23 | import org.junit.Test; 24 | 25 | /** Test for ByteReader */ 26 | public class ByteReaderTest { 27 | 28 | private static final String LINE_FEED_SEPARATED = "abc, cbd\ncab, abc\n, abc"; 29 | private static final String LINE_FEED_AT_FIRST = "\ncab, abc\n, abc\n"; 30 | private static final String CARRIAGE_RETURN_SEPARATED = "abc, cbd\r\rcab, abc\r"; 31 | private static final String CARRIAGE_RETURN_AND_LINE_FEED_SEPARATED = 32 | "\r\nabc,\r\ncbd\r\ncab, abc\r\n"; 33 | private static final String MIXED_SEPARATED = 34 | "\rabc,\ncbd\r\ncab, abc\r\n"; 35 | private static final String ALL_EMPTY = "\r\r\r\r\r"; 36 | 37 | @Test 38 | public void readUntil_endBeforeCurrPos_noneReturned() throws IOException { 39 | SeekableByteChannel channel = new SeekableInMemoryByteChannel(LINE_FEED_SEPARATED.getBytes()); 40 | channel.position(10); 41 | ByteReader reader = new ByteReader(channel); 42 | assertEquals("None was returned", 0, reader.readUntil(5).length); 43 | } 44 | 45 | @Test 46 | public void readUntil_endSameAsCurrPos_noneReturned() throws IOException { 47 | SeekableByteChannel channel = new SeekableInMemoryByteChannel(LINE_FEED_SEPARATED.getBytes()); 48 | channel.position(10); 49 | ByteReader reader = new ByteReader(channel); 50 | assertEquals("None was returned", 0, reader.readUntil(10).length); 51 | } 52 | 53 | @Test 54 | public void readUntil_expectedResult() throws IOException { 55 | SeekableByteChannel channel = new SeekableInMemoryByteChannel(LINE_FEED_SEPARATED.getBytes()); 56 | channel.position(3); 57 | ByteReader reader = new ByteReader(channel); 58 | assertArrayEquals("None was returned", ", cbd\nc".getBytes(), reader.readUntil(10)); 59 | } 60 | 61 | @Test 62 | public void readUntil_eof_expectedResult() throws IOException { 63 | SeekableByteChannel channel = new SeekableInMemoryByteChannel(LINE_FEED_SEPARATED.getBytes()); 64 | channel.position(3); 65 | ByteReader reader = new ByteReader(channel); 66 | assertArrayEquals("None was returned", ", cbd\ncab, abc\n, abc".getBytes(), 67 | reader.readUntil(100)); 68 | } 69 | 70 | @Test(expected = IOException.class) 71 | public void readLine_closedChannel_throwException() throws IOException { 72 | SeekableByteChannel channel = new SeekableInMemoryByteChannel(); 73 | channel.close(); 74 | ByteReader reader = new ByteReader(channel); 75 | reader.readLine(); 76 | } 77 | 78 | @Test 79 | public void readLine_returnExpectedContent() throws IOException { 80 | try (SeekableByteChannel channel = 81 | new SeekableInMemoryByteChannel(LINE_FEED_SEPARATED.getBytes())) { 82 | 83 | ByteReader reader = new ByteReader(channel); 84 | assertEquals("First line matches.", "abc, cbd", new String(reader.readLine())); 85 | assertEquals("Second line matches.", "cab, abc", new String(reader.readLine())); 86 | assertEquals("Last line matches.", ", abc", new String(reader.readLine())); 87 | } 88 | } 89 | 90 | @Test 91 | public void readLine_returnExpectedContent2() throws IOException { 92 | try (SeekableByteChannel channel = 93 | new SeekableInMemoryByteChannel(LINE_FEED_AT_FIRST.getBytes())) { 94 | 95 | ByteReader reader = new ByteReader(channel); 96 | assertEquals("First line matches.", "", new String(reader.readLine())); 97 | assertEquals("Second line 
matches.", "cab, abc", new String(reader.readLine())); 98 | assertEquals("Last line matches.", ", abc", new String(reader.readLine())); 99 | } 100 | } 101 | 102 | @Test 103 | public void readLine_returnExpectedContent3() throws IOException { 104 | try (SeekableByteChannel channel = 105 | new SeekableInMemoryByteChannel(CARRIAGE_RETURN_SEPARATED.getBytes())) { 106 | 107 | ByteReader reader = new ByteReader(channel); 108 | assertEquals("First line matches.", "abc, cbd", new String(reader.readLine())); 109 | assertEquals("Second line matches.", "", new String(reader.readLine())); 110 | assertEquals("Last line matches.", "cab, abc", new String(reader.readLine())); 111 | } 112 | } 113 | 114 | @Test 115 | public void readLine_returnExpectedContent4() throws IOException { 116 | try (SeekableByteChannel channel = 117 | new SeekableInMemoryByteChannel(CARRIAGE_RETURN_AND_LINE_FEED_SEPARATED.getBytes())) { 118 | 119 | ByteReader reader = new ByteReader(channel); 120 | assertEquals("First line matches.", "", new String(reader.readLine())); 121 | assertEquals("Second line matches.", "abc,", new String(reader.readLine())); 122 | assertEquals("Third line matches.", "cbd", new String(reader.readLine())); 123 | assertEquals("Last line matches.", "cab, abc", new String(reader.readLine())); 124 | } 125 | } 126 | 127 | @Test 128 | public void readLine_returnExpectedContent5() throws IOException { 129 | try (SeekableByteChannel channel = 130 | new SeekableInMemoryByteChannel(MIXED_SEPARATED.getBytes())) { 131 | 132 | ByteReader reader = new ByteReader(channel); 133 | assertEquals("First line matches.", "", new String(reader.readLine())); 134 | assertEquals("Second line matches.", "abc,", new String(reader.readLine())); 135 | assertEquals("Third line matches.", "cbd", new String(reader.readLine())); 136 | assertEquals("Last line matches.", "cab, abc", new String(reader.readLine())); 137 | } 138 | } 139 | 140 | @Test 141 | public void readLine_returnExpectedContent6() throws IOException { 142 | try (SeekableByteChannel channel = 143 | new SeekableInMemoryByteChannel(ALL_EMPTY.getBytes())) { 144 | 145 | ByteReader reader = new ByteReader(channel); 146 | assertEquals("First line matches.", "", new String(reader.readLine())); 147 | assertEquals("Second line matches.", "", new String(reader.readLine())); 148 | assertEquals("Third line matches.", "", new String(reader.readLine())); 149 | assertEquals("Fourth line matches.", "", new String(reader.readLine())); 150 | assertEquals("Last line matches.", "", new String(reader.readLine())); 151 | } 152 | } 153 | 154 | @Test 155 | public void readLine_returnExpectedContent7() throws IOException { 156 | try (SeekableByteChannel channel = 157 | new SeekableInMemoryByteChannel(MIXED_SEPARATED.getBytes())) { 158 | 159 | ByteReader reader = new ByteReader(channel); 160 | channel.position(2); 161 | assertEquals("Line matches.", "bc,", new String(reader.readLine())); 162 | channel.position(5); 163 | assertEquals("Line matches.", "", new String(reader.readLine())); 164 | assertEquals("Line matches.", "cbd", new String(reader.readLine())); 165 | channel.position(5); 166 | assertEquals("Line matches.", "", new String(reader.readLine())); 167 | assertEquals("Line matches.", "cbd", new String(reader.readLine())); 168 | assertEquals("Last line matches.", "cab, abc", new String(reader.readLine())); 169 | } 170 | } 171 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/io/GcsOutputWriterTest.java: 
-------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.io; 16 | 17 | import static org.mockito.ArgumentMatchers.any; 18 | import static org.mockito.Mockito.times; 19 | import static org.mockito.Mockito.verify; 20 | import static org.mockito.Mockito.when; 21 | import static org.mockito.MockitoAnnotations.initMocks; 22 | 23 | import com.google.cloud.WriteChannel; 24 | import com.google.cloud.storage.Blob; 25 | import com.google.cloud.storage.BlobInfo; 26 | import com.google.cloud.storage.Storage; 27 | import com.google.cloud.storage.StorageOptions; 28 | import java.nio.channels.Channels; 29 | import org.junit.Before; 30 | import org.junit.Test; 31 | import org.junit.runner.RunWith; 32 | import org.mockito.Mock; 33 | import org.powermock.api.mockito.PowerMockito; 34 | import org.powermock.core.classloader.annotations.PrepareForTest; 35 | import org.powermock.modules.junit4.PowerMockRunner; 36 | 37 | @RunWith(PowerMockRunner.class) 38 | @PrepareForTest({StorageOptions.class, Channels.class}) 39 | public class GcsOutputWriterTest { 40 | 41 | @Mock 42 | private Storage storage; 43 | 44 | @Mock 45 | private StorageOptions storageOptions; 46 | 47 | @Mock 48 | private Blob blob; 49 | 50 | @Mock 51 | private WriteChannel writer; 52 | 53 | @Before 54 | public void setUp() { 55 | initMocks(this); 56 | PowerMockito.mockStatic(StorageOptions.class, Channels.class); 57 | when(StorageOptions.getDefaultInstance()).thenReturn(storageOptions); 58 | when(storageOptions.getService()).thenReturn(storage); 59 | when(blob.writer()).thenReturn(writer); 60 | when(storage.create(any(BlobInfo.class))).thenReturn(blob); 61 | } 62 | 63 | @Test 64 | public void getWriteChannel_correctPath() { 65 | new GcsOutputWriter(null, "bucket", "path").getWriteChannel(); 66 | verify(storage, times(1)) 67 | .create(BlobInfo.newBuilder("bucket", "path").build()); 68 | } 69 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/GcsReadChunksFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
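// A note on the expectations below: each Set<Long> holds split offsets for one file;
// after sorting, every pair of consecutive offsets bounds one chunk, e.g.
// {26, 0, 1, 14, 10} yields the chunks [0,1), [1,10), [10,14) and [14,26).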
14 | 15 | package com.google.cloud.healthcare.process.pipeline; 16 | 17 | import static org.mockito.ArgumentMatchers.any; 18 | import static org.mockito.ArgumentMatchers.eq; 19 | import static org.mockito.Mockito.when; 20 | import static org.mockito.MockitoAnnotations.initMocks; 21 | 22 | import avro.shaded.com.google.common.collect.Sets; 23 | import com.google.cloud.healthcare.config.GcpConfiguration; 24 | import com.google.cloud.healthcare.io.GcsInputReader; 25 | import com.google.cloud.healthcare.process.schema.GcpUtil; 26 | import java.io.ByteArrayInputStream; 27 | import java.nio.channels.Channels; 28 | import java.nio.channels.ReadableByteChannel; 29 | import java.util.Set; 30 | import org.apache.beam.sdk.coders.SerializableCoder; 31 | import org.apache.beam.sdk.testing.PAssert; 32 | import org.apache.beam.sdk.testing.TestPipeline; 33 | import org.apache.beam.sdk.transforms.Create; 34 | import org.apache.beam.sdk.transforms.ParDo; 35 | import org.apache.beam.sdk.values.KV; 36 | import org.apache.beam.sdk.values.PCollection; 37 | import org.apache.beam.sdk.values.TypeDescriptors; 38 | import org.junit.Before; 39 | import org.junit.Rule; 40 | import org.junit.Test; 41 | import org.junit.runner.RunWith; 42 | import org.mockito.Mock; 43 | import org.powermock.api.mockito.PowerMockito; 44 | import org.powermock.core.classloader.annotations.PrepareForTest; 45 | import org.powermock.modules.junit4.PowerMockRunner; 46 | 47 | /** Test for {@link GcsReadChunksFn}. */ 48 | @RunWith(PowerMockRunner.class) 49 | @PrepareForTest({GcpUtil.class}) 50 | public class GcsReadChunksFnTest { 51 | 52 | private static final String BUCKET1 = "bucket"; 53 | private static final String PATH1 = "file.csv"; 54 | private static final String FILENAME1 = String.format("gs://%s/%s", BUCKET1, PATH1); 55 | private static final String FILE_CONTENT1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; 56 | private ReadableByteChannel readChannel1 = 57 | Channels.newChannel(new ByteArrayInputStream(FILE_CONTENT1.getBytes())); 58 | 59 | private static final String BUCKET2 = "mybucket"; 60 | private static final String PATH2 = "path/to/file.csv"; 61 | private static final String FILENAME2 = String.format("gs://%s/%s", BUCKET2, PATH2); 62 | private static final String FILE_CONTENT2 = "ZYXWVUTSRQPONMLKJIHGFEDCBA"; 63 | private ReadableByteChannel readChannel2 = 64 | Channels.newChannel(new ByteArrayInputStream(FILE_CONTENT2.getBytes())); 65 | 66 | @Rule public final transient TestPipeline p = TestPipeline.create(); 67 | 68 | @Mock private GcsInputReader reader1; 69 | 70 | @Mock private GcsInputReader reader2; 71 | 72 | @Before 73 | public void setUp() { 74 | initMocks(this); 75 | PowerMockito.mockStatic(GcpUtil.class); 76 | when(GcpUtil.openGcsFile(any(), eq(FILENAME1))).thenReturn(readChannel1); 77 | when(GcpUtil.openGcsFile(any(), eq(FILENAME2))).thenReturn(readChannel2); 78 | } 79 | 80 | @Test 81 | public void generate_twoElements_oneByteArray() { 82 | PCollection>> input = 83 | p.apply(Create.of(KV.of(FILENAME1, Sets.newHashSet(0L, 10L)))); 84 | PCollection> output = 85 | input.apply(ParDo.of(new GcsReadChunksFn(GcpConfiguration.getInstance()))); 86 | PAssert.that(output).containsInAnyOrder(KV.of(FILENAME1, "ABCDEFGHIJ".getBytes())); 87 | p.run(); 88 | } 89 | 90 | @Test 91 | public void generate_multipleElements_expectedByteArrays() { 92 | PCollection>> input = 93 | p.apply( 94 | Create.of( 95 | KV.of(FILENAME1, Sets.newHashSet(0L, 20L)), 96 | KV.of(FILENAME2, Sets.newHashSet(26L, 0L, 1L, 14L, 10L)))); 97 | PCollection> output = 98 | 
input.apply(ParDo.of(new GcsReadChunksFn(GcpConfiguration.getInstance()))); 99 | PAssert.that(output) 100 | .containsInAnyOrder( 101 | KV.of(FILENAME1, "ABCDEFGHIJKLMNOPQRST".getBytes()), 102 | KV.of(FILENAME2, "Z".getBytes()), 103 | KV.of(FILENAME2, "YXWVUTSRQ".getBytes()), 104 | KV.of(FILENAME2, "PONM".getBytes()), 105 | KV.of(FILENAME2, "LKJIHGFEDCBA".getBytes())); 106 | p.run(); 107 | } 108 | 109 | @Test 110 | public void read_nothing_returnZeroResult() { 111 | PCollection>> input = 112 | p.apply( 113 | Create.empty( 114 | SerializableCoder.of( 115 | TypeDescriptors.kvs( 116 | TypeDescriptors.strings(), 117 | TypeDescriptors.sets(TypeDescriptors.longs()))))); 118 | PCollection> output = 119 | input.apply(ParDo.of(new GcsReadChunksFn(GcpConfiguration.getInstance()))); 120 | PAssert.that(output).empty(); 121 | p.run(); 122 | } 123 | 124 | @Test 125 | public void read_returnExpectedByteArrays() { 126 | PCollection>> input = 127 | p.apply( 128 | Create.of( 129 | KV.of(FILENAME1, Sets.newHashSet(0L, (long) FILE_CONTENT1.length())), 130 | KV.of(FILENAME2, Sets.newHashSet(0L, (long) FILE_CONTENT2.length())))); 131 | PCollection> output = 132 | input.apply(ParDo.of(new GcsReadChunksFn(GcpConfiguration.getInstance()))); 133 | PAssert.that(output) 134 | .containsInAnyOrder( 135 | KV.of(FILENAME1, FILE_CONTENT1.getBytes()), KV.of(FILENAME2, FILE_CONTENT2.getBytes())); 136 | p.run(); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/CsvDetectSchemaFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.cloud.healthcare.process.schema.FieldType; 19 | import com.google.common.collect.Lists; 20 | import java.io.IOException; 21 | import java.net.URISyntaxException; 22 | import java.net.URL; 23 | import java.nio.file.Files; 24 | import java.nio.file.Paths; 25 | import java.util.List; 26 | import org.apache.beam.sdk.testing.PAssert; 27 | import org.apache.beam.sdk.testing.TestPipeline; 28 | import org.apache.beam.sdk.transforms.Create; 29 | import org.apache.beam.sdk.transforms.ParDo; 30 | import org.apache.beam.sdk.values.KV; 31 | import org.apache.beam.sdk.values.PCollection; 32 | import org.junit.Rule; 33 | import org.junit.Test; 34 | 35 | /** Test for {@link CsvDetectSchemaFn}. 
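* <p>Runs two sample CSV resources through {@link CsvParseDataFn} and then
* {@link CsvDetectSchemaFn}, expecting one inferred schema per data row, e.g.
* (INT, STRING, STRING) for every row of test_input_parse.csv.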
*/ 36 | public class CsvDetectSchemaFnTest { 37 | 38 | private static final String FILENAME1 = "test_input_parse.csv"; 39 | private static final String FILENAME2 = "test_input_parse2.csv"; 40 | 41 | @Rule public final transient TestPipeline p = TestPipeline.create(); 42 | 43 | @Test 44 | public void detect_returnExpectedSchema() throws IOException, URISyntaxException { 45 | URL url1 = this.getClass().getClassLoader().getResource(FILENAME1); 46 | byte[] bytes1 = Files.readAllBytes(Paths.get(url1.toURI())); 47 | URL url2 = this.getClass().getClassLoader().getResource(FILENAME2); 48 | byte[] bytes2 = Files.readAllBytes(Paths.get(url2.toURI())); 49 | 50 | PCollection> input = 51 | p.apply(Create.of(KV.of(FILENAME1, bytes1), KV.of(FILENAME2, bytes2))); 52 | PCollection>> output = 53 | input 54 | .apply(ParDo.of(new CsvParseDataFn(CsvConfiguration.getInstance()))) 55 | .apply(ParDo.of(new CsvDetectSchemaFn())); 56 | PAssert.that(output) 57 | .containsInAnyOrder( 58 | KV.of(FILENAME1, Lists.newArrayList(FieldType.INT, FieldType.STRING, FieldType.STRING)), 59 | KV.of(FILENAME1, Lists.newArrayList(FieldType.INT, FieldType.STRING, FieldType.STRING)), 60 | KV.of(FILENAME1, Lists.newArrayList(FieldType.INT, FieldType.STRING, FieldType.STRING)), 61 | KV.of(FILENAME1, Lists.newArrayList(FieldType.INT, FieldType.STRING, FieldType.STRING)), 62 | KV.of( 63 | FILENAME2, 64 | Lists.newArrayList( 65 | FieldType.INT, 66 | FieldType.BOOLEAN, 67 | FieldType.DOUBLE, 68 | FieldType.STRING, 69 | FieldType.DATE)), 70 | KV.of( 71 | FILENAME2, 72 | Lists.newArrayList( 73 | FieldType.INT, 74 | FieldType.BOOLEAN, 75 | FieldType.DOUBLE, 76 | FieldType.STRING, 77 | FieldType.DATE))); 78 | p.run(); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/CsvExtractHeadersFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import java.net.URL; 18 | import org.apache.beam.sdk.Pipeline.PipelineExecutionException; 19 | import org.apache.beam.sdk.io.FileIO; 20 | import org.apache.beam.sdk.io.FileIO.ReadableFile; 21 | import org.apache.beam.sdk.testing.PAssert; 22 | import org.apache.beam.sdk.testing.TestPipeline; 23 | import org.apache.beam.sdk.transforms.Create; 24 | import org.apache.beam.sdk.transforms.ParDo; 25 | import org.apache.beam.sdk.values.KV; 26 | import org.apache.beam.sdk.values.PCollection; 27 | import org.junit.Rule; 28 | import org.junit.Test; 29 | 30 | /** Test for {@link CsvExtractHeadersFn}. 
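* <p>Verifies that the first line of each CSV resource is extracted as its headers,
* and that invalid headers fail the pipeline.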
*/ 31 | public class CsvExtractHeadersFnTest { 32 | 33 | @Rule 34 | public final transient TestPipeline p = TestPipeline.create(); 35 | 36 | @Test 37 | public void extract_validHeaders_returnHeaders() { 38 | URL url1 = this.getClass().getClassLoader().getResource("test_input_valid_headers1.csv"); 39 | URL url2 = this.getClass().getClassLoader().getResource("test_input_valid_headers2.csv"); 40 | 41 | PCollection urls = p.apply(Create.of(url1.toString(), url2.toString())); 42 | PCollection input = urls 43 | .apply(FileIO.matchAll()) 44 | .apply(FileIO.readMatches()); 45 | PCollection> output = input.apply(ParDo.of(new CsvExtractHeadersFn())); 46 | PAssert.that(output).containsInAnyOrder( 47 | KV.of(url1.getPath(), new String[] {"YEAR", "MAKE", "DESCRIPTION"}), 48 | KV.of(url2.getPath(), new String[] {"BIRTH_DATE", "GENDER", "HEIGHT"}) 49 | ); 50 | p.run(); 51 | } 52 | 53 | @Test(expected = PipelineExecutionException.class) 54 | public void extract_invalidHeaders_exception() { 55 | URL url = this.getClass().getClassLoader().getResource("test_input_invalid_headers.csv"); 56 | 57 | PCollection input = p 58 | .apply(FileIO.match().filepattern(url.toString())) 59 | .apply(FileIO.readMatches()); 60 | input.apply(ParDo.of(new CsvExtractHeadersFn())); 61 | p.run(); 62 | } 63 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/CsvMergeSchemaFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | 19 | import com.google.cloud.healthcare.process.schema.FieldType; 20 | import com.google.common.collect.Lists; 21 | import java.util.List; 22 | import org.apache.beam.sdk.testing.PAssert; 23 | import org.apache.beam.sdk.testing.TestPipeline; 24 | import org.apache.beam.sdk.transforms.Combine; 25 | import org.apache.beam.sdk.transforms.Create; 26 | import org.apache.beam.sdk.values.PCollection; 27 | import org.junit.Rule; 28 | import org.junit.Test; 29 | 30 | /** Test for {@link CsvMergeSchemaFn}. 
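* <p>Merges (STRING, DATETIME, INT, BOOLEAN) with (LONG, DATE, TIME, DOUBLE) and
* expects (STRING, DATETIME, STRING, STRING), per the widening rules in
* {@code FieldType.getCommonType}.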
*/ 31 | public class CsvMergeSchemaFnTest { 32 | 33 | @Rule public final transient TestPipeline p = TestPipeline.create(); 34 | 35 | @Test 36 | public void merge_returnMergedSchema() { 37 | PCollection<List<FieldType>> input = 38 | p.apply( 39 | Create.of( 40 | Lists.newArrayList( 41 | FieldType.STRING, FieldType.DATETIME, FieldType.INT, FieldType.BOOLEAN), 42 | Lists.newArrayList( 43 | FieldType.LONG, FieldType.DATE, FieldType.TIME, FieldType.DOUBLE))); 44 | 45 | PCollection<FieldType[]> output = input.apply(Combine.globally(new CsvMergeSchemaFn())); 46 | 47 | FieldType[] expected = 48 | new FieldType[] {FieldType.STRING, FieldType.DATETIME, FieldType.STRING, FieldType.STRING}; 49 | 50 | PAssert.thatSingleton(output) 51 | .satisfies( 52 | in -> { 53 | assertEquals("Size matches headers.", expected.length, in.length); 54 | for (int i = 0; i < expected.length; i++) { 55 | assertEquals("Type matches.", expected[i], in[i]); 56 | } 57 | return null; 58 | }); 59 | p.run(); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/CsvParseDataFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import java.io.IOException; 19 | import java.net.URISyntaxException; 20 | import java.net.URL; 21 | import java.nio.file.Files; 22 | import java.nio.file.Paths; 23 | import org.apache.beam.sdk.testing.PAssert; 24 | import org.apache.beam.sdk.testing.TestPipeline; 25 | import org.apache.beam.sdk.transforms.Create; 26 | import org.apache.beam.sdk.transforms.DoFn; 27 | import org.apache.beam.sdk.transforms.ParDo; 28 | import org.apache.beam.sdk.values.KV; 29 | import org.apache.beam.sdk.values.PCollection; 30 | import org.junit.Rule; 31 | import org.junit.Test; 32 | 33 | /** Test for {@link CsvParseDataFn}.
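 * The standard parser unescapes quoted fields: "Venture ""Extended Edition""" in the fixture is
 * expected back as Venture "Extended Edition". Compare CsvParseDataAdvanceFnTest, where the
 * regex-based parser keeps the original quoting intact.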
*/ 34 | public class CsvParseDataFnTest { 35 | private static final String FILENAME = "test_input_parse.csv"; 36 | 37 | @Rule 38 | public final transient TestPipeline p = TestPipeline.create(); 39 | 40 | @Test 41 | public void parse_returnExpectedParsedResult() throws IOException, URISyntaxException { 42 | URL url = this.getClass().getClassLoader().getResource(FILENAME); 43 | byte[] bytes = Files.readAllBytes(Paths.get(url.toURI())); 44 | 45 | PCollection<KV<String, byte[]>> input = p.apply(Create.of(KV.of(FILENAME, bytes))); 46 | PCollection<KV<String, String>> output = input 47 | .apply(ParDo.of(new CsvParseDataFn(CsvConfiguration.getInstance()))) 48 | .apply(ParDo.of(new FlattenDoFn())); 49 | PAssert.that(output).containsInAnyOrder( 50 | KV.of(FILENAME, "1999"), KV.of(FILENAME, "Chevy"), 51 | KV.of(FILENAME, "Venture \"Extended Edition\""), 52 | KV.of(FILENAME, "1999"), KV.of(FILENAME, "Chevy"), 53 | KV.of(FILENAME, "Venture \"Extended Edition, Very Large\""), 54 | KV.of(FILENAME, "1997"), KV.of(FILENAME, "Ford"), 55 | KV.of(FILENAME, "Super, luxurious truck"), 56 | KV.of(FILENAME, "1996"), KV.of(FILENAME, "Jeep"), 57 | KV.of(FILENAME, "MUST SELL!\nair, moon roof, loaded") 58 | ); 59 | p.run(); 60 | } 61 | 62 | /** Flatten the result for easier comparison. */ 63 | public static class FlattenDoFn extends DoFn<KV<String, String[]>, KV<String, String>> { 64 | @ProcessElement 65 | public void process(ProcessContext ctx) { 66 | KV<String, String[]> input = ctx.element(); 67 | for (String column : input.getValue()) { 68 | ctx.output(KV.of(input.getKey(), column)); 69 | } 70 | } 71 | } 72 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/FillTableRowFnTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Google LLC. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.healthcare.process.pipeline.csv; 18 | 19 | import com.google.api.services.bigquery.model.TableRow; 20 | import com.google.cloud.healthcare.process.pipeline.FillTableRowFn; 21 | import com.google.cloud.healthcare.process.schema.FieldType; 22 | import com.google.common.collect.Lists; 23 | import java.util.Map; 24 | import org.apache.beam.sdk.testing.PAssert; 25 | import org.apache.beam.sdk.testing.TestPipeline; 26 | import org.apache.beam.sdk.transforms.Create; 27 | import org.apache.beam.sdk.transforms.ParDo; 28 | import org.apache.beam.sdk.transforms.View; 29 | import org.apache.beam.sdk.values.KV; 30 | import org.apache.beam.sdk.values.PCollection; 31 | import org.apache.beam.sdk.values.PCollectionView; 32 | import org.junit.Rule; 33 | import org.junit.Test; 34 | 35 | /** Test for {@link FillTableRowFn}.
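 * Schemas and headers are supplied as map side inputs keyed by file name; each String[] row is
 * expected back as a TableRow whose values are coerced to the schema types, e.g. "1" to the
 * integer 1 and "true" to a boolean.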
*/ 36 | public class FillTableRowFnTest { 37 | 38 | private static final String FILENAME = "file.csv"; 39 | 40 | private static final String[] row1 = new String[] {"1", "a", "true", "0.5"}; 41 | private static final String[] row2 = new String[] {"0", "e", "true", "2.71"}; 42 | private static final String[] row3 = new String[] {"20", "ab", "false", "0.1"}; 43 | private static final String[] row4 = new String[] {"100", "pi", "false", "3.14"}; 44 | 45 | private static final FieldType[] schema = 46 | new FieldType[] {FieldType.INT, FieldType.STRING, FieldType.BOOLEAN, FieldType.DOUBLE}; 47 | private static final String[] header = new String[] {"count", "name", "valid", "value"}; 48 | 49 | @Rule public final transient TestPipeline p = TestPipeline.create(); 50 | 51 | @Test 52 | public void fill_returnFilledTableRow() { 53 | PCollection<KV<String, String[]>> input = 54 | p.apply( 55 | Create.of( 56 | KV.of(FILENAME, row1), 57 | KV.of(FILENAME, row2), 58 | KV.of(FILENAME, row3), 59 | KV.of(FILENAME, row4))); 60 | 61 | PCollectionView<Map<String, FieldType[]>> schemasView = 62 | p.apply("create_schemas", Create.of(KV.of(FILENAME, schema))) 63 | .apply("as_map_schemas", View.asMap()); 64 | 65 | PCollectionView<Map<String, String[]>> headersView = 66 | p.apply("create_headers", Create.of(KV.of(FILENAME, header))) 67 | .apply("as_map_headers", View.asMap()); 68 | 69 | PCollection<KV<String, TableRow>> output = 70 | input.apply( 71 | ParDo.of(new FillTableRowFn(schemasView, headersView)) 72 | .withSideInputs(schemasView, headersView)); 73 | 74 | PAssert.that(output) 75 | .containsInAnyOrder( 76 | Lists.newArrayList( 77 | KV.of( 78 | FILENAME, 79 | new TableRow() 80 | .set("count", 1) 81 | .set("name", "a") 82 | .set("valid", true) 83 | .set("value", 0.5)), 84 | KV.of( 85 | FILENAME, 86 | new TableRow() 87 | .set("count", 0) 88 | .set("name", "e") 89 | .set("valid", true) 90 | .set("value", 2.71)), 91 | KV.of( 92 | FILENAME, 93 | new TableRow() 94 | .set("count", 20) 95 | .set("name", "ab") 96 | .set("valid", false) 97 | .set("value", 0.1)), 98 | KV.of( 99 | FILENAME, 100 | new TableRow() 101 | .set("count", 100) 102 | .set("name", "pi") 103 | .set("valid", false) 104 | .set("value", 3.14)))); 105 | 106 | p.run(); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/GcsSplitCsvFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
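// Note: the expected split points in this test are byte offsets into the fixture files (22 is
// the length of the "YEAR,MAKE,DESCRIPTION" header line plus its newline); the TestGcsSplitCsvFn
// subclass below shrinks CHUNK_SIZE to 512 so the small fixtures actually split into chunks.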
14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.common.collect.ImmutableMap; 19 | import com.google.common.collect.Sets; 20 | import java.net.URL; 21 | import java.util.Arrays; 22 | import java.util.Collection; 23 | import java.util.Map; 24 | import java.util.Set; 25 | import org.apache.beam.sdk.io.FileIO; 26 | import org.apache.beam.sdk.io.FileIO.ReadableFile; 27 | import org.apache.beam.sdk.testing.PAssert; 28 | import org.apache.beam.sdk.testing.TestPipeline; 29 | import org.apache.beam.sdk.transforms.Create; 30 | import org.apache.beam.sdk.transforms.ParDo; 31 | import org.apache.beam.sdk.transforms.View; 32 | import org.apache.beam.sdk.values.KV; 33 | import org.apache.beam.sdk.values.PCollection; 34 | import org.apache.beam.sdk.values.PCollectionView; 35 | import org.junit.Before; 36 | import org.junit.Rule; 37 | import org.junit.Test; 38 | import org.junit.runner.RunWith; 39 | import org.junit.runners.Parameterized; 40 | import org.junit.runners.Parameterized.Parameter; 41 | import org.junit.runners.Parameterized.Parameters; 42 | 43 | /** Test for {@link GcsSplitCsvFn}. */ 44 | @RunWith(Parameterized.class) 45 | public class GcsSplitCsvFnTest { 46 | 47 | @Rule 48 | public final transient TestPipeline p = TestPipeline.create(); 49 | 50 | @Parameter 51 | public String testInputFilename; 52 | 53 | @Parameter(1) 54 | public Set<Long> expectedSplitPoints; 55 | 56 | @Before 57 | public void setUp() { 58 | CsvConfiguration.getInstance().withQuote('\"').withDelimiter(',').withRecordSeparator("\n"); 59 | } 60 | 61 | @Parameters 62 | public static Collection<Object[]> data() { 63 | return Arrays.asList(new Object[][] { 64 | {"test_input_no_split.csv", Sets.newHashSet(22L, 218L)}, 65 | {"test_input_no_split2.csv", Sets.newHashSet(22L, 871L)}, 66 | {"test_input_with_quotes.csv", Sets.newHashSet(22L, 517L, 1006L)}, 67 | {"test_input_without_quotes.csv", Sets.newHashSet(22L, 551L, 882L)}, 68 | {"test_input_mixed_quotes.csv", Sets.newHashSet(22L, 537L, 1045L, 1595L, 2092L, 2332L)}, 69 | {"test_input_all_lines_have_new_lines.csv", Sets.newHashSet(22L, 571L, 1059L, 1181L)} 70 | }); 71 | } 72 | 73 | @Test 74 | public void split() { 75 | URL url = this.getClass().getClassLoader().getResource(testInputFilename); 76 | 77 | Map<String, String[]> headers = ImmutableMap.<String, String[]>builder() 78 | .put(url.getPath(), new String[] {"Year", "Make", "Description"}).build(); 79 | PCollectionView<Map<String, String[]>> headersView = p.apply(Create.of(headers)) 80 | .apply(View.asMap()); 81 | 82 | PCollection<ReadableFile> input = p 83 | .apply(FileIO.match().filepattern(url.toString())) 84 | .apply(FileIO.readMatches()); 85 | PCollection<KV<String, Set<Long>>> output = input 86 | .apply(ParDo.of(new TestGcsSplitCsvFn(headersView)).withSideInputs(headersView)); 87 | PAssert.thatSingleton(output).isEqualTo(KV.of(url.getPath(), expectedSplitPoints)); 88 | p.run(); 89 | } 90 | 91 | static class TestGcsSplitCsvFn extends GcsSplitCsvFn { 92 | TestGcsSplitCsvFn(PCollectionView<Map<String, String[]>> headersView) { 93 | super(CsvConfiguration.getInstance(), headersView); 94 | CHUNK_SIZE = 512; 95 | } 96 | } 97 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/advance/CsvParseDataAdvanceFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in
compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv.advance; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import java.io.IOException; 19 | import java.net.URISyntaxException; 20 | import java.net.URL; 21 | import java.nio.file.Files; 22 | import java.nio.file.Paths; 23 | import org.apache.beam.sdk.testing.PAssert; 24 | import org.apache.beam.sdk.testing.TestPipeline; 25 | import org.apache.beam.sdk.transforms.Create; 26 | import org.apache.beam.sdk.transforms.DoFn; 27 | import org.apache.beam.sdk.transforms.ParDo; 28 | import org.apache.beam.sdk.values.KV; 29 | import org.apache.beam.sdk.values.PCollection; 30 | import org.junit.Rule; 31 | import org.junit.Test; 32 | 33 | /** Test for {@link CsvParseDataAdvanceFn}. */ 34 | public class CsvParseDataAdvanceFnTest { 35 | private static final String FILENAME = "test_input_parse.csv"; 36 | 37 | @Rule 38 | public final transient TestPipeline p = TestPipeline.create(); 39 | 40 | @Test 41 | public void parse_returnExpectedParsedResult() throws IOException, URISyntaxException { 42 | URL url = this.getClass().getClassLoader().getResource(FILENAME); 43 | byte[] bytes = Files.readAllBytes(Paths.get(url.toURI())); 44 | 45 | PCollection<KV<String, byte[]>> input = p.apply(Create.of(KV.of(FILENAME, bytes))); 46 | PCollection<KV<String, String>> output = input 47 | .apply(ParDo.of(new CsvParseDataAdvanceFn( 48 | CsvConfiguration.getInstance() 49 | .withRecordSeparatorRegex("\n(?=\\d{4})").withDelimiterRegex(",(?=\\S+)")))) 50 | .apply(ParDo.of(new FlattenDoFn())); 51 | PAssert.that(output).containsInAnyOrder( 52 | KV.of(FILENAME, "1999"), KV.of(FILENAME, "Chevy"), 53 | KV.of(FILENAME, "\"Venture \"\"Extended Edition\"\"\""), 54 | KV.of(FILENAME, "1999"), KV.of(FILENAME, "Chevy"), 55 | KV.of(FILENAME, "\"Venture \"\"Extended Edition, Very Large\"\"\""), 56 | KV.of(FILENAME, "1997"), KV.of(FILENAME, "Ford"), 57 | KV.of(FILENAME, "\"Super, luxurious truck\""), 58 | KV.of(FILENAME, "1996"), KV.of(FILENAME, "Jeep"), 59 | KV.of(FILENAME, "\"MUST SELL!\nair, moon roof, loaded\"") 60 | ); 61 | p.run(); 62 | } 63 | 64 | /** Flatten the result for easier comparison. */ 65 | public static class FlattenDoFn extends DoFn<KV<String, String[]>, KV<String, String>> { 66 | @ProcessElement 67 | public void process(ProcessContext ctx) { 68 | KV<String, String[]> input = ctx.element(); 69 | for (String column : input.getValue()) { 70 | ctx.output(KV.of(input.getKey(), column)); 71 | } 72 | } 73 | } 74 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/pipeline/csv/advance/GcsSplitCsvAdvanceFnTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.pipeline.csv.advance; 16 | 17 | import com.google.cloud.healthcare.config.CsvConfiguration; 18 | import com.google.common.collect.ImmutableMap; 19 | import com.google.common.collect.Sets; 20 | import java.net.URL; 21 | import java.util.Arrays; 22 | import java.util.Collection; 23 | import java.util.Map; 24 | import java.util.Set; 25 | import org.apache.beam.sdk.io.FileIO; 26 | import org.apache.beam.sdk.io.FileIO.ReadableFile; 27 | import org.apache.beam.sdk.testing.PAssert; 28 | import org.apache.beam.sdk.testing.TestPipeline; 29 | import org.apache.beam.sdk.transforms.Create; 30 | import org.apache.beam.sdk.transforms.ParDo; 31 | import org.apache.beam.sdk.transforms.View; 32 | import org.apache.beam.sdk.values.KV; 33 | import org.apache.beam.sdk.values.PCollection; 34 | import org.apache.beam.sdk.values.PCollectionView; 35 | import org.junit.Rule; 36 | import org.junit.Test; 37 | import org.junit.runner.RunWith; 38 | import org.junit.runners.Parameterized; 39 | import org.junit.runners.Parameterized.Parameter; 40 | import org.junit.runners.Parameterized.Parameters; 41 | 42 | /** Test for {@link GcsSplitCsvAdvanceFn}. */ 43 | @RunWith(Parameterized.class) 44 | public class GcsSplitCsvAdvanceFnTest { 45 | 46 | @Rule 47 | public final transient TestPipeline p = TestPipeline.create(); 48 | 49 | @Parameter 50 | public String testInputFilename; 51 | 52 | @Parameter(1) 53 | public Set<Long> expectedSplitPoints; 54 | 55 | @Parameters 56 | public static Collection<Object[]> data() { 57 | return Arrays.asList(new Object[][] { 58 | {"test_input_no_split.csv", Sets.newHashSet(22L, 218L)}, 59 | {"test_input_no_split2.csv", Sets.newHashSet(22L, 871L)}, 60 | {"test_input_with_quotes.csv", Sets.newHashSet(22L, 517L, 1006L)}, 61 | {"test_input_without_quotes.csv", Sets.newHashSet(22L, 514L, 882L)}, 62 | {"test_input_mixed_quotes.csv", Sets.newHashSet(22L, 537L, 1045L, 1558L, 2055L, 2332L)}, 63 | {"test_input_all_lines_have_new_lines.csv", Sets.newHashSet(22L, 571L, 1059L, 1181L)}, 64 | {"test_input_advance_split.csv", Sets.newHashSet(31L, 743L, 1091L, 1459L)} 65 | }); 66 | } 67 | 68 | @Test 69 | public void split() { 70 | URL url = this.getClass().getClassLoader().getResource(testInputFilename); 71 | 72 | Map<String, String[]> headers = ImmutableMap.<String, String[]>builder() 73 | .put(url.getPath(), new String[] {"Year", "Make", "Description"}).build(); 74 | PCollectionView<Map<String, String[]>> headersView = p.apply(Create.of(headers)) 75 | .apply(View.asMap()); 76 | 77 | PCollection<ReadableFile> input = p 78 | .apply(FileIO.match().filepattern(url.toString())) 79 | .apply(FileIO.readMatches()); 80 | PCollection<KV<String, Set<Long>>> output = input 81 | .apply(ParDo.of(new TestGcsSplitCsvAdvanceFn( 82 | CsvConfiguration.getInstance().withRecordSeparatorRegex("\n(?=\\d{4},)"), headersView)) 83 | .withSideInputs(headersView)); 84 | PAssert.thatSingleton(output).isEqualTo(KV.of(url.getPath(), expectedSplitPoints)); 85 | p.run(); 86 | } 87 | 88 | static class TestGcsSplitCsvAdvanceFn extends GcsSplitCsvAdvanceFn { 89 | TestGcsSplitCsvAdvanceFn(CsvConfiguration config, 90 | PCollectionView<Map<String, String[]>> headersView) { 91
| super(config, headersView); 92 | CHUNK_SIZE = 512; 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/schema/FieldTypeTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.schema; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | import static org.junit.Assert.assertNull; 19 | 20 | import org.junit.Test; 21 | 22 | /** Test for {@link FieldType}. */ 23 | public class FieldTypeTest { 24 | 25 | @Test 26 | public void getCommonType_bothNull_returnNull() { 27 | assertNull("Should return null if both are null.", FieldType.getCommonType(null, null)); 28 | } 29 | 30 | @Test 31 | public void getCommonType_same_returnOriginal() { 32 | assertEquals("Should return the original type if both are the same.", 33 | FieldType.INT, FieldType.getCommonType(FieldType.INT, FieldType.INT)); 34 | assertEquals("Should return the original type if both are the same.", 35 | FieldType.LONG, FieldType.getCommonType(FieldType.LONG, FieldType.LONG)); 36 | assertEquals("Should return the original type if both are the same.", 37 | FieldType.DOUBLE, FieldType.getCommonType(FieldType.DOUBLE, FieldType.DOUBLE)); 38 | assertEquals("Should return the original type if both are the same.", 39 | FieldType.DATE, FieldType.getCommonType(FieldType.DATE, FieldType.DATE)); 40 | assertEquals("Should return the original type if both are the same.", 41 | FieldType.TIME, FieldType.getCommonType(FieldType.TIME, FieldType.TIME)); 42 | assertEquals("Should return the original type if both are the same.", 43 | FieldType.DATETIME, FieldType.getCommonType(FieldType.DATETIME, FieldType.DATETIME)); 44 | assertEquals("Should return the original type if both are the same.", 45 | FieldType.BOOLEAN, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.BOOLEAN)); 46 | } 47 | 48 | @Test 49 | public void getCommonType_unknownAndOther_returnOther() { 50 | assertEquals("Should return int if inputs are unknown and int.", 51 | FieldType.INT, FieldType.getCommonType(FieldType.UNKNOWN, FieldType.INT)); 52 | assertEquals("Should return long if inputs are unknown and long.", 53 | FieldType.LONG, FieldType.getCommonType(FieldType.UNKNOWN, FieldType.LONG)); 54 | assertEquals("Should return double if inputs are unknown and double.", 55 | FieldType.DOUBLE, FieldType.getCommonType(FieldType.UNKNOWN, FieldType.DOUBLE)); 56 | assertEquals("Should return date if inputs are unknown and date.", 57 | FieldType.DATE, FieldType.getCommonType(FieldType.UNKNOWN, FieldType.DATE)); 58 | assertEquals("Should return time if inputs are unknown and time.", 59 | FieldType.TIME, FieldType.getCommonType(FieldType.UNKNOWN, FieldType.TIME)); 60 | assertEquals("Should return datetime if inputs are unknown and datetime.", 61 | FieldType.DATETIME, FieldType.getCommonType(FieldType.UNKNOWN,
FieldType.DATETIME)); 62 | assertEquals("Should return boolean if inputs are unknown and boolean.", 63 | FieldType.BOOLEAN, FieldType.getCommonType(FieldType.UNKNOWN, FieldType.BOOLEAN)); 64 | } 65 | 66 | @Test 67 | public void getCommonType_stringAndOther_returnString() { 68 | assertEquals("Should return string if one of the inputs is string.", 69 | FieldType.STRING, FieldType.getCommonType(FieldType.INT, FieldType.STRING)); 70 | assertEquals("Should return string if one of the inputs is string.", 71 | FieldType.STRING, FieldType.getCommonType(FieldType.LONG, FieldType.STRING)); 72 | assertEquals("Should return string if one of the inputs is string.", 73 | FieldType.STRING, FieldType.getCommonType(FieldType.DOUBLE, FieldType.STRING)); 74 | assertEquals("Should return string if one of the inputs is string.", 75 | FieldType.STRING, FieldType.getCommonType(FieldType.DATE, FieldType.STRING)); 76 | assertEquals("Should return string if one of the inputs is string.", 77 | FieldType.STRING, FieldType.getCommonType(FieldType.TIME, FieldType.STRING)); 78 | assertEquals("Should return string if one of the inputs is string.", 79 | FieldType.STRING, FieldType.getCommonType(FieldType.DATETIME, FieldType.STRING)); 80 | assertEquals("Should return string if one of the inputs is string.", 81 | FieldType.STRING, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.STRING)); 82 | } 83 | 84 | @Test 85 | public void getCommonType_dateOrTimeAndDateTime_returnDateTime() { 86 | assertEquals("Date and datetime should return datetime.", 87 | FieldType.DATETIME, FieldType.getCommonType(FieldType.DATETIME, FieldType.DATE)); 88 | assertEquals("Time and datetime should return datetime.", 89 | FieldType.DATETIME, FieldType.getCommonType(FieldType.DATETIME, FieldType.TIME)); 90 | } 91 | 92 | @Test 93 | public void getCommonType_dateAndTime_returnString() { 94 | assertEquals("Date and time should return string.", 95 | FieldType.STRING, FieldType.getCommonType(FieldType.TIME, FieldType.DATE)); 96 | } 97 | 98 | @Test 99 | public void getCommonType_intAndLong_returnLong() { 100 | assertEquals("int and long should return long.", 101 | FieldType.LONG, FieldType.getCommonType(FieldType.LONG, FieldType.INT)); 102 | } 103 | 104 | @Test 105 | public void getCommonType_intAndDouble_returnDouble() { 106 | assertEquals("int and double should return double.", 107 | FieldType.DOUBLE, FieldType.getCommonType(FieldType.DOUBLE, FieldType.INT)); 108 | } 109 | 110 | @Test 111 | public void getCommonType_longAndDouble_returnDouble() { 112 | assertEquals("long and double should return double.", 113 | FieldType.DOUBLE, FieldType.getCommonType(FieldType.LONG, FieldType.DOUBLE)); 114 | } 115 | 116 | @Test 117 | public void getCommonType_booleanAndOther_returnString() { 118 | assertEquals("boolean and other should return string.", 119 | FieldType.STRING, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.INT)); 120 | assertEquals("boolean and other should return string.", 121 | FieldType.STRING, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.LONG)); 122 | assertEquals("boolean and other should return string.", 123 | FieldType.STRING, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.DOUBLE)); 124 | assertEquals("boolean and other should return string.", 125 | FieldType.STRING, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.DATE)); 126 | assertEquals("boolean and other should return string.", 127 | FieldType.STRING, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.TIME)); 128 | assertEquals("boolean and other should return string.", 129 |
FieldType.STRING, FieldType.getCommonType(FieldType.BOOLEAN, FieldType.DATETIME)); 130 | } 131 | 132 | @Test 133 | public void getCommonType_numericAndDate_returnString() { 134 | assertEquals("int and date should return string.", 135 | FieldType.STRING, FieldType.getCommonType(FieldType.INT, FieldType.DATE)); 136 | assertEquals("long and time should return string.", 137 | FieldType.STRING, FieldType.getCommonType(FieldType.LONG, FieldType.TIME)); 138 | assertEquals("double and datetime should return string.", 139 | FieldType.STRING, FieldType.getCommonType(FieldType.DOUBLE, FieldType.DATETIME)); 140 | } 141 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/process/schema/SchemaUtilTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.process.schema; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | 19 | import com.google.api.services.bigquery.model.TableSchema; 20 | import java.io.IOException; 21 | import java.util.Arrays; 22 | import java.util.List; 23 | import org.apache.beam.sdk.util.Transport; 24 | import org.junit.Test; 25 | 26 | /** Test for {@link SchemaUtil}.
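 * Covers type inference from sample values (an empty string infers as UNKNOWN), BigQuery table
 * schema generation (UNKNOWN columns such as name9 are emitted as STRING), and element-wise
 * merging of two inferred schemas.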
*/ 27 | public class SchemaUtilTest { 28 | 29 | private static final String TEST_SCHEMA = 30 | "{\n" 31 | + " \"fields\" : [ {\n" 32 | + " \"mode\" : \"NULLABLE\",\n" 33 | + " \"name\" : \"name1\",\n" 34 | + " \"type\" : \"INT64\"\n" 35 | + " }, {\n" 36 | + " \"mode\" : \"NULLABLE\",\n" 37 | + " \"name\" : \"name2\",\n" 38 | + " \"type\" : \"INT64\"\n" 39 | + " }, {\n" 40 | + " \"mode\" : \"NULLABLE\",\n" 41 | + " \"name\" : \"name3\",\n" 42 | + " \"type\" : \"FLOAT64\"\n" 43 | + " }, {\n" 44 | + " \"mode\" : \"NULLABLE\",\n" 45 | + " \"name\" : \"name4\",\n" 46 | + " \"type\" : \"BOOL\"\n" 47 | + " }, {\n" 48 | + " \"mode\" : \"NULLABLE\",\n" 49 | + " \"name\" : \"name5\",\n" 50 | + " \"type\" : \"DATE\"\n" 51 | + " }, {\n" 52 | + " \"mode\" : \"NULLABLE\",\n" 53 | + " \"name\" : \"name6\",\n" 54 | + " \"type\" : \"TIME\"\n" 55 | + " }, {\n" 56 | + " \"mode\" : \"NULLABLE\",\n" 57 | + " \"name\" : \"name7\",\n" 58 | + " \"type\" : \"DATETIME\"\n" 59 | + " }, {\n" 60 | + " \"mode\" : \"NULLABLE\",\n" 61 | + " \"name\" : \"name8\",\n" 62 | + " \"type\" : \"STRING\"\n" 63 | + " }, {\n" 64 | + " \"mode\" : \"NULLABLE\",\n" 65 | + " \"name\" : \"name9\",\n" 66 | + " \"type\" : \"STRING\"\n" 67 | + " } ]\n" 68 | + "}"; 69 | 70 | @Test 71 | public void infer_returnTypes() { 72 | List<FieldType> types = SchemaUtil.infer(Arrays.asList( 73 | "123", "12345678901234", "2.31", 74 | "true", "Y", "No", "T", 75 | "2018-11-18", "11:25:33", "2018-11-18T11:25:33", 76 | "abc", "")); 77 | assertEquals("Should return expected types.", Arrays.asList( 78 | FieldType.INT, FieldType.LONG, FieldType.DOUBLE, 79 | FieldType.BOOLEAN, FieldType.BOOLEAN, FieldType.BOOLEAN, FieldType.BOOLEAN, 80 | FieldType.DATE, FieldType.TIME, FieldType.DATETIME, 81 | FieldType.STRING, FieldType.UNKNOWN), types); 82 | } 83 | 84 | @Test 85 | public void generate_expectedSchema() throws IOException { 86 | TableSchema schema = SchemaUtil.generateBigQueryTableSchema(new String[] { 87 | "name1", "name2", "name3", "name4", "name5", "name6", "name7", "name8", "name9"}, 88 | new FieldType[] { 89 | FieldType.INT, FieldType.LONG, FieldType.DOUBLE, 90 | FieldType.BOOLEAN, 91 | FieldType.DATE, FieldType.TIME, FieldType.DATETIME, 92 | FieldType.STRING, FieldType.UNKNOWN 93 | }); 94 | assertEquals("Generates expected schema.", TEST_SCHEMA, 95 | Transport.getJsonFactory().toPrettyString(schema)); 96 | } 97 | 98 | @Test 99 | public void merge_resultExpectedTypes() { 100 | List<FieldType> s = Arrays.asList(FieldType.INT, FieldType.DATE, 101 | FieldType.STRING, FieldType.BOOLEAN, FieldType.LONG, FieldType.DATE); 102 | List<FieldType> t = Arrays.asList(FieldType.LONG, FieldType.DATETIME, 103 | FieldType.DOUBLE, FieldType.TIME, FieldType.DOUBLE, FieldType.UNKNOWN); 104 | 105 | assertEquals("Should return expected types.", 106 | Arrays.asList(FieldType.LONG, FieldType.DATETIME, FieldType.STRING, FieldType.STRING, 107 | FieldType.DOUBLE, FieldType.DATE), 108 | SchemaUtil.merge(Arrays.asList(s, t))); 109 | } 110 | } -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/healthcare/util/StringUtilTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.healthcare.util; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | 19 | import org.junit.Test; 20 | 21 | /** Test for {@link StringUtil}. */ 22 | public class StringUtilTest { 23 | 24 | @Test(expected = IllegalArgumentException.class) 25 | public void splitGcsUri_emptyUri_exception() { 26 | StringUtil.splitGcsUri(""); 27 | } 28 | 29 | @Test(expected = IllegalArgumentException.class) 30 | public void splitGcsUri_invalidUri_exception() { 31 | StringUtil.splitGcsUri("http://example.com/my.txt"); 32 | } 33 | 34 | @Test(expected = IllegalArgumentException.class) 35 | public void splitGcsUri_invalidUri_exception2() { 36 | StringUtil.splitGcsUri("gs://bucket"); // No path. 37 | } 38 | 39 | @Test 40 | public void splitGcsUri_expectedBucketAndPath() { 41 | String[] parts = StringUtil.splitGcsUri("gs://bucket/path/to/file"); 42 | assertEquals("Bucket should match.", "bucket", parts[0]); 43 | assertEquals("Path should match.", "path/to/file", parts[1]); 44 | } 45 | 46 | @Test(expected = IllegalArgumentException.class) 47 | public void generateGcsUri_emptyBucket_exception() { 48 | StringUtil.generateGcsUri("", "test"); 49 | } 50 | 51 | @Test(expected = IllegalArgumentException.class) 52 | public void generateGcsUri_emptyPath_exception() { 53 | StringUtil.generateGcsUri("bucket", ""); 54 | } 55 | 56 | @Test 57 | public void generateGcsUri_validGcsUri() { 58 | assertEquals("Should return expected GCS URI.", 59 | "gs://bucket/path/to/file", 60 | StringUtil.generateGcsUri("bucket", "path/to/file")); 61 | } 62 | 63 | @Test 64 | public void getGcsDecompressUri_expected() { 65 | assertEquals("Return expected uri.", 66 | "gs://bucket/temp/decompress", 67 | StringUtil.getGcsDecompressUri("bucket")); 68 | } 69 | 70 | @Test(expected = IllegalArgumentException.class) 71 | public void getGcsDecompressUri_exception() { 72 | StringUtil.getGcsDecompressUri(""); 73 | } 74 | } -------------------------------------------------------------------------------- /src/test/resources/test_input_advance_split.csv: -------------------------------------------------------------------------------- 1 | YEAR,MAKE,DESCRIPTION,DOCUMENT 2 | 1999,Chevy,"Venture ""Extended Edition"""," 4 | CAR SALE!!!
5 |
6 |
" 7 | 1999,Chevy,"Venture ""Extended Edition, Very Large"""," 9 | CAR SALE!!!
10 |
11 |
" 12 | 1997,Ford,E350,"Super, luxurious truck"," 14 | CAR SALE!!!
15 |
16 |
" 17 | 1996,Jeep,Grand Cherokee,"MUST SELL! 18 | air, moon roof, loaded"," 20 | CAR SALE!!!
21 |
22 |
" -------------------------------------------------------------------------------- /src/test/resources/test_input_all_lines_have_new_lines.csv: -------------------------------------------------------------------------------- 1 | YEAR,MAKE,DESCRIPTION 2 | 1996,Jeep,Grand Cherokee,"MUST SELL! 3 | air, moon roof, 4 | loaded" 5 | 1996,Jeep,Grand Cherokee,"MUST SELL! 6 | air, moon roof, 7 | loaded" 8 | 1996,Jeep,Grand Cherokee,"MUST SELL! 9 | air, moon roof, 10 | loaded" 11 | 1996,Jeep,Grand Cherokee,"MUST SELL! 12 | air, moon roof, 13 | loaded" 14 | 1996,Jeep,Grand Cherokee,"MUST SELL! 15 | air, moon roof, 16 | loaded" 17 | 1996,Jeep,Grand Cherokee,"MUST SELL! 18 | air, moon roof, 19 | loaded" 20 | 1996,Jeep,Grand Cherokee,"MUST SELL! 21 | air, moon roof, 22 | loaded" 23 | 1996,Jeep,Grand Cherokee,"MUST SELL! 24 | air, moon roof, 25 | loaded" 26 | 1996,Jeep,Grand Cherokee,"MUST SELL! 27 | air, moon roof, 28 | loaded" 29 | 1996,Jeep,Grand Cherokee,"MUST SELL! 30 | air, moon roof, 31 | loaded" 32 | 1996,Jeep,Grand Cherokee,"MUST SELL! 33 | air, moon roof, 34 | loaded" 35 | 1996,Jeep,Grand Cherokee,"MUST SELL! 36 | air, moon roof, 37 | loaded" 38 | 1996,Jeep,Grand Cherokee,"MUST SELL! 39 | air, moon roof, 40 | loaded" 41 | 1996,Jeep,Grand Cherokee,"MUST SELL! 42 | air, moon roof, 43 | loaded" 44 | 1996,Jeep,Grand Cherokee,"MUST SELL! 45 | air, moon roof, 46 | loaded" 47 | 1996,Jeep,Grand Cherokee,"MUST SELL! 48 | air, moon roof, 49 | loaded" 50 | 1996,Jeep,Grand Cherokee,"MUST SELL! 51 | air, moon roof, 52 | loaded" 53 | 1996,Jeep,Grand Cherokee,"MUST SELL! 54 | air, moon roof, 55 | loaded" 56 | 1996,Jeep,Grand Cherokee,"MUST SELL! 57 | air, moon roof, 58 | loaded" 59 | -------------------------------------------------------------------------------- /src/test/resources/test_input_invalid_headers.csv: -------------------------------------------------------------------------------- 1 | 1999,Chevy,"Venture ""Extended Edition""" -------------------------------------------------------------------------------- /src/test/resources/test_input_mixed_quotes.csv: -------------------------------------------------------------------------------- 1 | YEAR,MAKE,DESCRIPTION 2 | 1991,Ford,E350,"Super, luxurious truck" 3 | 1992,Jeep,Grand Cherokee,"MUST SELL! 4 | air, moon roof, loaded" 5 | 1993,Chevy,"Venture ""Extended Edition""" 6 | 1994,Chevy,"Venture ""Extended Edition, Very Large""" 7 | 1995,Ford,E350,Super luxurious truck 8 | 1996,Jeep,Grand Cherokee,MUST SELL! 9 | 1997,Chevy,Extended Edition 10 | 1998,Chevy,Very Large 11 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 12 | 1997,Ford,E350,"Super, luxurious truck" 13 | 1996,Jeep,Grand Cherokee,"MUST SELL! 14 | air, moon roof, loaded" 15 | 1997,Ford,E350,"Super, luxurious truck" 16 | 1996,Jeep,Grand Cherokee,"MUST SELL! 17 | air, moon roof, loaded" 18 | 1999,Chevy,"Venture ""Extended Edition""" 19 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 20 | 1997,Ford,E350,Super luxurious truck 21 | 1996,Jeep,Grand Cherokee,MUST SELL! 22 | 1999,Chevy,Extended Edition 23 | 1999,Chevy,Very Large 24 | 1997,Ford,E350,"Super, luxurious truck" 25 | 1996,Jeep,Grand Cherokee,"MUST SELL! 26 | air, moon roof, loaded" 27 | 1997,Ford,E350,Super luxurious truck 28 | 1996,Jeep,Grand Cherokee,MUST SELL! 29 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 30 | 1997,Ford,E350,"Super, luxurious truck" 31 | 1996,Jeep,Grand Cherokee,"MUST SELL! 
32 | air, moon roof, loaded" 33 | 1999,Chevy,Extended Edition 34 | 1999,Chevy,Very Large 35 | 1999,Chevy,"Venture ""Extended Edition""" 36 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 37 | 1997,Ford,E350,Super luxurious truck 38 | 1996,Jeep,Grand Cherokee,MUST SELL! 39 | 1999,Chevy,Extended Edition 40 | 1999,Chevy,Very Large 41 | 1997,Ford,E350,"Super, luxurious truck" 42 | 1996,Jeep,Grand Cherokee,"MUST SELL! 43 | air, moon roof, loaded" 44 | 1999,Chevy,"Venture ""Extended Edition""" 45 | 1997,Ford,E350,Super luxurious truck 46 | 1996,Jeep,Grand Cherokee,MUST SELL! 47 | 1999,Chevy,Extended Edition 48 | 1999,Chevy,Very Large 49 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 50 | 1997,Ford,E350,"Super, luxurious truck" 51 | 1996,Jeep,Grand Cherokee,"MUST SELL! 52 | air, moon roof, loaded" 53 | 1997,Ford,E350,Super luxurious truck 54 | 1996,Jeep,Grand Cherokee,MUST SELL! 55 | 1999,Chevy,Extended Edition 56 | 1999,Chevy,Very Large 57 | 1999,Chevy,"Venture ""Extended Edition""" 58 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 59 | 1997,Ford,E350,Super luxurious truck 60 | 1996,Jeep,Grand Cherokee,MUST SELL! 61 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 62 | 1997,Ford,E350,"Super, luxurious truck" 63 | 1996,Jeep,Grand Cherokee,"MUST SELL! 64 | air, moon roof, loaded" 65 | 1999,Chevy,Extended Edition 66 | 1999,Chevy,Very Large -------------------------------------------------------------------------------- /src/test/resources/test_input_no_split.csv: -------------------------------------------------------------------------------- 1 | YEAR,MAKE,DESCRIPTION 2 | 1999,Chevy,"Venture ""Extended Edition""" 3 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 4 | 1997,Ford,E350,"Super, luxurious truck" 5 | 1996,Jeep,Grand Cherokee,"MUST SELL! 
6 | air, moon roof, loaded" -------------------------------------------------------------------------------- /src/test/resources/test_input_no_split2.csv: -------------------------------------------------------------------------------- 1 | YEAR,MAKE,DESCRIPTION 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 
568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | -------------------------------------------------------------------------------- /src/test/resources/test_input_parse.csv: -------------------------------------------------------------------------------- 1 | 1999,Chevy,"Venture ""Extended Edition""" 2 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 3 | 1997,Ford,"Super, luxurious truck" 4 | 1996,Jeep,"MUST SELL! 5 | air, moon roof, loaded" -------------------------------------------------------------------------------- /src/test/resources/test_input_parse2.csv: -------------------------------------------------------------------------------- 1 | 0,false,0.1,"test",2018-08-24 2 | 1,true,1.0,"google",2019-06-21 3 | -------------------------------------------------------------------------------- /src/test/resources/test_input_valid_headers1.csv: -------------------------------------------------------------------------------- 1 | "YEAR",MAKE,DESCRIPTION 2 | 1999,Chevy,"Venture ""Extended Edition""" -------------------------------------------------------------------------------- /src/test/resources/test_input_valid_headers2.csv: -------------------------------------------------------------------------------- 1 | BIRTH.DATE,GENDER,HEIGHT 2 | 1990,M,180 -------------------------------------------------------------------------------- /src/test/resources/test_input_with_quotes.csv: -------------------------------------------------------------------------------- 1 | YEAR,MAKE,DESCRIPTION 2 | 1997,Ford,E350,"Super, luxurious truck" 3 | 1996,Jeep,Grand Cherokee,"MUST SELL! 4 | air, moon roof, loaded" 5 | 1999,Chevy,"Venture ""Extended Edition""" 6 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 7 | 1997,Ford,E350,"Super, luxurious truck" 8 | 1996,Jeep,Grand Cherokee,"MUST SELL! 
9 | air, moon roof, loaded" 10 | 1999,Chevy,"Venture ""Extended Edition""" 11 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 12 | 1997,Ford,E350,"Super, luxurious truck" 13 | 1996,Jeep,Grand Cherokee,"MUST SELL! 14 | air, moon roof, loaded" 15 | 1999,Chevy,"Venture ""Extended Edition""" 16 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 17 | 1997,Ford,E350,"Super, luxurious truck" 18 | 1996,Jeep,Grand Cherokee,"MUST SELL! 19 | air, moon roof, loaded" 20 | 1999,Chevy,"Venture ""Extended Edition""" 21 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" 22 | 1997,Ford,E350,"Super, luxurious truck" 23 | 1996,Jeep,Grand Cherokee,"MUST SELL! 24 | air, moon roof, loaded" 25 | 1999,Chevy,"Venture ""Extended Edition""" 26 | 1999,Chevy,"Venture ""Extended Edition, Very Large""" -------------------------------------------------------------------------------- /src/test/resources/test_input_without_quotes.csv: -------------------------------------------------------------------------------- 1 | YEAR,MAKE,DESCRIPTION 2 | 1997,Ford,E350,Super luxurious truck 3 | 1996,Jeep,Grand Cherokee,MUST SELL! 4 | 1999,Chevy,Extended Edition 5 | 1999,Chevy,Very Large 6 | 1997,Ford,E350,Super luxurious truck 7 | 1996,Jeep,Grand Cherokee,MUST SELL! 8 | 1999,Chevy,Extended Edition 9 | 1999,Chevy,Very Large 10 | 1997,Ford,E350,Super luxurious truck 11 | 1996,Jeep,Grand Cherokee,MUST SELL! 12 | 1999,Chevy,Extended Edition 13 | 1999,Chevy,Very Large 14 | 1997,Ford,E350,Super luxurious truck 15 | 1996,Jeep,Grand Cherokee,MUST SELL! 16 | 1999,Chevy,Extended Edition 17 | 1999,Chevy,Very Large 18 | 1997,Ford,E350,Super luxurious truck 19 | 1996,Jeep,Grand Cherokee,MUST SELL! 20 | 1999,Chevy,Extended Edition 21 | 1999,Chevy,Very Large 22 | 1997,Ford,E350,Super luxurious truck 23 | 1996,Jeep,Grand Cherokee,MUST SELL! 24 | 1999,Chevy,Extended Edition 25 | 1999,Chevy,Very Large 26 | 1997,Ford,E350,Super luxurious truck 27 | 1996,Jeep,Grand Cherokee,MUST SELL! 28 | 1999,Chevy,Extended Edition 29 | 1999,Chevy,Very Large --------------------------------------------------------------------------------