├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.md ├── build.gradle ├── config └── checkstyle │ ├── checkstyle.xml │ └── default.xml ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── lib └── embulk │ └── filter │ └── mask.rb └── src ├── main └── java │ └── org │ └── embulk │ └── filter │ └── mask │ ├── MaskFilterPlugin.java │ └── MaskPageOutput.java └── test └── java └── org └── embulk └── filter └── mask └── TestMaskFilterPlugin.java /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /pkg/ 3 | /tmp/ 4 | *.gemspec 5 | .gradle/ 6 | /classpath/ 7 | build/ 8 | .idea 9 | /.settings/ 10 | /.metadata/ 11 | .classpath 12 | .project 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | # Tentatively ignore these environments as we face SSL errors 4 | # https://github.com/gradle/gradle/issues/2421 5 | # 6 | # - openjdk7 7 | # - oraclejdk7 8 | - oraclejdk8 9 | script: 10 | - ./gradlew test 11 | after_success: 12 | - ./gradlew jacocoTestReport coveralls 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the 6 | "Software"), to deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, merge, publish, 8 | distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so, subject to 10 | the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in 
all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mask filter plugin for Embulk 2 | 3 | [![Coverage Status](https://coveralls.io/repos/github/beniyama/embulk-filter-mask/badge.svg)](https://coveralls.io/github/beniyama/embulk-filter-mask) 4 | 5 | Mask columns with asterisks in a variety of patterns (still in initial development phase and missing basic features to use in production). 6 | 7 | ## Overview 8 | 9 | * **Plugin type**: filter 10 | 11 | ## Configuration 12 | 13 | *Caution* : Now we use `type` to specify mask types such as `all` and `email`, instead of `pattern` which was used in version 0.1.1 or earlier. 14 | 15 | - **columns**: target columns which would be replaced with asterisks (string, required) 16 | - **name**: name of the column (string, required) 17 | - **type**: mask type, `all`, `email`, `regex` or `substring` (string, default: `all`) 18 | - **paths**: list of JSON path and type, works if the column type is JSON 19 | - `[{key: $.json_path1}, {key: $.json_path2}]` would mask both `$.json_path1` and `$.json_path2` nodes 20 | - Elements under the nodes would be converted to string and then masked (e.g., `[0,1,2]` -> `*******`) 21 | - **length**: if specified, this filter replaces the column with fixed number of asterisks (integer, optional. 
supported only in `all`, `email`, `substring`.) 22 | - **pattern**: Regex pattern such as "[0-9]+" (string, required for `regex` type) 23 | - **start**: The beginning index for `substring` type. The value starts from 0 and is inclusive (integer, default: 0) 24 | - **end**: The ending index for `substring` type. The value is exclusive (integer, default: length of the target column) 25 | 26 | ## Example 27 | 28 | 29 | 30 | If you have the data below in a CSV or other format file, 31 | 32 | |first_name | last_name | gender | age | contact | 33 | |---|---|---|---|---| 34 | | Benjamin | Bell | male | 30 | bell.benjamin_dummy@example.com | 35 | | Lucas | Duncan | male | 20 | lucas.duncan_dummy@example.com | 36 | | Elizabeth | May | female | 25 | elizabeth.may_dummy@example.com | 37 | | Christian | Reid | male | 15 | christian.reid_dummy@example.com | 38 | | Amy | Avery | female | 40 | amy.avery_dummy@example.com | 39 | 40 | the filter configuration below 41 | 42 | ```yaml 43 | filters: 44 | - type: mask 45 | columns: 46 | - { name: last_name} 47 | - { name: age} 48 | - { name: contact, type: email, length: 5} 49 | ``` 50 | 51 | would produce 52 | 53 | |first_name | last_name | gender | age | contact | 54 | |---|---|---|---|---| 55 | | Benjamin | **** | male | ** | *****@example.com | 56 | | Lucas | ****** | male | ** | *****@example.com | 57 | | Elizabeth | *** | female | ** | *****@example.com | 58 | | Christian | **** | male | ** | *****@example.com | 59 | | Amy | ***** | female | ** | *****@example.com | 60 | 61 | If you use `regex` and/or `substring` types, 62 | 63 | ```yaml 64 | filters: 65 | - type: mask 66 | columns: 67 | - { name: first_name, type: regex, pattern: "[a-z]"} 68 | - { name: contact, type: substring, start: 5, length: 5} 69 | ``` 70 | 71 | would produce 72 | 73 | |first_name | last_name | gender | age | contact | 74 | |---|---|---|---|---| 75 | | B******* | Bell | male | 30 | bell.***** | 76 | | L**** | Duncan | male | 20 | lucas***** | 77 | | E******* | May | 
female | 25 | eliza***** | 78 | | C******** | Reid | male | 15 | chris***** | 79 | | A** | Avery | female | 40 | amy.a***** | 80 | 81 | JSON type column is also partially supported. 82 | 83 | If you have a `user` column with this JSON data structure 84 | 85 | ```json 86 | { 87 | "full_name": { 88 | "first_name": "Benjamin", 89 | "last_name": "Bell" 90 | }, 91 | "gender": "male", 92 | "age": 30, 93 | "email": "test_mail@example.com" 94 | } 95 | ``` 96 | 97 | below filter configuration 98 | 99 | ```yaml 100 | filters: 101 | - type: mask 102 | columns: 103 | - { name: user, paths: [{key: $.full_name.first_name}, {key: $.email, type: email}]} 104 | ``` 105 | 106 | would produce 107 | 108 | ```json 109 | { 110 | "full_name": { 111 | "first_name": "********", 112 | "last_name": "Bell" 113 | }, 114 | "gender": "male", 115 | "age": 30, 116 | "email": "*********@example.com" 117 | } 118 | ``` 119 | 120 | 121 | ## Build 122 | 123 | ``` 124 | $ ./gradlew gem # -t to watch change of files and rebuild continuously 125 | ``` 126 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id "com.jfrog.bintray" version "1.1" 3 | id "com.github.jruby-gradle.base" version "0.1.5" 4 | id "com.github.kt3k.coveralls" version "2.8.1" 5 | id "java" 6 | id "checkstyle" 7 | id "jacoco" 8 | } 9 | import com.github.jrubygradle.JRubyExec 10 | repositories { 11 | mavenCentral() 12 | jcenter() 13 | } 14 | configurations { 15 | provided 16 | } 17 | 18 | version = "0.2.1" 19 | 20 | sourceCompatibility = 1.7 21 | targetCompatibility = 1.7 22 | 23 | dependencies { 24 | compile "org.embulk:embulk-core:0.8.29" 25 | provided "org.embulk:embulk-core:0.8.29" 26 | compile "com.jayway.jsonpath:json-path:2.+" 27 | testCompile "junit:junit:4.+" 28 | testCompile "org.embulk:embulk-core:0.8.29:tests" 29 | } 30 | 31 | jacocoTestReport { 32 | reports { 33 | xml.enabled = true 
// coveralls plugin depends on xml format report 34 | html.enabled = true 35 | } 36 | } 37 | 38 | task classpath(type: Copy, dependsOn: ["jar"]) { 39 | doFirst { file("classpath").deleteDir() } 40 | from (configurations.runtime - configurations.provided + files(jar.archivePath)) 41 | into "classpath" 42 | } 43 | clean { delete "classpath" } 44 | 45 | checkstyle { 46 | configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml") 47 | toolVersion = '6.14.1' 48 | } 49 | checkstyleMain { 50 | configFile = file("${project.rootDir}/config/checkstyle/default.xml") 51 | ignoreFailures = true 52 | } 53 | checkstyleTest { 54 | configFile = file("${project.rootDir}/config/checkstyle/default.xml") 55 | ignoreFailures = true 56 | } 57 | task checkstyle(type: Checkstyle) { 58 | classpath = sourceSets.main.output + sourceSets.test.output 59 | source = sourceSets.main.allJava + sourceSets.test.allJava 60 | } 61 | 62 | task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) { 63 | jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build" 64 | script "${project.name}.gemspec" 65 | doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") } 66 | } 67 | 68 | task gemPush(type: JRubyExec, dependsOn: ["gem"]) { 69 | jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push" 70 | script "pkg/${project.name}-${project.version}.gem" 71 | } 72 | 73 | task "package"(dependsOn: ["gemspec", "classpath"]) << { 74 | println "> Build succeeded." 75 | println "> You can run embulk with '-L ${file(".").absolutePath}' argument." 
76 | } 77 | 78 | task gemspec { 79 | ext.gemspecFile = file("${project.name}.gemspec") 80 | inputs.file "build.gradle" 81 | outputs.file gemspecFile 82 | doLast { gemspecFile.write($/ 83 | Gem::Specification.new do |spec| 84 | spec.name = "${project.name}" 85 | spec.version = "${project.version}" 86 | spec.authors = ["Tetsuo Yamabe"] 87 | spec.summary = %[Mask filter plugin for Embulk] 88 | spec.description = %[Mask] 89 | spec.email = ["tetsuo.yamabe@gmail.com"] 90 | spec.licenses = ["MIT"] 91 | spec.homepage = "https://github.com/beniyama/embulk-filter-mask" 92 | 93 | spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] 94 | spec.test_files = spec.files.grep(%r"^(test|spec)/") 95 | spec.require_paths = ["lib"] 96 | 97 | #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION'] 98 | spec.add_development_dependency 'bundler', ['~> 1.0'] 99 | spec.add_development_dependency 'rake', ['>= 10.0'] 100 | end 101 | /$) 102 | } 103 | } 104 | clean { delete "${project.name}.gemspec" } 105 | -------------------------------------------------------------------------------- /config/checkstyle/checkstyle.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /config/checkstyle/default.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beniyama/embulk-filter-mask/b1b545087d0fa50ecdc2d72773b3c58c4e20dc0f/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Tue Jul 12 16:30:09 JST 2016 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-all.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
10 | DEFAULT_JVM_OPTS="" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 16 | MAX_FD="maximum" 17 | 18 | warn ( ) { 19 | echo "$*" 20 | } 21 | 22 | die ( ) { 23 | echo 24 | echo "$*" 25 | echo 26 | exit 1 27 | } 28 | 29 | # OS specific support (must be 'true' or 'false'). 30 | cygwin=false 31 | msys=false 32 | darwin=false 33 | case "`uname`" in 34 | CYGWIN* ) 35 | cygwin=true 36 | ;; 37 | Darwin* ) 38 | darwin=true 39 | ;; 40 | MINGW* ) 41 | msys=true 42 | ;; 43 | esac 44 | 45 | # Attempt to set APP_HOME 46 | # Resolve links: $0 may be a link 47 | PRG="$0" 48 | # Need this for relative symlinks. 49 | while [ -h "$PRG" ] ; do 50 | ls=`ls -ld "$PRG"` 51 | link=`expr "$ls" : '.*-> \(.*\)$'` 52 | if expr "$link" : '/.*' > /dev/null; then 53 | PRG="$link" 54 | else 55 | PRG=`dirname "$PRG"`"/$link" 56 | fi 57 | done 58 | SAVED="`pwd`" 59 | cd "`dirname \"$PRG\"`/" >/dev/null 60 | APP_HOME="`pwd -P`" 61 | cd "$SAVED" >/dev/null 62 | 63 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 64 | 65 | # Determine the Java command to use to start the JVM. 66 | if [ -n "$JAVA_HOME" ] ; then 67 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 68 | # IBM's JDK on AIX uses strange locations for the executables 69 | JAVACMD="$JAVA_HOME/jre/sh/java" 70 | else 71 | JAVACMD="$JAVA_HOME/bin/java" 72 | fi 73 | if [ ! -x "$JAVACMD" ] ; then 74 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 75 | 76 | Please set the JAVA_HOME variable in your environment to match the 77 | location of your Java installation." 78 | fi 79 | else 80 | JAVACMD="java" 81 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 82 | 83 | Please set the JAVA_HOME variable in your environment to match the 84 | location of your Java installation." 85 | fi 86 | 87 | # Increase the maximum file descriptors if we can. 
88 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then 89 | MAX_FD_LIMIT=`ulimit -H -n` 90 | if [ $? -eq 0 ] ; then 91 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 92 | MAX_FD="$MAX_FD_LIMIT" 93 | fi 94 | ulimit -n $MAX_FD 95 | if [ $? -ne 0 ] ; then 96 | warn "Could not set maximum file descriptor limit: $MAX_FD" 97 | fi 98 | else 99 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 100 | fi 101 | fi 102 | 103 | # For Darwin, add options to specify how the application appears in the dock 104 | if $darwin; then 105 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 106 | fi 107 | 108 | # For Cygwin, switch paths to Windows format before running java 109 | if $cygwin ; then 110 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 111 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 112 | JAVACMD=`cygpath --unix "$JAVACMD"` 113 | 114 | # We build the pattern for arguments to be converted via cygpath 115 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 116 | SEP="" 117 | for dir in $ROOTDIRSRAW ; do 118 | ROOTDIRS="$ROOTDIRS$SEP$dir" 119 | SEP="|" 120 | done 121 | OURCYGPATTERN="(^($ROOTDIRS))" 122 | # Add a user-defined pattern to the cygpath arguments 123 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 124 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 125 | fi 126 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 127 | i=0 128 | for arg in "$@" ; do 129 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 130 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 131 | 132 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 133 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 134 | else 135 | eval `echo args$i`="\"$arg\"" 136 | fi 137 | i=$((i+1)) 138 | done 139 | case $i in 140 | (0) set -- ;; 141 | (1) set -- "$args0" ;; 142 | (2) set -- "$args0" "$args1" ;; 143 | (3) set -- "$args0" "$args1" 
"$args2" ;; 144 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 145 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 146 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 147 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 148 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 149 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 150 | esac 151 | fi 152 | 153 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 154 | function splitJvmOpts() { 155 | JVM_OPTS=("$@") 156 | } 157 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 158 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 159 | 160 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 161 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windowz variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 
84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /lib/embulk/filter/mask.rb: -------------------------------------------------------------------------------- 1 | Embulk::JavaPlugin.register_filter( 2 | "mask", "org.embulk.filter.mask.MaskFilterPlugin", 3 | File.expand_path('../../../../classpath', __FILE__)) 4 | -------------------------------------------------------------------------------- /src/main/java/org/embulk/filter/mask/MaskFilterPlugin.java: -------------------------------------------------------------------------------- 1 | package org.embulk.filter.mask; 2 | 3 | import com.google.common.base.Optional; 4 | import com.google.common.collect.ImmutableList; 5 | import org.embulk.config.Config; 6 | import org.embulk.config.ConfigDefault; 7 | import org.embulk.config.ConfigSource; 8 | import org.embulk.config.Task; 9 | import org.embulk.config.TaskSource; 10 | import org.embulk.spi.*; 11 | import org.embulk.spi.type.Type; 12 | import org.embulk.spi.type.Types; 13 | import org.slf4j.Logger; 14 | 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | 19 | public class MaskFilterPlugin implements FilterPlugin { 20 | private final Logger logger = Exec.getLogger(MaskFilterPlugin.class); 21 | 22 | public interface PluginTask extends Task { 23 | @Config("columns") 24 | List getColumns(); 25 | 26 | } 27 | 28 | public interface MaskColumn extends Task { 29 | @Config("name") 30 | String getName(); 31 | 32 | @Config("type") 33 | @ConfigDefault("\"all\"") 34 | Optional getType(); 35 | 36 | @Config("pattern") 37 | @ConfigDefault("\"all\"") 38 | Optional getPattern(); 39 | 40 | @Config("length") 41 | @ConfigDefault("null") 42 | Optional getLength(); 43 | 44 | @Config("start") 45 | @ConfigDefault("null") 46 | Optional getStart(); 47 | 48 | @Config("end") 49 | 
@ConfigDefault("null") 50 | Optional getEnd(); 51 | 52 | @Config("paths") 53 | @ConfigDefault("null") 54 | Optional>> getPaths(); 55 | } 56 | 57 | @Override 58 | public void transaction(ConfigSource config, Schema inputSchema, 59 | FilterPlugin.Control control) { 60 | PluginTask task = config.loadConfig(PluginTask.class); 61 | Schema outputSchema = buildOutputSchema(task, inputSchema); 62 | control.run(task.dump(), outputSchema); 63 | } 64 | 65 | 66 | private Schema buildOutputSchema(PluginTask task, Schema inputSchema) { 67 | ImmutableList.Builder builder = ImmutableList.builder(); 68 | 69 | Map maskColumnMap = getMaskColumnMap(task); 70 | int i = 0; 71 | for (Column inputColumn : inputSchema.getColumns()) { 72 | String name = inputColumn.getName(); 73 | Type type = (maskColumnMap.containsKey(name) && inputColumn.getType() != Types.JSON) ? Types.STRING : inputColumn.getType(); 74 | Column outputColumn = new Column(i++, inputColumn.getName(), type); 75 | builder.add(outputColumn); 76 | } 77 | 78 | Schema outputSchema = new Schema(builder.build()); 79 | return outputSchema; 80 | } 81 | 82 | public static Map getMaskColumnMap(PluginTask task) { 83 | Map maskColumnMap = new HashMap<>(); 84 | for (MaskColumn maskColumn : task.getColumns()) { 85 | maskColumnMap.put(maskColumn.getName(), maskColumn); 86 | } 87 | return maskColumnMap; 88 | } 89 | 90 | @Override 91 | public PageOutput open(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) { 92 | return new MaskPageOutput(taskSource, inputSchema, outputSchema, output); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/org/embulk/filter/mask/MaskPageOutput.java: -------------------------------------------------------------------------------- 1 | package org.embulk.filter.mask; 2 | 3 | import com.fasterxml.jackson.databind.node.TextNode; 4 | import com.jayway.jsonpath.*; 5 | import org.apache.commons.lang3.StringUtils; 6 | import 
org.embulk.config.TaskSource; 7 | import org.embulk.spi.*; 8 | import org.embulk.spi.json.JsonParser; 9 | import org.embulk.spi.time.Timestamp; 10 | import org.embulk.spi.type.Type; 11 | import org.embulk.spi.type.Types; 12 | import org.embulk.filter.mask.MaskFilterPlugin.*; 13 | import org.msgpack.value.Value; 14 | import org.slf4j.Logger; 15 | 16 | import java.util.ArrayList; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | import java.util.regex.Matcher; 21 | import java.util.regex.Pattern; 22 | 23 | public class MaskPageOutput implements PageOutput { 24 | private final MaskFilterPlugin.PluginTask task; 25 | private final Map outputColumnMap; 26 | private final List inputColumns; 27 | private final Map maskColumnMap; 28 | private final PageReader reader; 29 | private final PageBuilder builder; 30 | private final ParseContext parseContext; 31 | private final JsonParser jsonParser; 32 | private final Logger logger = Exec.getLogger(MaskPageOutput.class); 33 | 34 | public MaskPageOutput(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) { 35 | this.task = taskSource.loadTask(MaskFilterPlugin.PluginTask.class); 36 | this.inputColumns = inputSchema.getColumns(); 37 | this.maskColumnMap = MaskFilterPlugin.getMaskColumnMap(this.task); 38 | this.reader = new PageReader(inputSchema); 39 | this.builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output); 40 | this.outputColumnMap = new HashMap<>(); 41 | for (Column column : outputSchema.getColumns()) { 42 | this.outputColumnMap.put(column.getName(), column); 43 | } 44 | this.parseContext = initializeParseContext(); 45 | this.jsonParser = new JsonParser(); 46 | } 47 | 48 | private ParseContext initializeParseContext() { 49 | Configuration conf = Configuration.defaultConfiguration(); 50 | conf = conf.addOptions(Option.DEFAULT_PATH_LEAF_TO_NULL); 51 | conf = conf.addOptions(Option.SUPPRESS_EXCEPTIONS); 52 | return JsonPath.using(conf); 53 
| } 54 | 55 | @Override 56 | public void add(Page page) { 57 | reader.setPage(page); 58 | while (reader.nextRecord()) { 59 | setValue(); 60 | builder.addRecord(); 61 | } 62 | } 63 | 64 | private void setValue() { 65 | for (Column inputColumn : inputColumns) { 66 | if (reader.isNull(inputColumn)) { 67 | builder.setNull(inputColumn); 68 | continue; 69 | } 70 | 71 | String name = inputColumn.getName(); 72 | Type type = inputColumn.getType(); 73 | 74 | if (Types.STRING.equals(type)) { 75 | final String value = reader.getString(inputColumn); 76 | if (maskColumnMap.containsKey(name)) { 77 | builder.setString(inputColumn, maskAsString(name, value)); 78 | } else { 79 | builder.setString(inputColumn, value); 80 | } 81 | } else if (Types.BOOLEAN.equals(type)) { 82 | final boolean value = reader.getBoolean(inputColumn); 83 | if (maskColumnMap.containsKey(name)) { 84 | builder.setString(inputColumn, maskAsString(name, value)); 85 | } else { 86 | builder.setBoolean(inputColumn, value); 87 | } 88 | } else if (Types.DOUBLE.equals(type)) { 89 | final double value = reader.getDouble(inputColumn); 90 | if (maskColumnMap.containsKey(name)) { 91 | builder.setString(inputColumn, maskAsString(name, value)); 92 | } else { 93 | builder.setDouble(inputColumn, value); 94 | } 95 | } else if (Types.LONG.equals(type)) { 96 | final long value = reader.getLong(inputColumn); 97 | if (maskColumnMap.containsKey(name)) { 98 | builder.setString(inputColumn, maskAsString(name, value)); 99 | } else { 100 | builder.setLong(inputColumn, value); 101 | } 102 | } else if (Types.TIMESTAMP.equals(type)) { 103 | final Timestamp value = reader.getTimestamp(inputColumn); 104 | if (maskColumnMap.containsKey(name)) { 105 | builder.setString(inputColumn, maskAsString(name, value)); 106 | } else { 107 | builder.setTimestamp(inputColumn, value); 108 | } 109 | } else if (Types.JSON.equals(type)) { 110 | final Value value = reader.getJson(inputColumn); 111 | if (maskColumnMap.containsKey(name)) { 112 | 
builder.setJson(inputColumn, maskAsJson(name, value)); 113 | } else { 114 | builder.setJson(inputColumn, value); 115 | } 116 | } else { 117 | throw new DataException("Unexpected type:" + type); 118 | } 119 | } 120 | } 121 | 122 | private String maskAsString(String name, Object value) { 123 | MaskColumn maskColumn = maskColumnMap.get(name); 124 | String type = maskColumn.getType().get(); 125 | String pattern = maskColumn.getPattern().or(""); 126 | Integer length = maskColumn.getLength().or(-1); 127 | Integer start = maskColumn.getStart().or(-1); 128 | Integer end = maskColumn.getEnd().or(-1); 129 | 130 | return mask(type, value, pattern, length, start, end); 131 | } 132 | 133 | private Value maskAsJson(String name, Value value) { 134 | MaskColumn maskColumn = maskColumnMap.get(name); 135 | DocumentContext context = parseContext.parse(value.toJson()); 136 | List> paths = maskColumn.getPaths().or(new ArrayList>()); 137 | 138 | for (Map path : paths) { 139 | String key = path.get("key"); 140 | String type = path.containsKey("type") ? path.get("type") : "all"; 141 | String pattern = path.containsKey("pattern") ? path.get("pattern") : ""; 142 | Integer length = path.containsKey("length") ? Integer.parseInt(path.get("length")) : -1; 143 | Integer start = path.containsKey("start") ? Integer.parseInt(path.get("start")) : -1; 144 | Integer end = path.containsKey("end") ? 
Integer.parseInt(path.get("end")) : -1; 145 | Object element = context.read(key); 146 | if (!key.equals("$") && element != null) { 147 | String maskedValue = mask(type, element, pattern, length, start, end); 148 | context.set(key, new TextNode(maskedValue).asText()).jsonString(); 149 | } 150 | } 151 | return jsonParser.parse(context.jsonString()); 152 | } 153 | 154 | @Override 155 | public void finish() { 156 | builder.finish(); 157 | } 158 | 159 | @Override 160 | public void close() { 161 | builder.close(); 162 | } 163 | 164 | private String mask(String type, Object value, String pattern, Integer length, Integer start, Integer end) { 165 | String maskedValue; 166 | String nakedValue = value.toString(); 167 | if (type.equals("regex")) { 168 | maskedValue = maskRegex(nakedValue, pattern); 169 | } else if (type.equals("substring")) { 170 | maskedValue = maskSubstring(nakedValue, start, end, length); 171 | } else if (type.equals("email")) { 172 | maskedValue = maskEmail(nakedValue, length); 173 | } else if (type.equals("all")) { 174 | maskedValue = maskAll(nakedValue, length); 175 | } else { 176 | maskedValue = nakedValue; 177 | } 178 | return maskedValue; 179 | } 180 | 181 | private String maskAll(Object value, Integer length) { 182 | String maskedValue; 183 | String nakedValue = value.toString(); 184 | if (length > 0) { 185 | maskedValue = StringUtils.repeat("*", length); 186 | } else { 187 | maskedValue = nakedValue.replaceAll(".", "*"); 188 | } 189 | return maskedValue; 190 | } 191 | 192 | private String maskEmail(Object value, Integer length) { 193 | String maskedValue; 194 | String nakedValue = value.toString(); 195 | if (length > 0) { 196 | String maskPattern = StringUtils.repeat("*", length) + "@$1"; 197 | maskedValue = nakedValue.replaceFirst("^.+?@(.+)$", maskPattern); 198 | } else { 199 | maskedValue = nakedValue.replaceAll(".(?=[^@]*@)", "*"); 200 | } 201 | return maskedValue; 202 | } 203 | 204 | private String maskRegex(Object value, String pattern) { 205 
| String nakedValue = value.toString(); 206 | return nakedValue.replaceAll(pattern, "*"); 207 | } 208 | 209 | private String maskSubstring(Object value, Integer start, Integer end, Integer length) { 210 | String nakedValue = value.toString(); 211 | 212 | if (nakedValue.length() <= start || (0 <= end && (end - 1) <= start)) return nakedValue; 213 | 214 | start = start < 0 ? 0 : start; 215 | end = (end < 0 || nakedValue.length() <= end) ? nakedValue.length() : end; 216 | int repeat = length > 0 ? length : end - start; 217 | 218 | StringBuffer buffer = new StringBuffer(nakedValue); 219 | return buffer.replace(start, end, StringUtils.repeat("*", repeat)).toString(); 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/test/java/org/embulk/filter/mask/TestMaskFilterPlugin.java: -------------------------------------------------------------------------------- 1 | package org.embulk.filter.mask; 2 | 3 | import com.fasterxml.jackson.core.JsonProcessingException; 4 | import com.fasterxml.jackson.databind.ObjectMapper; 5 | import com.google.common.base.Throwables; 6 | import com.google.common.collect.ImmutableMap; 7 | import org.embulk.EmbulkTestRuntime; 8 | import org.embulk.config.ConfigException; 9 | import org.embulk.config.ConfigLoader; 10 | import org.embulk.config.ConfigSource; 11 | import org.embulk.config.TaskSource; 12 | import org.embulk.spi.*; 13 | import org.embulk.spi.TestPageBuilderReader.*; 14 | import org.embulk.spi.time.Timestamp; 15 | import org.embulk.spi.util.Pages; 16 | import org.junit.Rule; 17 | import org.junit.Test; 18 | import org.junit.rules.ExpectedException; 19 | import org.msgpack.value.Value; 20 | 21 | import java.util.List; 22 | 23 | 24 | import static org.embulk.filter.mask.MaskFilterPlugin.PluginTask; 25 | import static org.embulk.filter.mask.MaskFilterPlugin.Control; 26 | import static org.embulk.spi.type.Types.*; 27 | import static org.junit.Assert.assertEquals; 28 | import static 
org.msgpack.value.ValueFactory.*;

/**
 * Tests for {@link MaskFilterPlugin}: each test loads a YAML configuration, pushes a single
 * record through the filter and compares the resulting record with the expected masked values.
 */
public class TestMaskFilterPlugin {
    @Rule
    public EmbulkTestRuntime runtime = new EmbulkTestRuntime();

    @Rule
    public ExpectedException exception = ExpectedException.none();

    private static Value s(String value) {
        return newString(value);
    }

    private static Value i(int value) {
        return newInteger(value);
    }

    private static Value f(double value) {
        return newFloat(value);
    }

    private static Value b(boolean value) {
        return newBoolean(value);
    }

    /** Parses a YAML string into a plugin configuration. */
    private ConfigSource getConfigFromYaml(String yaml) {
        ConfigLoader loader = new ConfigLoader(Exec.getModelManager());
        return loader.fromYamlString(yaml);
    }

    /** Returns a string of {@code count} asterisks. */
    private static String asterisks(int count) {
        StringBuilder masked = new StringBuilder(count);
        for (int n = 0; n < count; n++) {
            masked.append('*');
        }
        return masked.toString();
    }

    /** Returns one asterisk per character of {@code value.toString()}. */
    private String getMaskedCharacters(Object value) {
        return asterisks(value.toString().length());
    }

    /**
     * Returns {@code email} with every character before the first {@code @} replaced by an
     * asterisk; when there is no {@code @}, every character is replaced.
     */
    private String getMaskedEmail(String email) {
        int at = email.indexOf('@');
        if (at < 0) {
            return asterisks(email.length());
        }
        return asterisks(at) + email.substring(at);
    }

    /**
     * Builds pages from {@code values}, feeds them through the filter opened from
     * {@code plugin} and returns the records collected on the output side.
     *
     * @param plugin       the filter under test
     * @param taskSource   the task source handed to the transaction callback
     * @param inputSchema  schema of the input record
     * @param outputSchema schema produced by the filter
     * @param values       column values of the single input record, in schema order
     * @return the records read back from the filter output
     */
    private List<Object[]> runFilter(MaskFilterPlugin plugin, TaskSource taskSource,
                                     Schema inputSchema, Schema outputSchema, Object... values) {
        MockPageOutput mockPageOutput = new MockPageOutput();
        try (PageOutput pageOutput = plugin.open(taskSource, inputSchema, outputSchema, mockPageOutput)) {
            for (Page page : PageTestUtils.buildPage(runtime.getBufferAllocator(), inputSchema, values)) {
                pageOutput.add(page);
            }
            pageOutput.finish();
        }
        return Pages.toObjects(outputSchema, mockPageOutput.pages);
    }

    /** Loading a configuration without {@code columns} must fail with a ConfigException. */
    @Test
    public void testThrowExceptionAtMissingColumnsField() {
        String configYaml = "type: mask";
        ConfigSource config = getConfigFromYaml(configYaml);

        exception.expect(ConfigException.class);
        exception.expectMessage("Field 'columns' is required but not set");
        config.loadConfig(PluginTask.class);
    }

    /** Only the columns named in the configuration are masked; the others pass through. */
    @Test
    public void testOnlyMaskTargetColumns() {
        String configYaml = "" +
                "type: mask\n" +
                "columns:\n" +
                " - { name: _c0}\n" +
                " - { name: _c2}\n";

        ConfigSource config = getConfigFromYaml(configYaml);

        final Schema inputSchema = Schema.builder()
                .add("_c0", STRING)
                .add("_c1", STRING)
                .add("_c2", STRING)
                .add("_c3", STRING)
                .build();

        final MaskFilterPlugin maskFilterPlugin = new MaskFilterPlugin();
        maskFilterPlugin.transaction(config, inputSchema, new Control() {
            @Override
            public void run(TaskSource taskSource, Schema outputSchema) {
                final String c0ColumnValue = "_c0_THIS_MUST_BE_MASKED";
                final String c1ColumnValue = "_c1_THIS_MUST_NOT_BE_MASKED";
                final String c2ColumnValue = "_c2_THIS_MUST_BE_MASKED_ALSO";
                final String c3ColumnValue = "_c3_THIS_MUST_NOT_BE_MASKED_ALSO";

                List<Object[]> records = runFilter(maskFilterPlugin, taskSource, inputSchema, outputSchema,
                        c0ColumnValue,
                        c1ColumnValue,
                        c2ColumnValue,
                        c3ColumnValue);

                assertEquals(1, records.size());
                Object[] record = records.get(0);

                assertEquals(4, record.length);
                assertEquals(getMaskedCharacters(c0ColumnValue), record[0]);
                assertEquals(c1ColumnValue, record[1]);
                assertEquals(getMaskedCharacters(c2ColumnValue), record[2]);
                assertEquals(c3ColumnValue, record[3]);
            }
        });
    }

    /** Columns of every type pass through unchanged when the configuration names none of them. */
    @Test
    public void testPassVarietyOfTypes() {
        String configYaml = "" +
                "type: mask\n" +
                "columns:\n" +
                " - { name: _dummy}\n";

        ConfigSource config = getConfigFromYaml(configYaml);

        final Schema inputSchema = Schema.builder()
                .add("_c0", STRING)
                .add("_c1", BOOLEAN)
                .add("_c2", DOUBLE)
                .add("_c3", LONG)
                .add("_c4", TIMESTAMP)
                .add("_c5", JSON)
                .build();

        final MaskFilterPlugin maskFilterPlugin = new MaskFilterPlugin();
        maskFilterPlugin.transaction(config, inputSchema, new Control() {
            @Override
            public void run(TaskSource taskSource, Schema outputSchema) {
                final String c0ColumnValue = "_c0_STRING";
                final Boolean c1ColumnValue = false;
                final Double c2ColumnValue = 12345.6789;
                final Long c3ColumnValue = Long.MAX_VALUE;
                final Timestamp c4ColumnValue = Timestamp.ofEpochSecond(4);
                final Value c5ColumnValue = newMapBuilder().put(s("_c5"), s("_v5")).build();

                List<Object[]> records = runFilter(maskFilterPlugin, taskSource, inputSchema, outputSchema,
                        c0ColumnValue,
                        c1ColumnValue,
                        c2ColumnValue,
                        c3ColumnValue,
                        c4ColumnValue,
                        c5ColumnValue);

                assertEquals(1, records.size());
                Object[] record = records.get(0);

                assertEquals(6, record.length);
                assertEquals(c0ColumnValue, record[0]);
                assertEquals(c1ColumnValue, record[1]);
                assertEquals(c2ColumnValue, record[2]);
                assertEquals(c3ColumnValue, record[3]);
                assertEquals(c4ColumnValue, record[4]);
                assertEquals(c5ColumnValue, record[5]);
            }
        });
    }

    /**
     * Configured columns of non-JSON types are expected to come out as asterisks, one per
     * character of the input value's {@code toString()} form.
     */
    @Test
    public void testMaskVarietyOfTypes() {
        String configYaml = "" +
                "type: mask\n" +
                "columns:\n" +
                " - { name: _c0}\n" +
                " - { name: _c1}\n" +
                " - { name: _c2}\n" +
                " - { name: _c3}\n" +
                " - { name: _c4}\n";

        ConfigSource config = getConfigFromYaml(configYaml);

        final Schema inputSchema = Schema.builder()
                .add("_c0", STRING)
                .add("_c1", BOOLEAN)
                .add("_c2", DOUBLE)
                .add("_c3", LONG)
                .add("_c4", TIMESTAMP)
                .build();

        final MaskFilterPlugin maskFilterPlugin = new MaskFilterPlugin();
        maskFilterPlugin.transaction(config, inputSchema, new Control() {
            @Override
            public void run(TaskSource taskSource, Schema outputSchema) {
                final String c0ColumnValue = "_c0_STRING";
                final Boolean c1ColumnValue = false;
                final Double c2ColumnValue = 12345.6789;
                final Long c3ColumnValue = Long.MAX_VALUE;
                final Timestamp c4ColumnValue = Timestamp.ofEpochSecond(4);

                List<Object[]> records = runFilter(maskFilterPlugin, taskSource, inputSchema, outputSchema,
                        c0ColumnValue,
                        c1ColumnValue,
                        c2ColumnValue,
                        c3ColumnValue,
                        c4ColumnValue);

                assertEquals(1, records.size());
                Object[] record = records.get(0);

                assertEquals(5, record.length);
                assertEquals(getMaskedCharacters(c0ColumnValue), record[0]);
                assertEquals(getMaskedCharacters(c1ColumnValue), record[1]);
                assertEquals(getMaskedCharacters(c2ColumnValue), record[2]);
                assertEquals(getMaskedCharacters(c3ColumnValue), record[3]);
                assertEquals(getMaskedCharacters(c4ColumnValue), record[4]);
            }
        });
    }

    /**
     * JSON columns are masked per configured path; a JSON column configured without
     * {@code paths} is expected to stay unchanged.
     */
    @Test
    public void testMaskJson() {
        String configYaml = "" +
                "type: mask\n" +
                "columns:\n" +
                " - { name: _c0}\n" +
                " - { name: _c1, paths: [{key: $.root.key1}]}\n" +
                " - { name: _c2, paths: [{key: $.root.key3, length: 2}, {key: $.root.key4, type: all}]}\n" +
                " - { name: _c3, paths: [{key: $.root.key1}, {key: $.root.key3.key7, type: email, length: 3}]}\n" +
                " - { name: _c4, paths: [{key: $.root.key1, type: regex, pattern: \"[0-9]\"}]}\n" +
                " - { name: _c5, paths: [{key: $.root.key1, type: substring, start: 2, end: 4, length: 5}]}\n";

        ConfigSource config = getConfigFromYaml(configYaml);

        final Schema inputSchema = Schema.builder()
                .add("_c0", JSON)
                .add("_c1", JSON)
                .add("_c2", JSON)
                .add("_c3", JSON)
                .add("_c4", JSON)
                .add("_c5", JSON)
                .build();

        final MaskFilterPlugin maskFilterPlugin = new MaskFilterPlugin();
        maskFilterPlugin.transaction(config, inputSchema, new Control() {
            @Override
            public void run(TaskSource taskSource, Schema outputSchema) {
                final Value jsonValue = newMapBuilder().put(
                        s("root"),
                        newMap(
                                s("key1"), s("value1"),
                                s("key2"), i(2),
                                s("key3"), newMap(
                                        s("key5"), s("value5"),
                                        s("key6"), newArray(i(0), i(1), i(2), i(3), i(4)),
                                        s("key7"), s("testme@example.com")
                                ),
                                s("key4"), newArray(i(0), i(1), i(2), i(3), i(4))
                        )
                ).build();

                List<Object[]> records = runFilter(maskFilterPlugin, taskSource, inputSchema, outputSchema,
                        jsonValue,
                        jsonValue,
                        jsonValue,
                        jsonValue,
                        jsonValue,
                        jsonValue);

                assertEquals(1, records.size());
                Object[] record = records.get(0);

                assertEquals(6, record.length);
                assertEquals("{\"root\":{\"key1\":\"value1\",\"key2\":2,\"key3\":{\"key5\":\"value5\",\"key6\":[0,1,2,3,4],\"key7\":\"testme@example.com\"},\"key4\":[0,1,2,3,4]}}", record[0].toString());
                assertEquals("{\"root\":{\"key1\":\"******\",\"key2\":2,\"key3\":{\"key5\":\"value5\",\"key6\":[0,1,2,3,4],\"key7\":\"testme@example.com\"},\"key4\":[0,1,2,3,4]}}", record[1].toString());
                assertEquals("{\"root\":{\"key1\":\"value1\",\"key2\":2,\"key3\":\"**\",\"key4\":\"***********\"}}", record[2].toString());
                assertEquals("{\"root\":{\"key1\":\"******\",\"key2\":2,\"key3\":{\"key5\":\"value5\",\"key6\":[0,1,2,3,4],\"key7\":\"***@example.com\"},\"key4\":[0,1,2,3,4]}}", record[3].toString());
                assertEquals("{\"root\":{\"key1\":\"value*\",\"key2\":2,\"key3\":{\"key5\":\"value5\",\"key6\":[0,1,2,3,4],\"key7\":\"testme@example.com\"},\"key4\":[0,1,2,3,4]}}", record[4].toString());
                assertEquals("{\"root\":{\"key1\":\"va*****e1\",\"key2\":2,\"key3\":{\"key5\":\"value5\",\"key6\":[0,1,2,3,4],\"key7\":\"testme@example.com\"},\"key4\":[0,1,2,3,4]}}", record[5].toString());
            }
        });
    }

    /**
     * The {@code email} type masks only the part before the {@code @}; {@code all} and an
     * omitted type mask the whole value; a column missing from the configuration passes through.
     */
    @Test
    public void testMaskEmail() {
        String configYaml = "" +
                "type: mask\n" +
                "columns:\n" +
                " - { name: _c0, type: email}\n" +
                " - { name: _c1, type: email}\n" +
                " - { name: _c2, type: all}\n" +
                " - { name: _c3}\n";

        ConfigSource config = getConfigFromYaml(configYaml);

        final Schema inputSchema = Schema.builder()
                .add("_c0", STRING)
                .add("_c1", STRING)
                .add("_c2", STRING)
                .add("_c3", STRING)
                .add("_c4", STRING)
                .build();

        final MaskFilterPlugin maskFilterPlugin = new MaskFilterPlugin();
        maskFilterPlugin.transaction(config, inputSchema, new Control() {
            @Override
            public void run(TaskSource taskSource, Schema outputSchema) {
                final String email1 = "dummy_test-me.1234@dummy-mail1.com";
                final String email2 = "!#$%&'*+-/=?^_`.{|}~@dummy-mail2.com";

                List<Object[]> records = runFilter(maskFilterPlugin, taskSource, inputSchema, outputSchema,
                        email1,
                        email2,
                        email1,
                        email1,
                        email1);

                assertEquals(1, records.size());
                Object[] record = records.get(0);

                assertEquals(5, record.length);
                assertEquals(getMaskedEmail(email1), record[0]);
                assertEquals(getMaskedEmail(email2), record[1]);
                assertEquals(getMaskedCharacters(email1), record[2]);
                assertEquals(getMaskedCharacters(email1), record[3]);
                assertEquals(email1, record[4]);
            }
        });
    }

    /** The {@code regex} type replaces each match of the pattern with a single asterisk. */
    @Test
    public void testRegexMaskType() {
        String configYaml = "" +
                "type: mask\n" +
                "columns:\n" +
                " - { name: _c1, type: regex, pattern: \"abc\" }\n" +
                " - { name: _c2, type: regex, pattern: \"(abc)\" }\n" +
                " - { name: _c3, type: regex, pattern: \"[0-9]+\" }\n" +
                " - { name: _c4, type: regex, pattern: \"[0-9]\" }\n";

        ConfigSource config = getConfigFromYaml(configYaml);

        final Schema inputSchema = Schema.builder()
                .add("_c0", STRING)
                .add("_c1", STRING)
                .add("_c2", STRING)
                .add("_c3", STRING)
                .add("_c4", STRING)
                .build();

        final MaskFilterPlugin maskFilterPlugin = new MaskFilterPlugin();
        maskFilterPlugin.transaction(config, inputSchema, new Control() {
            @Override
            public void run(TaskSource taskSource, Schema outputSchema) {
                final String c0ColumnValue = "_c0_abcdefghi01234";
                final String c1ColumnValue = "_c1_abcdefghi01234";
                final String c2ColumnValue = "_c2_abcdefghi01234";
                final String c3ColumnValue = "_c3_abcdefghi01234";
                final String c4ColumnValue = "_c4_abcdefghi01234";

                List<Object[]> records = runFilter(maskFilterPlugin, taskSource, inputSchema, outputSchema,
                        c0ColumnValue,
                        c1ColumnValue,
                        c2ColumnValue,
                        c3ColumnValue,
                        c4ColumnValue);

                assertEquals(1, records.size());
                Object[] record = records.get(0);

                assertEquals(5, record.length);
                assertEquals("_c0_abcdefghi01234", record[0]);
                assertEquals("_c1_*defghi01234", record[1]);
                assertEquals("_c2_*defghi01234", record[2]);
                assertEquals("_c*_abcdefghi*", record[3]);
                assertEquals("_c*_abcdefghi*****", record[4]);
            }
        });
    }

    /**
     * The {@code substring} type masks the range given by {@code start}/{@code end}; a range
     * whose end precedes its start leaves the value unchanged.
     */
    @Test
    public void testSubstringMaskType() {
        String configYaml = "" +
                "type: mask\n" +
                "columns:\n" +
                " - { name: _c0, type: substring }\n" +
                " - { name: _c1, type: substring, start: 2, end: 5 }\n" +
                " - { name: _c2, type: substring, start: 6 }\n" +
                " - { name: _c3, type: substring, end: 4 }\n" +
                " - { name: _c4, type: substring, start: 3, length: 5 }\n" +
                " - { name: _c5, type: substring, start: 3, end: 2, length: 5 }\n"; // invalid configuration

        ConfigSource config = getConfigFromYaml(configYaml);

        final Schema inputSchema = Schema.builder()
                .add("_c0", STRING)
                .add("_c1", STRING)
                .add("_c2", STRING)
                .add("_c3", STRING)
                .add("_c4", STRING)
                .add("_c5", STRING)
                .build();

        final MaskFilterPlugin maskFilterPlugin = new MaskFilterPlugin();
        maskFilterPlugin.transaction(config, inputSchema, new Control() {
            @Override
            public void run(TaskSource taskSource, Schema outputSchema) {
                final String c0ColumnValue = "_c0_abcdefghi01234";
                final String c1ColumnValue = "_c1_abcdefghi01234";
                final String c2ColumnValue = "_c2_abcdefghi01234";
                final String c3ColumnValue = "_c3_abcdefghi01234";
                final String c4ColumnValue = "_c4_abcdefghi01234";
                final String c5ColumnValue = "_c5_abcdefghi01234";

                List<Object[]> records = runFilter(maskFilterPlugin, taskSource, inputSchema, outputSchema,
                        c0ColumnValue,
                        c1ColumnValue,
                        c2ColumnValue,
                        c3ColumnValue,
                        c4ColumnValue,
                        c5ColumnValue);

                assertEquals(1, records.size());
                Object[] record = records.get(0);

                assertEquals(6, record.length);
                assertEquals("******************", record[0]);
                assertEquals("_c***bcdefghi01234", record[1]);
                assertEquals("_c2_ab************", record[2]);
                assertEquals("****abcdefghi01234", record[3]);
                assertEquals("_c4*****", record[4]);
                assertEquals("_c5_abcdefghi01234", record[5]);
            }
        });
    }
}

--------------------------------------------------------------------------------