├── src ├── test │ ├── resources │ │ └── uk │ │ │ └── gov │ │ │ └── nationalarchives │ │ │ └── utf8 │ │ │ └── validator │ │ │ ├── valid-one-byte-char.bin │ │ │ ├── valid-two-byte-char.bin │ │ │ ├── valid-four-byte-char.bin │ │ │ ├── valid-three-byte-char.bin │ │ │ ├── invalid-mixed-1.bin │ │ │ ├── invalid-mixed-2.bin │ │ │ ├── invalid-one-byte-char.bin │ │ │ ├── invalid-two-byte-char.bin │ │ │ ├── invalid-four-byte-char.bin │ │ │ ├── invalid-three-byte-char.bin │ │ │ ├── invalid-two-byte-char-2.bin │ │ │ ├── invalid-three-byte-char-2.bin │ │ │ └── invalid-three-byte-char-3.bin │ └── java │ │ └── uk │ │ └── gov │ │ └── nationalarchives │ │ └── utf8 │ │ └── validator │ │ └── Utf8ValidatorTest.java └── main │ ├── assembly │ └── appassembler-output.xml │ └── java │ └── uk │ └── gov │ └── nationalarchives │ └── utf8 │ └── validator │ ├── ExitCode.java │ ├── ValidationHandler.java │ ├── ValidationException.java │ ├── PrintingValidationHandler.java │ ├── Utf8ValidateCmd.java │ └── Utf8Validator.java ├── .gitignore ├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── LICENSE ├── README.md └── pom.xml /src/test/resources/uk/gov/nationalarchives/utf8/validator/valid-one-byte-char.bin: -------------------------------------------------------------------------------- 1 | x -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/valid-two-byte-char.bin: -------------------------------------------------------------------------------- 1 | © -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/valid-four-byte-char.bin: -------------------------------------------------------------------------------- 1 | 🀰 -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/valid-three-byte-char.bin: 
-------------------------------------------------------------------------------- 1 | € -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | target/ 3 | 4 | # Package Files # 5 | *.jar 6 | *.war 7 | *.ear 8 | -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-mixed-1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-mixed-1.bin -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-mixed-2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-mixed-2.bin -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-one-byte-char.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-one-byte-char.bin -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-two-byte-char.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-two-byte-char.bin 
-------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-four-byte-char.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-four-byte-char.bin -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-three-byte-char.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-three-byte-char.bin -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-two-byte-char-2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-two-byte-char-2.bin -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-three-byte-char-2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-three-byte-char-2.bin -------------------------------------------------------------------------------- /src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-three-byte-char-3.bin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/digital-preservation/utf8-validator/HEAD/src/test/resources/uk/gov/nationalarchives/utf8/validator/invalid-three-byte-char-3.bin -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: maven 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | ignore: 9 | - dependency-name: com.mycila:license-maven-plugin 10 | versions: 11 | - "4.0" 12 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | build: 11 | name: Build and Test 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | os: [ubuntu-latest, macos-latest, windows-latest] 16 | jdk: [8, 9, 11, 15] 17 | runs-on: ${{ matrix.os }} 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up JDK 21 | uses: actions/setup-java@v2 22 | with: 23 | java-version: ${{ matrix.jdk }} 24 | distribution: zulu 25 | - name: Cache Maven packages 26 | uses: actions/cache@v2 27 | with: 28 | path: ~/.m2 29 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 30 | restore-keys: ${{ runner.os }}-m2 31 | - name: Maven Build 32 | run: mvn -V -B -DskipTests=true install 33 | - name: Maven Test 34 | run: mvn -B verify 35 | # - name: Maven Code Coverage 36 | # if: ${{ github.ref == 'refs/heads/main' && matrix.jdk == '8' && matrix.os == 'ubuntu-latest' }} 37 | # run: mvn -B jacoco:report coveralls:report -DrepoToken=${{ secrets.COVERALLS_TOKEN }} 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011, The 
National Archives 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /src/main/assembly/appassembler-output.xml: -------------------------------------------------------------------------------- 1 | 4 | application 5 | 6 | dir 7 | zip 8 | 9 | 10 | true 11 | 12 | 13 | 14 | 15 | 16 | ${project.build.directory}/appassembler 17 | 18 | bin/${project.artifactId} 19 | 20 | 755 21 | / 22 | 23 | 24 | 25 | 26 | ${project.build.directory}/appassembler/bin 27 | 28 | bin/${project.artifactId} 29 | 30 | /bin 31 | 32 | 33 | 34 | 35 | ${project.build.directory}/appassembler 36 | 37 | bin 38 | 39 | / 40 | 41 | 42 | 43 | 44 | 45 | 46 | LICENSE 47 | / 48 | 49 | 50 | README.md 51 | / 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/main/java/uk/gov/nationalarchives/utf8/validator/ExitCode.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2011, The National Archives 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | package uk.gov.nationalarchives.utf8.validator; 28 | 29 | /** 30 | * Enumeration of the exit codes with which the UTF-8 Validator command line terminates 31 | * 32 | * @author Adam Retter 33 | */ 34 | public enum ExitCode { 35 | OK(0), /* success */ 36 | INVALID_ARGS(1), /* invalid arguments provided to the application */ 37 | VALIDATION_ERROR(2), /* the file was not valid UTF-8 */ 38 | IO_ERROR(4); /* I/O error, e.g. the file could not be read */ 39 | /* numeric value handed to System.exit by the command line */ 40 | private final int code; 41 | 42 | ExitCode(final int code) { 43 | this.code = code; 44 | } 45 | /** @return the numeric exit code of this constant */ 46 | public int getCode() { 47 | return code; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/uk/gov/nationalarchives/utf8/validator/ValidationHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2011, The National Archives 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | package uk.gov.nationalarchives.utf8.validator; 28 | 29 | /** 30 | * Interface for Validation Handlers 31 | * Used as a callback by the Utf8Validator 32 | * to report errors 33 | * 34 | * @author Adam Retter 35 | */ 36 | public interface ValidationHandler { 37 | 38 | /** 39 | * Error handler, called each time a validation error occurs 40 | * 41 | * @param message Description of the validation error 42 | * @param byteOffset offset in the underlying data that failed validation 43 | * 44 | * @throws ValidationException an implementation may choose to throw a 45 | * ValidationException on receipt of an error. 
Throwing a ValidationException 46 | * stops the Utf8Validator 47 | */ 48 | void error(final String message, final long byteOffset) throws ValidationException; 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/uk/gov/nationalarchives/utf8/validator/ValidationException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2011, The National Archives 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | package uk.gov.nationalarchives.utf8.validator; 28 | 29 | /** 30 | * An Exception caused by a UTF8 Validation error 31 | * 32 | * @author Adam Retter 33 | */ 34 | public class ValidationException extends Exception { 35 | 36 | /** 37 | * @param message Description of the validation error; " @ byte position: " and the offset are appended to form the exception message 38 | * @param byteOffset offset in the underlying data that failed validation 39 | */ 40 | public ValidationException(final String message, final long byteOffset) { 41 | super(message + " @ byte position: " + byteOffset); 42 | } 43 | 44 | /** 45 | * @param message Description of the validation error; " @ byte position: " and the offset are appended to form the exception message 46 | * @param byteOffset offset in the underlying data that failed validation 47 | * @param cause the underlying cause, passed on to the Exception superclass 48 | */ 49 | public ValidationException(final String message, final long byteOffset, final Throwable cause) { 50 | super(message + " @ byte position: " + byteOffset, cause); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/uk/gov/nationalarchives/utf8/validator/PrintingValidationHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2011, The National Archives 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | package uk.gov.nationalarchives.utf8.validator; 28 | 29 | import java.io.PrintStream; 30 | 31 | /** 32 | * Example ValidationHandler which prints its errors to an output PrintStream 33 | * It also has the ability to fail-fast by aborting processing 34 | * upon the first error. 35 | * It is used by the Utf8ValidateCmd. 
36 | * 37 | * @author Adam Retter 38 | */ 39 | public class PrintingValidationHandler implements ValidationHandler { 40 | 41 | private final boolean failFast; 42 | private final PrintStream output; 43 | private boolean errored = false; 44 | /* failFast: when true, error() throws a ValidationException instead of printing; output: the stream that errors are printed to */ 45 | public PrintingValidationHandler(final boolean failFast, final PrintStream output) { 46 | this.failFast = failFast; 47 | this.output = output; 48 | } 49 | /* Records that an error occurred, then either throws (fail-fast) or prints the error to output and returns */ 50 | @Override 51 | public void error(final String message, final long byteOffset) throws ValidationException { 52 | errored = true; 53 | if(failFast) { 54 | throw new ValidationException(message, byteOffset); 55 | } else { 56 | output.println("[ERROR] " + message + " @ byte position: " + byteOffset); 57 | } 58 | } 59 | /* true once error() has been called at least once on this handler */ 60 | public boolean isErrored() { 61 | return errored; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | UTF-8 Validator 2 | =============== 3 | 4 | A UTF-8 Validation Tool which may be used as either a command line tool or as a library embedded in your own program. 5 | 6 | Released under the [BSD 3-Clause Licence](http://opensource.org/licenses/BSD-3-Clause). 
7 | 8 | [![CI](https://github.com/digital-preservation/utf8-validator/workflows/CI/badge.svg)](https://github.com/digital-preservation/utf8-validator/actions?query=workflow%3ACI) 9 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/uk.gov.nationalarchives/utf8-validator/badge.svg)](https://search.maven.org/search?q=g:uk.gov.nationalarchives) 10 | 11 | Use from the Command Line 12 | ------------------------- 13 | You can either download the application from [here](https://search.maven.org/remotecontent?filepath=uk/gov/nationalarchives/utf8-validator/1.2/utf8-validator-1.2-application.zip) or [build from the source code](#building-from-source-code). 
You should extract this ZIP file to the place on your computer where you keep your applications. You can then run either `bin/validate.sh` (Linux/Mac/Unix) or `bin\validate.bat` (Windows). 14 | 15 | For example, to report all validation errors: 16 | 17 | ```bash 18 | $ cd /opt/utf8-validator-1.2 19 | $ bin/validate.sh /tmp/my-file.txt 20 | ``` 21 | 22 | For example, to report the first validation error and exit: 23 | 24 | ```bash 25 | $ cd /opt/utf8-validator-1.2 26 | $ bin/validate.sh --fail-fast /tmp/my-file.txt 27 | ``` 28 | 29 | Command Line Exit Codes 30 | ----------------------- 31 | * **0** Success 32 | * **1** Invalid Arguments provided to the application 33 | * **2** File was not UTF-8 Valid 34 | * **4** IO Error, e.g. could not read file 35 | 36 | 37 | Use as a Library 38 | ---------------- 39 | The UTF-8 Validator is written in Java and may be easily used from any Java (or Scala, Clojure, or other JVM language) application. We are using the Maven build system, and our artifacts have been published to [Maven Central](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22uk.gov.nationalarchives%22). 40 | 41 | If you are using Maven, you can simply add this to the dependencies section of your pom.xml: 42 | 43 | ```xml 44 | <dependency> 45 | <groupId>uk.gov.nationalarchives</groupId> 46 | <artifactId>utf8-validator</artifactId> 47 | <version>1.2</version> 48 | </dependency> 49 | ``` 50 | 51 | Alternatively, if you are using Sbt, you can add this to your library dependencies: 52 | 53 | ```scala 54 | "uk.gov.nationalarchives" % "utf8-validator" % "1.2" 55 | ``` 56 | 57 | To use the Library you need to implement the very simple interface `uk.gov.nationalarchives.utf8.validator.ValidationHandler` (or you could use `uk.gov.nationalarchives.utf8.validator.PrintingValidationHandler` if it suits you). The interface has a single method which is called whenever a validator finds a validation error. You can then instantiate `Utf8Validator` and validate from either a `java.io.File` or `java.io.InputStream`. 
For example: 58 | 59 | ```java 60 | ValidationHandler handler = new ValidationHandler() { 61 | @Override 62 | public void error(final String message, final long byteOffset) throws ValidationException { 63 | System.err.println("[Error][@" + byteOffset + "] " + message); 64 | }; 65 | }; 66 | 67 | File f = ... //your file here 68 | 69 | new Utf8Validator(handler).validate(f); 70 | ``` 71 | 72 | Building from Source Code 73 | -------------------------- 74 | * Git clone the repository from https://github.com/digital-preservation/utf8-validator.git 75 | * Build using [Maven](http://maven.apache.org), by running `mvn package` you will then find a ZIP of the compiled application in `target/utf8-validator-1.2-application.zip`. 76 | -------------------------------------------------------------------------------- /src/main/java/uk/gov/nationalarchives/utf8/validator/Utf8ValidateCmd.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2011, The National Archives 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | package uk.gov.nationalarchives.utf8.validator; 28 | 29 | import java.io.File; 30 | import java.io.IOException; 31 | 32 | /** 33 | * UTF-8 Validator Command Line 34 | * 35 | * @author Adam Retter 36 | * @version 1.2 37 | */ 38 | public class Utf8ValidateCmd { 39 | 40 | final static String VERSION = "1.2"; 41 | 42 | /** 43 | * @param args the command line arguments 44 | */ 45 | public static void main(final String[] args) { 46 | 47 | //check useage 48 | if(args.length < 1) { 49 | System.out.println("UTF-8 Validator version: " + VERSION); 50 | System.out.println("Usage: utf8validate [options] "); 51 | System.out.println(""); 52 | System.out.println("\t-f | --fail-fast"); 53 | System.out.println("\t\tStops on the first validation error rather than reporting all errors. Default false"); 54 | System.out.println("\t-b | --buffer-size"); 55 | System.out.println("\t\tSize of the in-memory buffer for file data (in bytes). Default 8192"); 56 | System.out.println("\t-m | --mem-mapped"); 57 | System.out.println("\t\tUse memory mapped Disk I/O. 
Default false"); 58 | System.out.println(""); 59 | System.exit(ExitCode.INVALID_ARGS.getCode()); 60 | } 61 | 62 | //parse args 63 | boolean failFast = false; 64 | int bufferSize = -1; 65 | boolean memMapped = false; 66 | final File fileToValidate; 67 | 68 | // parse args 69 | for (int i = 0; i < args.length - 1; i++) { 70 | if(args[i].equals("-f") || args[i].equals("--fail-fast")) { 71 | failFast = true; 72 | } 73 | 74 | if(args[i].equals("-b") || args[i].equals("--buffer-size")) { 75 | bufferSize = Integer.parseInt(args[++i]); 76 | } 77 | 78 | if(args[i].equals("-m") || args[i].equals("--mem-mapped")) { 79 | memMapped = true; 80 | } 81 | } 82 | fileToValidate = new File(args[args.length - 1]); 83 | 84 | if(!fileToValidate.exists()) { 85 | System.out.println("File: " + fileToValidate.getPath() + " does not exist!"); 86 | System.exit(ExitCode.INVALID_ARGS.getCode()); 87 | } 88 | 89 | final PrintingValidationHandler handler = new PrintingValidationHandler(failFast, System.out); 90 | 91 | ExitCode result = ExitCode.OK; 92 | final long start = System.currentTimeMillis(); 93 | 94 | System.out.println("Validating: " + fileToValidate.getPath()); 95 | 96 | try { 97 | new Utf8Validator(memMapped, bufferSize, handler).validate(fileToValidate); 98 | 99 | if(!failFast && handler.isErrored()) { 100 | result = ExitCode.VALIDATION_ERROR; 101 | } else { 102 | System.out.println("Valid OK (took " + (System.currentTimeMillis() - start) + "ms)"); 103 | result = ExitCode.OK; 104 | } 105 | } catch(final ValidationException ve) { 106 | System.out.println(ve.getMessage()); 107 | result = ExitCode.VALIDATION_ERROR; 108 | } catch(final IOException ioe) { 109 | System.err.println("[ERROR]" + ioe.getMessage()); 110 | result = ExitCode.IO_ERROR; 111 | } 112 | 113 | System.exit(result.getCode()); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/test/java/uk/gov/nationalarchives/utf8/validator/Utf8ValidatorTest.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2011, The National Archives 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | package uk.gov.nationalarchives.utf8.validator; 28 | 29 | import java.io.File; 30 | import java.io.IOException; 31 | import java.net.URISyntaxException; 32 | import java.net.URL; 33 | import java.util.Arrays; 34 | 35 | import org.junit.Test; 36 | import org.junit.runner.RunWith; 37 | import org.junit.runners.Parameterized; 38 | 39 | /** Runs every validation test twice: once with classic I/O and once with memory-mapped I/O. 40 | * @author Adam Retter 41 | */ 42 | @RunWith(Parameterized.class) 43 | public class Utf8ValidatorTest { 44 | @Parameterized.Parameters(name = "{0}") 45 | public static java.util.Collection data() { 46 | return Arrays.asList(new Object[][]{ 47 | {"classic", false}, 48 | {"memory-mapped", true} 49 | }); 50 | } 51 | 52 | @Parameterized.Parameter(value = 0) 53 | public String name; 54 | 55 | @Parameterized.Parameter(value = 1) 56 | public boolean memoryMappedIo; 57 | 58 | @Test 59 | public void validOneByteChar() throws IOException, ValidationException, URISyntaxException { 60 | //character 'x' 61 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 62 | .validate(testResource("valid-one-byte-char.bin")); 63 | } 64 | 65 | @Test(expected = ValidationException.class) 66 | public void invalidOneByteChar() throws IOException, ValidationException, URISyntaxException { 67 | //first byte from 'e acute' two byte character 68 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 69 | .validate(testResource("invalid-one-byte-char.bin")); 70 | } 71 | 72 | @Test 73 | public void validTwoByteChar() throws IOException, ValidationException, URISyntaxException { 74 | //character 'copyright symbol' 75 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 76 | .validate(testResource("valid-two-byte-char.bin")); 77 | } 78 | 79 | @Test(expected = ValidationException.class) 80 | public void invalidTwoByteChar() throws IOException, ValidationException, URISyntaxException { 81 | //first byte from 'copyright symbol' and then byte from 'x' character 
82 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 83 | .validate(testResource("invalid-two-byte-char.bin")); 84 | } 85 | 86 | @Test(expected = ValidationException.class) 87 | public void invalidTwoByteChar2() throws IOException, ValidationException, URISyntaxException { 88 | //first byte from 'x' character and then first byte from 'copyright symbol' 89 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 90 | .validate(testResource("invalid-two-byte-char-2.bin")); 91 | } 92 | 93 | @Test 94 | public void validThreeByteChar() throws IOException, ValidationException, URISyntaxException { 95 | //character 'euro symbol' 96 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 97 | .validate(testResource("valid-three-byte-char.bin")); 98 | } 99 | 100 | @Test(expected = ValidationException.class) 101 | public void invalidThreeByteChar() throws IOException, ValidationException, URISyntaxException { 102 | //first two bytes from 'euro symbol' and then byte from 'x' character 103 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 104 | .validate(testResource("invalid-three-byte-char.bin")); 105 | } 106 | 107 | @Test(expected = ValidationException.class) 108 | public void invalidThreeByteChar2() throws IOException, ValidationException, URISyntaxException { 109 | //first byte from 'euro symbol', then byte from 'x' character, then second byte from 'euro symbol' 110 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 111 | .validate(testResource("invalid-three-byte-char-2.bin")); 112 | } 113 | 114 | @Test(expected = ValidationException.class) 115 | public void invalidThreeByteChar3() throws IOException, ValidationException, URISyntaxException { 116 | //byte from character 'x' and the first two bytes from 'euro symbol' 117 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 118 | 
.validate(testResource("invalid-three-byte-char-3.bin")); 119 | } 120 | 121 | @Test 122 | public void validFourByteChar() throws IOException, ValidationException, URISyntaxException { 123 | //character 'domino tile horizontal black' 124 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 125 | .validate(testResource("valid-four-byte-char.bin")); 126 | } 127 | 128 | @Test(expected = ValidationException.class) 129 | public void invalidFourByteChar() throws IOException, ValidationException, URISyntaxException { 130 | //first three bytes from character 'domino tile horizontal black', then the byte from character(x) 131 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 132 | .validate(testResource("invalid-four-byte-char.bin")); 133 | } 134 | 135 | @Test(expected = ValidationException.class) 136 | public void oneInvalidOneByteChar_followedByTwoValidOneByteChars() throws IOException, ValidationException, URISyntaxException { 137 | //characters: invalid char, 'comma', 'c' 138 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 139 | .validate(testResource("invalid-mixed-1.bin")); 140 | } 141 | 142 | @Test(expected = ValidationException.class) 143 | public void oneValidOneByteChar_oneInvalidOneByteChar_followedByOneValidOneByteChar() throws IOException, ValidationException, URISyntaxException { 144 | //characters: 'comma', invalid char, 'c' 145 | new Utf8Validator(memoryMappedIo, new PrintingValidationHandler(true, System.out)) 146 | .validate(testResource("invalid-mixed-2.bin")); 147 | } 148 | /* Looks the named resource up via this class and returns it as a File */ 149 | private File testResource(final String filename) throws URISyntaxException { 150 | final URL resource = getClass().getResource(filename); 151 | return new File(resource.toURI()); 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/main/java/uk/gov/nationalarchives/utf8/validator/Utf8Validator.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2011, The National Archives 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the <organization> nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | package uk.gov.nationalarchives.utf8.validator; 28 | 29 | import java.io.*; 30 | import java.nio.MappedByteBuffer; 31 | import java.nio.channels.FileChannel; 32 | 33 | /** 34 | * Validates a File or InputStream byte by byte 35 | * to ensure it is UTF-8 Valid.
36 | * 37 | * @author Adam Retter 38 | * @version 1.2 39 | */ 40 | public class Utf8Validator { 41 | 42 | private static final int DEFAULT_BUFFER_SIZE = 8192; 43 | 44 | private static final int FOUR_BYTE_CHAR = 0xF0; // 11110xxx 45 | private static final int THREE_BYTE_CHAR = 0xE0; // 1110xxxx 46 | private static final int TWO_BYTE_CHAR = 0xC0; // 110xxxxx 47 | 48 | private int bufferSize; 49 | private boolean memMapped; 50 | private ValidationHandler handler; 51 | 52 | /** 53 | * @param handler A ValidationHandler that receives errors 54 | */ 55 | public Utf8Validator(final ValidationHandler handler) { 56 | this(DEFAULT_BUFFER_SIZE, handler); 57 | } 58 | 59 | /** 60 | * @param bufferSize the amount of data from the file (in bytes) to buffer in RAM 61 | * @param handler A ValidationHandler that receives errors 62 | */ 63 | public Utf8Validator(final int bufferSize, final ValidationHandler handler) { 64 | this(false, DEFAULT_BUFFER_SIZE, handler); 65 | } 66 | 67 | 68 | /** 69 | * @memMapped true if memory mapped I/O should be used 70 | * @param handler A ValidationHandler that receives errors 71 | */ 72 | public Utf8Validator(final boolean memMapped, final ValidationHandler handler) { 73 | this(memMapped, DEFAULT_BUFFER_SIZE, handler); 74 | } 75 | 76 | /** 77 | * @memMapped true if memory mapped I/O should be used 78 | * @param bufferSize the amount of data from the file (in bytes) to buffer in RAM 79 | * @param handler A ValidationHandler that receives errors 80 | */ 81 | public Utf8Validator(final boolean memMapped, final int bufferSize, final ValidationHandler handler) { 82 | this.memMapped = memMapped; 83 | this.bufferSize = bufferSize <= 0 ? DEFAULT_BUFFER_SIZE : bufferSize; 84 | this.handler = handler; 85 | } 86 | 87 | /** 88 | * Validates the File as UTF-8. 
89 | * 90 | * @param f The file to UTF-8 validate 91 | * 92 | * @throws IOException Exception is thrown if the file cannot be read 93 | * @throws ValidationException thrown if the ValidationHandler determines 94 | * that an error causes an exception 95 | */ 96 | public void validate(final File f) throws IOException, ValidationException { 97 | if (memMapped) { 98 | RandomAccessFile raf = null; 99 | FileChannel fc = null; 100 | try { 101 | raf = new RandomAccessFile(f, "r"); 102 | fc = raf.getChannel(); 103 | final MappedByteBuffer buffer = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size()); 104 | validate(buffer); 105 | buffer.clear(); 106 | } finally { 107 | if(fc != null) { 108 | fc.close(); 109 | } 110 | if(raf != null) { 111 | raf.close(); 112 | } 113 | } 114 | } else { 115 | InputStream is = null; 116 | try { 117 | is = new BufferedInputStream(new FileInputStream(f), bufferSize); 118 | validate(is); 119 | } finally { 120 | if (is != null) { 121 | is.close(); 122 | } 123 | } 124 | } 125 | } 126 | 127 | /** 128 | * Validates Input Stream as UTF-8. 
129 | * 130 | * @param is Input Stream for UTF-8 validation 131 | * 132 | * @throws IOException Exception is thrown if the stream cannot be read 133 | * @throws ValidationException thrown if the ValidationHandler determines 134 | * that an error causes an exception 135 | */ 136 | public void validate(final InputStream is) throws IOException, ValidationException { 137 | int read = 0; // total bytes read 138 | byte multiByteLen = 0; // length of multi-byte character sequence (or zero if a single byte character) 139 | byte multiBytesRemain = 0; // bytes remaining to read of multi-byte character sequence (or zero if a single byte character) 140 | int b = -1; // current byte 141 | 142 | while ((b = is.read()) > -1) { 143 | 144 | read++; 145 | 146 | if (multiBytesRemain > 0) { 147 | multiBytesRemain--; 148 | if ((b >>> 6) != 2) { 149 | handler.error("Invalid UTF-8 sequence, byte " + (multiByteLen - multiBytesRemain) + " of " + multiByteLen + " byte sequence.", read); 150 | } 151 | 152 | } else if ((b & 0x80) == 0) { 153 | // One byte Sequence (MSB of a single byte character must be 0) 154 | continue; 155 | 156 | } else if ((b & FOUR_BYTE_CHAR) == FOUR_BYTE_CHAR) { 157 | //Four byte Sequence 158 | multiByteLen = 4; 159 | multiBytesRemain = 3; 160 | 161 | } else if((b & THREE_BYTE_CHAR) == THREE_BYTE_CHAR) { 162 | //Three byte Sequence 163 | multiByteLen = 3; 164 | multiBytesRemain = 2; 165 | 166 | } else if((b & TWO_BYTE_CHAR) == TWO_BYTE_CHAR) { 167 | //Two byte Sequence 168 | multiByteLen = 2; 169 | multiBytesRemain = 1; 170 | 171 | } else { 172 | handler.error("Invalid single byte UTF-8 character ", read); 173 | } 174 | } 175 | 176 | if (multiBytesRemain > 0) { 177 | handler.error("Invalid UTF-8 Sequence, expecting: " + multiBytesRemain + " more bytes in " + multiByteLen + " byte sequence. End of File!", read); 178 | } 179 | } 180 | 181 | /** 182 | * Validates Mapped Byte Buffer as UTF-8. 
183 | * 184 | * @param buf Mapped Byte Buffer for UTF-8 validation 185 | * 186 | * @throws IOException Exception is thrown if the buf cannot be read 187 | * @throws ValidationException thrown if the ValidationHandler determines 188 | * that an error causes an exception 189 | */ 190 | public void validate(final MappedByteBuffer buf) throws IOException, ValidationException { 191 | int read = 0; // total bytes read 192 | byte multiByteLen = 0; // length of multi-byte character sequence (or zero if a single byte character) 193 | byte multiBytesRemain = 0; // bytes remaining to read of multi-byte character sequence (or zero if a single byte character) 194 | int b = -1; // current byte 195 | 196 | while (buf.remaining() > 0) { 197 | 198 | b = buf.get() & 0xFF; 199 | 200 | read++; 201 | 202 | if (multiBytesRemain > 0) { 203 | multiBytesRemain--; 204 | if ((b >>> 6) != 2) { 205 | handler.error("Invalid UTF-8 sequence, byte " + (multiByteLen - multiBytesRemain) + " of " + multiByteLen + " byte sequence.", read); 206 | } 207 | 208 | } else if ((b & 0x80) == 0) { 209 | // One byte Sequence (MSB of a single byte character must be 0) 210 | continue; 211 | 212 | } else if ((b & FOUR_BYTE_CHAR) == FOUR_BYTE_CHAR) { 213 | //Four byte Sequence 214 | multiByteLen = 4; 215 | multiBytesRemain = 3; 216 | 217 | } else if((b & THREE_BYTE_CHAR) == THREE_BYTE_CHAR) { 218 | //Three byte Sequence 219 | multiByteLen = 3; 220 | multiBytesRemain = 2; 221 | 222 | } else if((b & TWO_BYTE_CHAR) == TWO_BYTE_CHAR) { 223 | //Two byte Sequence 224 | multiByteLen = 2; 225 | multiBytesRemain = 1; 226 | 227 | } else { 228 | handler.error("Invalid single byte UTF-8 character ", read); 229 | } 230 | } 231 | 232 | if (multiBytesRemain > 0) { 233 | handler.error("Invalid UTF-8 Sequence, expecting: " + multiBytesRemain + " more bytes in " + multiByteLen + " byte sequence. 
End of File!", read); 234 | } 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | 5 | org.sonatype.oss 6 | oss-parent 7 | 7 8 | 9 | 10 | uk.gov.nationalarchives 11 | utf8-validator 12 | 1.3.1-SNAPSHOT 13 | jar 14 | 15 | UTF-8 Validator 16 | A simple validator to check that a file contains only UTF-8 valid byte sequences 17 | https://github.com/digital-preservation/utf8-validator 18 | 2011 19 | 20 | 21 | The National Archives 22 | http://www.nationalarchives.gov.uk 23 | 24 | 25 | 26 | 27 | The BSD 3-Clause License 28 | http://www.opensource.org/licenses/BSD-3-Clause 29 | repo 30 | 31 | 32 | 33 | 34 | scm:git:https://github.com/digital-preservation/utf8-validator.git 35 | scm:git:https://github.com/digital-preservation/utf8-validator.git 36 | scm:git:https://github.com/digital-preservation/utf8-validator.git 37 | HEAD 38 | 39 | 40 | 41 | 42 | Adam Retter 43 | adam.retter@googlemail.com 44 | 45 | 46 | 47 | 48 | UTF-8 49 | UTF-8 50 | 1.7 51 | 1.7 52 | digitalpreservation@nationalarchives.gov.uk 53 | 54 | 55 | 56 | 57 | junit 58 | junit 59 | 4.13.2 60 | test 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | com.mycila 69 | license-maven-plugin 70 | 4.1 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 3.10.0 76 | 77 | 78 | com.code54.mojo 79 | buildversion-plugin 80 | 1.0.3 81 | 82 | 83 | org.apache.maven.plugins 84 | maven-dependency-plugin 85 | 3.2.0 86 | 87 | 88 | org.apache.maven.plugins 89 | maven-jar-plugin 90 | 3.2.2 91 | 92 | 93 | org.apache.maven.plugins 94 | maven-source-plugin 95 | 3.2.1 96 | 97 | 98 | org.apache.maven.plugins 99 | maven-javadoc-plugin 100 | 3.3.2 101 | 102 | 103 | org.codehaus.mojo 104 | appassembler-maven-plugin 105 | 2.1.0 106 | 107 | 108 | org.apache.maven.plugins 109 | maven-assembly-plugin 110 | 3.3.0 111 | 112 | 113 | org.apache.maven.plugins 114 | 
maven-release-plugin 115 | 2.5.3 116 | 117 | 118 | org.apache.maven.plugins 119 | maven-gpg-plugin 120 | 3.0.1 121 | 122 | 123 | org.codehaus.mojo 124 | exec-maven-plugin 125 | 3.0.0 126 | 127 | 128 | 129 | 130 | 131 | com.mycila 132 | license-maven-plugin 133 | true 134 | 135 |
com/mycila/maven/plugin/license/templates/BSD-3.txt
136 | true 137 | true 138 | true 139 | 140 | ${project.organization.name} <${contact.email}> 141 | 142 | 143 | pom.xml 144 | README.md 145 | LICENSE 146 | src/main/assembly/appassembler-output.xml 147 | .github/workflows/ci.yml 148 | .github/dependabot.yml 149 | 150 | ${project.build.sourceEncoding} 151 |
152 | 153 | 154 | check-headers 155 | verify 156 | 157 | check 158 | 159 | 160 | 161 |
162 | 163 | org.apache.maven.plugins 164 | maven-dependency-plugin 165 | 166 | 167 | analyze 168 | 169 | analyze-only 170 | 171 | 172 | true 173 | 174 | 175 | 176 | 177 | 178 | com.code54.mojo 179 | buildversion-plugin 180 | 181 | 182 | validate 183 | 184 | set-properties 185 | 186 | 187 | 188 | 189 | 190 | org.apache.maven.plugins 191 | maven-compiler-plugin 192 | 3.10.0 193 | 194 | ${java.source} 195 | ${java.target} 196 | ${project.build.sourceEncoding} 197 | true 198 | false 199 | 200 | 201 | 202 | org.apache.maven.plugins 203 | maven-jar-plugin 204 | 205 | 206 | 207 | true 208 | true 209 | true 210 | 211 | 212 | ${build-tag} 213 | ${build-commit} 214 | ${build-commit-abbrev} 215 | ${build-version} 216 | ${build-tstamp} 217 | ${project.scm.connection} 218 | ${project.description} 219 | ${project.url} 220 | uk.gov.nationalarchives.utf8.validator.Utf8ValidateCmd 221 | 222 | 223 | 224 | 225 | 226 | 227 | org.apache.maven.plugins 228 | maven-source-plugin 229 | 230 | 231 | 232 | true 233 | true 234 | 235 | 236 | ${build-tag} 237 | ${build-commit} 238 | ${build-commit-abbrev} 239 | ${build-version} 240 | ${build-tstamp} 241 | ${project.scm.connection} 242 | ${project.description} 243 | ${project.url} 244 | 245 | 246 | 247 | 248 | 249 | attach-sources 250 | verify 251 | 252 | jar 253 | 254 | 255 | 256 | 257 | 258 | org.apache.maven.plugins 259 | maven-gpg-plugin 260 | 261 | 262 | org.codehaus.mojo 263 | appassembler-maven-plugin 264 | 265 | 266 | package 267 | 268 | assemble 269 | 270 | 271 | 272 | 273 | false 274 | flat 275 | lib 276 | 277 | .sh 278 | 279 | 280 | 281 | validate 282 | uk.gov.nationalarchives.utf8.validator.Utf8ValidateCmd 283 | 284 | 285 | 286 | 287 | 288 | maven-assembly-plugin 289 | 290 | 291 | package 292 | 293 | single 294 | 295 | 296 | 297 | 298 | 299 | src/main/assembly/appassembler-output.xml 300 | 301 | 302 | 303 | 304 | org.apache.maven.plugins 305 | maven-release-plugin 306 | 307 | forked-path 308 | 309 | 310 |
311 |
312 | 313 | 314 | 315 | native 316 | 317 | 318 | 319 | 320 | org.codehaus.mojo 321 | exec-maven-plugin 322 | 323 | 324 | package 325 | 326 | exec 327 | 328 | 329 | 330 | 331 | native-image 332 | ${project.build.directory} 333 | 334 | -da 335 | --class-path 336 | 337 | uk.gov.nationalarchives.utf8.validator.Utf8ValidateCmd 338 | utf8validate 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | sonatype-releases 350 | http://oss.sonatype.org/content/repositories/releases 351 | 352 | 353 | clojars.org 354 | http://clojars.org/repo 355 | 356 | 357 | 358 |
359 | --------------------------------------------------------------------------------