├── src ├── test │ └── resources │ │ ├── books.avro │ │ ├── xml │ │ └── iam │ │ │ ├── ArtifactResponse.avro │ │ │ ├── AuthnRequest.avro │ │ │ ├── AuthnRequest.xml │ │ │ ├── ArtifactResponse.xml │ │ │ └── SAML_response.asvc │ │ ├── avsc.yml │ │ ├── books1.xsd │ │ ├── avro.yml │ │ ├── both.yml │ │ ├── old_books.json │ │ ├── books.xml.bkp │ │ ├── books.xsd │ │ ├── new_books.json │ │ ├── book.avsc │ │ ├── books.xml │ │ ├── books.avsc │ │ ├── old_books.avsc │ │ ├── new_books.avsc │ │ ├── new_books2.avsc │ │ └── temp └── main │ ├── scala │ └── in │ │ └── dreamlabs │ │ └── xmlavro │ │ ├── Validator.scala │ │ ├── Converter.scala │ │ ├── config │ │ ├── ArgParse.scala │ │ ├── ConfigParser.scala │ │ └── Config.scala │ │ ├── XMLEvents.scala │ │ ├── XMLDocument.scala │ │ ├── RichAvro.scala │ │ ├── Supporters.scala │ │ └── AvroBuilder.scala │ └── python │ └── avsc_fix.py ├── settings.gradle ├── .gitignore ├── gradle └── wrapper │ └── gradle-wrapper.properties ├── example ├── config.yml ├── books.xml ├── books.xsd └── books.avsc ├── Dockerfile ├── gradlew.bat ├── gradlew ├── README.md └── LICENSE /src/test/resources/books.avro: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'xml-avro' -------------------------------------------------------------------------------- /src/test/resources/xml/iam/ArtifactResponse.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeethanadhP/xml-avro/HEAD/src/test/resources/xml/iam/ArtifactResponse.avro -------------------------------------------------------------------------------- /src/test/resources/xml/iam/AuthnRequest.avro: -------------------------------------------------------------------------------- 1 | 
00Haaf23196-1773-2113-474a-fe114412ab722.0&2004-12-05T09:21:59furn:oasis:names:tc:SAML:2.0:nameid-format:transient -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.class 3 | *.iml 4 | *.idea 5 | tmp 6 | 7 | # Package Files # 8 | *.jar 9 | *.war 10 | *.ear 11 | /xsd/ 12 | /bin/ 13 | /.gradle/ 14 | /build/ 15 | /out/ 16 | 17 | example/books\.avro 18 | /.project 19 | -------------------------------------------------------------------------------- /src/test/resources/avsc.yml: -------------------------------------------------------------------------------- 1 | dynamic: true 2 | dynamicSource: ENVIRONMENT 3 | debug: true 4 | baseDir: "src/test/resources" 5 | namespaces: true 6 | XSD: 7 | xsdFile: "books.xsd" 8 | avscFile: "books.avsc" 9 | rebuildChoice: true 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/test/resources/books1.xsd: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Mon Jul 22 11:43:36 CEST 2019 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.3-all.zip 7 | -------------------------------------------------------------------------------- /example/config.yml: -------------------------------------------------------------------------------- 1 | baseDir: "example" 2 | debug: false 3 | XSD: 4 | xsdFile: "books.xsd" 5 | avscFile: "books.avsc" 6 | stringTimestamp: true 7 | attributePrefix: "_" 8 | 9 | XML: 10 | xmlInput: "books.xml" 11 | 
avroOutput: "books.avro" 12 | documentRootTag: "books" 13 | avscFile: "books.avsc" 14 | validationXSD: "books.xsd" 15 | ignoreMissing: false 16 | caseSensitive: true -------------------------------------------------------------------------------- /src/test/resources/avro.yml: -------------------------------------------------------------------------------- 1 | dynamic: true 2 | dynamicSource: ENVIRONMENT 3 | debug: false 4 | baseDir: "test/resources" 5 | namespaces: true 6 | 7 | XML: 8 | xmlFile: "books.xml" 9 | avscFile: "books.avsc" 10 | avroFile: "books.avro" 11 | validationXSD: "books.xsd" 12 | splitBy: "book" 13 | ignoreWarnings: true 14 | streamingInput: true 15 | caseSensitive: true 16 | ignoreCaseFor: 17 | - "Something" 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/test/resources/both.yml: -------------------------------------------------------------------------------- 1 | dynamic: true 2 | dynamicSource: ENVIRONMENT 3 | debug: false 4 | baseDir: "test/resources" 5 | namespaces: true 6 | XSD: 7 | xsdFile: "books.xsd" 8 | avscFile: "books.avsc" 9 | rebuildChoice: true 10 | 11 | XML: 12 | xmlFile: "books.xml" 13 | avscFile: "books.avsc" 14 | avroFile: "books.avro" 15 | validationXSD: "books.xsd" 16 | splitBy: "book" 17 | ignoreWarnings: true 18 | streamingInput: true 19 | caseSensitive: true 20 | ignoreCaseFor: 21 | - "Something" 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/test/resources/xml/iam/AuthnRequest.xml: -------------------------------------------------------------------------------- 1 | 9 | https://sp.example.com/SAML2 10 | 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gradle:6.3.0 AS build 2 | ARG VERSION 3 | ENV VERSION ${VERSION:-1.8.2} 4 | 5 | RUN cd /tmp && \ 6 | 
wget https://github.com/GeethanadhP/xml-avro/archive/${VERSION}.zip && \ 7 | unzip ${VERSION}.zip && \ 8 | cp -a /tmp/xml-avro-${VERSION}/* /home/gradle/ && \ 9 | sed -i 's/http:/https:/g' /home/gradle/build.gradle && \ 10 | chown -R gradle:gradle /home/gradle 11 | 12 | 13 | WORKDIR /home/gradle 14 | RUN gradle build --no-daemon 15 | 16 | FROM openjdk:8-jre-slim 17 | ARG VERSION 18 | ENV VERSION ${VERSION:-1.8.2} 19 | 20 | RUN mkdir /app 21 | WORKDIR /app 22 | 23 | COPY --from=build /home/gradle/build/libs/xml-avro-all-${VERSION}.jar /app/xml-avro.jar 24 | 25 | CMD ["java", "-XX:+UnlockExperimentalVMOptions", "-XX:+UseCGroupMemoryLimitForHeap", "-Djava.security.egd=file:/dev/./urandom","-jar","/app/xml-avro.jar", "-c", "config.yml"] 26 | -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/Validator.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro 2 | 3 | import javax.xml.XMLConstants 4 | import javax.xml.transform.stream.StreamSource 5 | import javax.xml.validation.SchemaFactory 6 | import org.xml.sax.SAXException 7 | 8 | object Validator { 9 | 10 | def validate(xmlFile: String, xsdFile: String): Boolean = { 11 | try { 12 | val schema = SchemaFactory 13 | .newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) 14 | .newSchema(new StreamSource("i180.xsd")) 15 | val validator = schema.newValidator() 16 | validator.validate(new StreamSource(xmlFile)) 17 | } catch { 18 | case ex: SAXException => ex.printStackTrace(); return false 19 | case ex: Exception => ex.printStackTrace() 20 | } 21 | true 22 | } 23 | 24 | def main(args: Array[String]) { 25 | println(validate("i180.xml", "i180.xsd")) 26 | } 27 | } -------------------------------------------------------------------------------- /example/books.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | Brandon Sanderson 6 | Mistborn 7 | Fantasy 8 
| 50 9 | 2006-12-17T09:30:47.0Z 10 | 11 | Wonderful 12 | I love the plot twist and the new magic 13 | 14 | 15 | Unbelievable twist 16 | The best book i ever read 17 | 18 | 10 19 | 20 | 21 | Brandon Sanderson 22 | Way of Kings 23 | Fantasy 24 | 50 25 | 2006-12-17T09:30:47.0Z 26 | 27 | 28 | 29 | 30 | 31 | 32 | 10 33 | 34 | -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/Converter.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro 2 | 3 | import java.io._ 4 | 5 | import in.dreamlabs.xmlavro.config.{Config, ConfigParser, XMLConfig, XSDConfig} 6 | 7 | /** 8 | * Created by Royce on 22/12/2016. 9 | */ 10 | class Converter(val config: Config) { 11 | 12 | if (config.XSD isDefined) 13 | convertXSD(config.XSD.get) 14 | if (config.XML isDefined) { 15 | val xConfig = config.XML.get 16 | if (!xConfig.streamingInput) 17 | Utils.info("Converting: " + xConfig.xmlFile + " -> " + xConfig.avroFile) 18 | convertXML(xConfig) 19 | } 20 | 21 | @throws[IOException] 22 | private def convertXSD(xConfig: XSDConfig) { 23 | Utils.info("Converting: " + xConfig.xsdFile + " -> " + xConfig.avscFile) 24 | val schemaBuilder = SchemaBuilder(xConfig) 25 | schemaBuilder createSchema() 26 | } 27 | 28 | private def convertXML(xConfig: XMLConfig) { 29 | Utils.profile("Avro Conversion") { 30 | val builder = new AvroBuilder(xConfig) 31 | builder.createDatums() 32 | } 33 | } 34 | } 35 | 36 | object Converter { 37 | @throws[IOException] 38 | def main(args: Array[String]): Unit = { 39 | val config = try { 40 | if (args isEmpty) 41 | throw new IllegalArgumentException("No Arguments specified") 42 | else ConfigParser apply args 43 | } catch { 44 | case e: IllegalArgumentException => 45 | Utils.log("ERROR", 46 | "XML Avro converter\nError: " + e.getMessage + "\n\n" + ConfigParser.USAGE + "\n") 47 | System.exit(1) 48 | } 49 | Converter apply config.asInstanceOf[ConfigParser] 50 
| } 51 | 52 | def apply(config: ConfigParser): Converter = new Converter(config.config) 53 | } 54 | -------------------------------------------------------------------------------- /src/test/resources/xml/iam/ArtifactResponse.xml: -------------------------------------------------------------------------------- 1 | 7 | 8 | 9 | 10 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | Hello, World! 21 | 22 | 23 | 25 | 26 | 35 | https://sp.example.com/SAML2 36 | 39 | 40 | -------------------------------------------------------------------------------- /src/test/resources/old_books.json: -------------------------------------------------------------------------------- 1 | { 2 | "book": [ 3 | { 4 | "id": "b001", 5 | "others": { 6 | "ot:random": "true" 7 | }, 8 | "author": "Brandon Sanderson", 9 | "title": "Mistborn", 10 | "genre": "Fantasy", 11 | "price": [], 12 | "pub_date": 1166347847000, 13 | "review": [ 14 | { 15 | "title": "Unbeliveable twitst", 16 | "content": null 17 | }, 18 | { 19 | "title": null, 20 | "content": "The best book i ever read" 21 | } 22 | ], 23 | "type0": [], 24 | "type2": [], 25 | "type6": null, 26 | "sold": [] 27 | }, 28 | { 29 | "id": "b002", 30 | "others": { 31 | "alias3": "A3 Angleso and Demenso", 32 | "alias2": "\r\n A2 Angleso and Demenso<\/title>\r\n " 33 | }, 34 | "author": "Dan Brown", 35 | "title": "Angels and Demons", 36 | "genre": "Mystery Thriller", 37 | "price": [], 38 | "pub_date": 1040117447000, 39 | "review": [ 40 | { 41 | "title": "Fast paced mystery", 42 | "content": null 43 | }, 44 | { 45 | "title": null, 46 | "content": "a good one i would say" 47 | } 48 | ], 49 | "type0": [ 50 | { 51 | "alias": null, 52 | "website": { 53 | "url": [] 54 | } 55 | } 56 | ], 57 | "type2": [], 58 | "type6": null, 59 | "sold": null 60 | }, 61 | { 62 | "id": "b003", 63 | "others": {}, 64 | "author": "Dan Brown", 65 | "title": "Digital Fortress", 66 | "genre": "Mystery Thriller", 67 | "price": null, 68 | "pub_date": 1071653447000, 69 | "review": [ 70 | { 71 | 
"title": "Best SciFi Thriller3", 72 | "content": null 73 | } 74 | ], 75 | "type0": [ 76 | { 77 | "alias": { 78 | "title": "Encryto", 79 | "language": [] 80 | }, 81 | "website": null 82 | }, 83 | { 84 | "alias": null, 85 | "website": { 86 | "url": [] 87 | } 88 | } 89 | ], 90 | "type2": [], 91 | "type6": null, 92 | "sold": [] 93 | } 94 | ] 95 | } -------------------------------------------------------------------------------- /example/books.xsd: -------------------------------------------------------------------------------- 1 | <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:bks="urn:books" xmlns:books="http://www.books.com/XML" 2 | targetNamespace="urn:books"> 3 | <xsd:element name="books" type="bks:BooksForm"/> 4 | <xsd:complexType name="BooksForm"> 5 | <xsd:sequence> 6 | <xsd:element name="book" type="bks:BookForm" maxOccurs="unbounded"/> 7 | </xsd:sequence> 8 | </xsd:complexType> 9 | <xsd:complexType name="BookForm"> 10 | <xsd:sequence> 11 | <xsd:element name="author" type="xsd:string"/> 12 | <xsd:element name="title" type="xsd:string"/> 13 | <xsd:element name="genre" type="xsd:string"/> 14 | <xsd:element name="price" type="bks:PriceType" minOccurs="0" maxOccurs="3"/> 15 | <xsd:element name="pub_date" type="xsd:dateTime" minOccurs="0"/> 16 | <xsd:choice maxOccurs="unbounded"> 17 | <xsd:element name="review" type="bks:ReviewType"/> 18 | <xsd:sequence> 19 | <xsd:element name="alias" type="bks:AliasType"/> 20 | <xsd:element name="website" type="bks:WebsiteType"/> 21 | </xsd:sequence> 22 | <xsd:element name="sold" type="xsd:integer"/> 23 | </xsd:choice> 24 | <xsd:any namespace="##other" processContents="lax" minOccurs="0" maxOccurs="unbounded"/> 25 | </xsd:sequence> 26 | <xsd:attribute name="id" type="xsd:string"/> 27 | <xsd:anyAttribute namespace="##other" processContents="lax"/> 28 | </xsd:complexType> 29 | <xsd:complexType name="AliasType"> 30 | <xsd:sequence> 31 | <xsd:element name="title" type="xsd:string"/> 32 | <xsd:element name="language" 
type="xsd:string" minOccurs="0" maxOccurs="unbounded"/> 33 | </xsd:sequence> 34 | </xsd:complexType> 35 | <xsd:complexType name="ReviewType"> 36 | <xsd:sequence> 37 | <xsd:element name="title" type="xsd:string"/> 38 | <xsd:element name="content" type="xsd:string" minOccurs="0"/> 39 | </xsd:sequence> 40 | </xsd:complexType> 41 | <xsd:complexType name="WebsiteType"> 42 | <xsd:sequence> 43 | <xsd:element name="url" type="xsd:string" minOccurs="0" maxOccurs="unbounded"/> 44 | </xsd:sequence> 45 | </xsd:complexType> 46 | <xsd:complexType name="PriceType"> 47 | <xsd:simpleContent> 48 | <xsd:extension base="xsd:decimal"> 49 | <xsd:attribute name="currency" type="xsd:string" use="optional"/> 50 | </xsd:extension> 51 | </xsd:simpleContent> 52 | </xsd:complexType> 53 | </xsd:schema> -------------------------------------------------------------------------------- /src/test/resources/books.xml.bkp: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <!--Sample XML file generated by XMLSpy v2008 rel. 
2 sp2 (http://www.altova.com)--> 3 | <bks:books xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:bks="urn:books" 4 | xsi:schemaLocation="urn:books C:\Geetha\xsd_sales\books.xsd" xmlns:ot="http://maven.apache.org/POM/4.0.0"> 5 | <book id="b001" ot:random="true"> 6 | <author>Brandon Sanderson</author> 7 | <title>Mistborn 8 | Fantasy 9 | 50 10 | 2006-12-17T09:30:47.0Z 11 | 12 | Wonderful 13 | I love the plot twitst and the new magic 14 | 15 | 16 | Unbeliveable twitst 17 | The best book i ever read 18 | 19 | 10 20 | 21 | 22 | Dan Brown 23 | Angels and Demons 24 | Mystery Thriller 25 | 52 26 | 2002-12-17T09:30:47.0Z 27 | 28 | Good Thriller 29 | 30 | 31 | Fast paced mystery 32 | a good one i would say 33 | 34 | 35 | A2 Angleso and Demenso 36 | 37 | A3 Angleso and Demenso 38 | 39 | www.danbrown.com 40 | 41 | 42 | 43 | Dan Brown 44 | Digital Fortress 45 | Mystery Thriller 46 | 2003-12-17T09:30:47.0Z 47 | 48 | Best SciFi Thriller 49 | 50 | 51 | Best SciFi Thriller2 52 | 53 | 54 | Best SciFi Thriller3 55 | 56 | 57 | Encryto 58 | Italian 59 | French 60 | 61 | 62 | www.danbrown.com 63 | 64 | 23 65 | 66 | 67 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /src/test/resources/books.xsd: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/test/resources/new_books.json: -------------------------------------------------------------------------------- 1 | { 2 | "book": [ 3 | { 4 | "id": "b001", 5 | "others": { 6 | "ot:random": "true" 7 | }, 8 | "author": "Brandon Sanderson", 9 | "title": "Mistborn", 10 | "genre": "Fantasy", 11 | "price": [ 12 | { 13 | "currency": null, 14 | "text_value": 50.0 15 | } 16 | ], 17 | "pub_date": 1166347847000, 18 | "review": [ 19 | { 20 | "title": "Wonderful", 21 | "content": "I love the plot twitst and the new magic" 22 | }, 23 | { 24 | "title": "Unbeliveable twitst", 25 | "content": "The best book i ever read" 26 | } 27 | ], 28 | "type0": [], 29 | "type2": [], 30 | "type6": null, 31 | "sold": "10" 32 | }, 33 | { 34 | "id": "b002", 35 | "others": {}, 36 | "author": "Dan Brown", 37 | "title": "Angels and Demons", 38 | "genre": "Mystery Thriller", 39 | "price": [ 40 | { 41 | "currency": "EUR", 42 | "text_value": null 43 | }, 44 | { 45 | "currency": null, 46 | "text_value": 52.0 47 | } 48 | ], 49 | "pub_date": 1040117447000, 50 | "review": [ 51 | { 52 | "title": "Good Thriller", 53 | "content": null 54 | }, 55 | { 56 | "title": "Fast paced mystery", 57 | "content": null 58 | }, 59 | { 60 | "title": null, 61 | "content": "a good one i would say" 62 | } 63 | ], 64 | "type0": [ 65 | { 66 | "alias": 
null, 67 | "website": { 68 | "url": "www.danbrown.com" 69 | } 70 | } 71 | ], 72 | "type2": [ 73 | { 74 | "type4": [ 75 | { 76 | "alias2": { 77 | "title": "A2 Angleso and Demenso", 78 | "language": [] 79 | }, 80 | "website2": null 81 | } 82 | ] 83 | } 84 | ], 85 | "type6": { 86 | "alias3": "A3 Angleso and Demenso" 87 | }, 88 | "sold": [] 89 | }, 90 | { 91 | "id": "b003", 92 | "others": {}, 93 | "author": "Dan Brown", 94 | "title": "Digital Fortress", 95 | "genre": "Mystery Thriller", 96 | "price": [], 97 | "pub_date": 1071653447000, 98 | "review": [ 99 | { 100 | "title": "Best SciFi Thriller", 101 | "content": null 102 | }, 103 | { 104 | "title": "Best SciFi Thriller2", 105 | "content": null 106 | }, 107 | { 108 | "title": "Best SciFi Thriller3", 109 | "content": null 110 | } 111 | ], 112 | "type0": [ 113 | { 114 | "alias": { 115 | "title": "Encryto", 116 | "language": [] 117 | }, 118 | "website": null 119 | }, 120 | { 121 | "alias": { 122 | "title": null, 123 | "language": "French" 124 | }, 125 | "website": null 126 | }, 127 | { 128 | "alias": null, 129 | "website": { 130 | "url": "www.danbrown.com" 131 | } 132 | } 133 | ], 134 | "type2": [], 135 | "type6": null, 136 | "sold": "23" 137 | } 138 | ] 139 | } 140 | -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/config/ArgParse.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro.config 2 | 3 | import javax.xml.namespace.QName 4 | 5 | import scala.collection.immutable.List 6 | import scala.collection.mutable 7 | import scala.reflect.io.Path 8 | import scala.reflect.runtime.universe._ 9 | 10 | /** 11 | * Created by Royce on 02/02/2017. 
12 | */ 13 | class ArgParse(args: Seq[String]) { 14 | private val argsMap = { 15 | val map = mutable.Map[String, List[String]]() 16 | val len = args.length 17 | var i = 0 18 | while (i < len) { 19 | val arg = args(i) 20 | if (arg.startsWith("--") || (arg.startsWith("-") && arg.length == 2)) { 21 | val name = arg stripPrefix "-" stripPrefix "-" 22 | val values = mutable.ListBuffer[String]() 23 | while (i + 1 < len && !args(i + 1).startsWith("-")) { 24 | i += 1 25 | values += args(i) 26 | } 27 | map += (name -> values.toList) 28 | } 29 | i += 1 30 | } 31 | map 32 | } 33 | 34 | 35 | def opt[T: TypeTag](name: String, short: Char): Option[T] = { 36 | if (argsMap.contains(name) || argsMap.contains(short + "")) { 37 | val key = if (argsMap contains name) name else short + "" 38 | val values = argsMap(key) 39 | try 40 | Some(value[T](values)) 41 | catch { 42 | case e: IllegalArgumentException => throw new IllegalArgumentException(s"${e.getMessage} for $key", e) 43 | } 44 | } else None 45 | } 46 | 47 | private def value[T: TypeTag](original: List[String]): T = { 48 | typeOf[T] match { 49 | case t if t =:= typeOf[String] => original.fetch().asInstanceOf[T] 50 | case t if t =:= typeOf[Int] => original.fetch().toInt.asInstanceOf[T] 51 | case t if t =:= typeOf[Double] => original.fetch().toDouble.asInstanceOf[T] 52 | case t if t =:= typeOf[Boolean] => original.fetch().toBoolean.asInstanceOf[T] 53 | case t if t =:= typeOf[Path] => Path(original.fetch()).asInstanceOf[T] 54 | case t if t =:= typeOf[List[String]] => original.validate().asInstanceOf[T] 55 | case t if t =:= typeOf[List[Int]] => original.validate().map(value => value.toInt).asInstanceOf[T] 56 | case t if t =:= typeOf[List[Double]] => original.validate().map(value => value.toDouble).asInstanceOf[T] 57 | case t if t =:= typeOf[List[Boolean]] => original.validate().map(value => value.toBoolean).asInstanceOf[T] 58 | case t if t =:= typeOf[List[Path]] => original.validate().map(value => Path(value)).asInstanceOf[T] 59 | 
case t if t =:= typeOf[QName] => QName.valueOf(original.fetch()).asInstanceOf[T] 60 | case other => throw new IllegalArgumentException(s"Type $other is not yet supported") 61 | } 62 | } 63 | def toggle(name: String, short: Char): Option[Boolean] = { 64 | if (argsMap.contains(name) || argsMap.contains(short + "")) { 65 | val key = if (argsMap contains name) name else short + "" 66 | val values = argsMap(key) 67 | if (values.nonEmpty) 68 | throw new IllegalArgumentException(s"Too many values provided for $key") 69 | else 70 | Some(true) 71 | } else None 72 | } 73 | 74 | implicit class MyList[String](list: List[String]) { 75 | def fetch(): String = { 76 | if (list.length > 1) 77 | throw new IllegalArgumentException(s"Too many values provided") 78 | else if (list.isEmpty) 79 | throw new IllegalArgumentException(s"Too less values provided") 80 | else list.head 81 | } 82 | 83 | def validate(): List[String] = { 84 | if (list.isEmpty) 85 | throw new IllegalArgumentException(s"Too less values provided") 86 | else list 87 | } 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /example/books.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "record", 3 | "name" : "BooksForm", 4 | "fields" : [ { 5 | "name" : "book", 6 | "type" : { 7 | "type" : "array", 8 | "items" : { 9 | "type" : "record", 10 | "name" : "BookForm", 11 | "fields" : [ { 12 | "name" : "author", 13 | "type" : "string", 14 | "source" : "element author" 15 | }, { 16 | "name" : "title", 17 | "type" : "string", 18 | "source" : "element title" 19 | }, { 20 | "name" : "genre", 21 | "type" : "string", 22 | "source" : "element genre" 23 | }, { 24 | "name" : "price", 25 | "type" : [ "null", { 26 | "type" : "array", 27 | "items" : { 28 | "type" : "record", 29 | "name" : "PriceType", 30 | "fields" : [ { 31 | "name" : "text_value", 32 | "type" : [ "null", "double" ], 33 | "source" : "element text_value" 34 | }, { 35 
| "name" : "_currency", 36 | "type" : [ "null", "string" ], 37 | "default" : null, 38 | "source" : "attribute currency" 39 | } ] 40 | } 41 | } ], 42 | "default" : null, 43 | "source" : "element price" 44 | }, { 45 | "name" : "pub_date", 46 | "type" : [ "null", "string" ], 47 | "default" : null, 48 | "source" : "element pub_date" 49 | }, { 50 | "name" : "review", 51 | "type" : [ "null", { 52 | "type" : "array", 53 | "items" : { 54 | "type" : "record", 55 | "name" : "ReviewType", 56 | "fields" : [ { 57 | "name" : "title", 58 | "type" : "string", 59 | "source" : "element title" 60 | }, { 61 | "name" : "content", 62 | "type" : [ "null", "string" ], 63 | "default" : null, 64 | "source" : "element content" 65 | } ] 66 | } 67 | } ], 68 | "default" : null, 69 | "source" : "element review" 70 | }, { 71 | "name" : "type0", 72 | "type" : { 73 | "type" : "array", 74 | "items" : { 75 | "type" : "record", 76 | "name" : "type1", 77 | "fields" : [ { 78 | "name" : "alias", 79 | "type" : { 80 | "type" : "record", 81 | "name" : "AliasType", 82 | "fields" : [ { 83 | "name" : "title", 84 | "type" : "string", 85 | "source" : "element title" 86 | }, { 87 | "name" : "language", 88 | "type" : [ "null", { 89 | "type" : "array", 90 | "items" : "string" 91 | } ], 92 | "default" : null, 93 | "source" : "element language" 94 | } ] 95 | }, 96 | "source" : "element alias" 97 | }, { 98 | "name" : "website", 99 | "type" : { 100 | "type" : "record", 101 | "name" : "WebsiteType", 102 | "fields" : [ { 103 | "name" : "url", 104 | "type" : [ "null", { 105 | "type" : "array", 106 | "items" : "string" 107 | } ], 108 | "default" : null, 109 | "source" : "element url" 110 | } ] 111 | }, 112 | "source" : "element website" 113 | } ] 114 | } 115 | } 116 | }, { 117 | "name" : "sold", 118 | "type" : [ "null", { 119 | "type" : "array", 120 | "items" : "string" 121 | } ], 122 | "default" : null, 123 | "source" : "element sold" 124 | }, { 125 | "name" : "others", 126 | "type" : { 127 | "type" : "map", 128 | 
"values" : "string" 129 | } 130 | }, { 131 | "name" : "_id", 132 | "type" : [ "null", "string" ], 133 | "default" : null, 134 | "source" : "attribute id" 135 | } ] 136 | } 137 | }, 138 | "source" : "element book" 139 | } ] 140 | } -------------------------------------------------------------------------------- /src/test/resources/book.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "BookForm", 4 | "fields": [ 5 | { 6 | "name": "id", 7 | "type": [ 8 | "null", 9 | "string" 10 | ], 11 | "source": "attribute id" 12 | }, 13 | { 14 | "name": "others", 15 | "type": { 16 | "type": "map", 17 | "values": "string" 18 | } 19 | }, 20 | { 21 | "name": "author", 22 | "type": "string", 23 | "source": "element author" 24 | }, 25 | { 26 | "name": "title", 27 | "type": "string", 28 | "source": "element title" 29 | }, 30 | { 31 | "name": "genre", 32 | "type": "string", 33 | "source": "element genre" 34 | }, 35 | { 36 | "name": "price", 37 | "type": { 38 | "type": "array", 39 | "items": { 40 | "type": "record", 41 | "name": "PriceType", 42 | "fields": [ 43 | { 44 | "name": "currency", 45 | "type": [ 46 | "null", 47 | "string" 48 | ], 49 | "source": "attribute currency" 50 | }, 51 | { 52 | "name": "text_value", 53 | "type": [ 54 | "null", 55 | "double" 56 | ], 57 | "source": "element text_value" 58 | } 59 | ] 60 | } 61 | }, 62 | "source": "element price" 63 | }, 64 | { 65 | "name": "pub_date", 66 | "type": [ 67 | "null", 68 | "long" 69 | ], 70 | "source": "element pub_date", 71 | "comment": "timestamp" 72 | }, 73 | { 74 | "name": "type0", 75 | "type": { 76 | "type": "array", 77 | "items": { 78 | "type": "record", 79 | "name": "type1", 80 | "fields": [ 81 | { 82 | "name": "review", 83 | "type": [ 84 | "null", 85 | { 86 | "type": "record", 87 | "name": "ReviewType", 88 | "fields": [ 89 | { 90 | "name": "title", 91 | "type": "string", 92 | "source": "element title" 93 | }, 94 | { 95 | "name": "content", 96 | 
"type": [ 97 | "null", 98 | "string" 99 | ], 100 | "source": "element content" 101 | } 102 | ] 103 | } 104 | ], 105 | "source": "element review" 106 | }, 107 | { 108 | "name": "alias", 109 | "type": [ 110 | "null", 111 | { 112 | "type": "record", 113 | "name": "AliasType", 114 | "fields": [ 115 | { 116 | "name": "title", 117 | "type": "string", 118 | "source": "element title" 119 | }, 120 | { 121 | "name": "language", 122 | "type": { 123 | "type": "array", 124 | "items": "string" 125 | }, 126 | "source": "element language" 127 | } 128 | ] 129 | } 130 | ], 131 | "source": "element alias" 132 | }, 133 | { 134 | "name": "website", 135 | "type": [ 136 | "null", 137 | { 138 | "type": "record", 139 | "name": "WebsiteType", 140 | "fields": [ 141 | { 142 | "name": "url", 143 | "type": { 144 | "type": "array", 145 | "items": "string" 146 | }, 147 | "source": "element url" 148 | } 149 | ] 150 | } 151 | ], 152 | "source": "element website" 153 | }, 154 | { 155 | "name": "sold", 156 | "type": [ 157 | "null", 158 | "string" 159 | ], 160 | "source": "element sold" 161 | } 162 | ] 163 | } 164 | } 165 | } 166 | ] 167 | } -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/config/ConfigParser.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro.config 2 | 3 | import in.dreamlabs.xmlavro.ConversionException 4 | import javax.xml.namespace.QName 5 | import org.yaml.snakeyaml.Yaml 6 | import org.yaml.snakeyaml.constructor.Constructor 7 | 8 | import scala.collection.mutable 9 | import scala.reflect.io.Path 10 | 11 | /** 12 | * Created by Royce on 21/12/2016. 
13 | */ 14 | class ConfigParser(args: Seq[String]) extends ArgParse(args) { 15 | 16 | val config: Config = { 17 | val configFile = opt[Path]("config", 'c') 18 | if (configFile isDefined) { 19 | fetchConfig(configFile get) 20 | } else { 21 | new Config 22 | } 23 | } 24 | 25 | processArgs() 26 | config.validate() 27 | 28 | private def processArgs(): Unit = { 29 | val debug = toggle("debug", 'd') 30 | val baseDir = opt[Path]("baseDir", 'b') 31 | val stream = toggle("stream", 's') 32 | val xsd = opt[List[Path]]("toAvsc", 'd') 33 | val xml = opt[List[Path]]("toAvro", 'x') 34 | val splitBy = opt[String]("splitBy", 'y') 35 | val ignoreMissing = toggle("ignoreMissing", 'i') 36 | val validateSchema = opt[Path]("validateSchema", 'v') 37 | val ignoreHiveKeywords = toggle("ignoreHiveKeywords", 'h') 38 | val rootElementQName = opt[QName]("rootElementQName", 'r') 39 | 40 | if (debug isDefined) config.debug = debug.get 41 | if (baseDir isDefined) config.baseDir = baseDir 42 | if (xsd isDefined) { 43 | val tempConfig = 44 | if (config.XSD isDefined) config.XSD.get 45 | else { 46 | val temp = new XSDConfig 47 | config.XSD = Option(temp) 48 | temp 49 | } 50 | if (ignoreHiveKeywords isDefined) tempConfig.ignoreHiveKeywords = ignoreHiveKeywords.get 51 | tempConfig.rootElementQName = rootElementQName 52 | val temp = xsd.get 53 | tempConfig.xsdFile = temp.head 54 | if (temp.length > 1) tempConfig.avscFile = temp(1) 55 | if (temp.length > 2) 56 | throw new IllegalArgumentException( 57 | "Too many values provided for xsd option") 58 | } 59 | if (xml isDefined) { 60 | val tempConfig = 61 | if (config.XML isDefined) config.XML.get 62 | else { 63 | val temp = new XMLConfig 64 | config.XML = Option(temp) 65 | temp 66 | } 67 | val temp = xml.get 68 | tempConfig.avscFile = temp.head 69 | if (stream.isDefined && stream.get) { 70 | tempConfig.xmlInput = "stdin" 71 | tempConfig.avroOutput = "stdout" 72 | } else { 73 | if (temp.length > 1) tempConfig.xmlFile = temp(1) 74 | if (temp.length > 2) 
tempConfig.avroFile = temp(2) 75 | if (temp.length > 3) 76 | throw new IllegalArgumentException( 77 | "Too many values provided for xml option") 78 | } 79 | tempConfig.documentRootTag = "" 80 | if (splitBy isDefined) tempConfig.splitBy = splitBy.get 81 | if (ignoreMissing isDefined) tempConfig.ignoreMissing = ignoreMissing.get 82 | if (validateSchema isDefined) 83 | tempConfig.validationXSD = validateSchema 84 | } 85 | } 86 | 87 | private def fetchConfig(configFile: Path): Config = { 88 | val configReader = configFile.toFile.bufferedReader() 89 | val configData = StringBuilder.newBuilder 90 | var line = configReader.readLine() 91 | val pattern = "\\$\\{(.+?)\\}".r 92 | while (line != null) { 93 | val matches = pattern.findAllMatchIn(line) 94 | matches.foreach { 95 | tempMatch => 96 | try line = line.replace(tempMatch.matched, sys.env(tempMatch.group(1))) 97 | catch { 98 | case _: NoSuchElementException => throw ConversionException(tempMatch.group(1) + " is not found in the environment variables") 99 | } 100 | } 101 | configData append line + "\n" 102 | line = configReader.readLine() 103 | } 104 | val obj = new Yaml(new Constructor(classOf[Config])) load configData.mkString 105 | obj.asInstanceOf[Config] 106 | } 107 | } 108 | 109 | object ConfigParser { 110 | val USAGE1 = 111 | "{-d|--debug} {-b|--baseDir } -xsd|--toAvsc {} {-h|--ignoreHiveKeywords} {-r|rootElementQName }" 112 | val USAGE2 = 113 | "{-b|--baseDir } {-s|--stream|--stdout} -xml|--toAvro {} {} {-sb|--splitby } {-i|--ignoreMissing} {-v|--validateSchema }" 114 | val USAGE3 = 115 | "{-d|--debug} {-b|--baseDir } {-xsd|--toAvsc {} {-h|--ignoreHiveKeywords} {-r|rootElementQName }} {-s|--stream|--stdout} {-xml|--toAvro {} {} {-sb|--splitby }} {-i|--ignoreMissing}" 116 | val USAGE: String = "XSD to AVSC Usage : " + USAGE1 + "\nXML to AVRO Usage : " + USAGE2 + "\nMixed Usage : " + USAGE3 117 | 118 | def apply(args: Array[String]): ConfigParser = new ConfigParser(args) 119 | } 120 | 
-------------------------------------------------------------------------------- /src/test/resources/books.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | Brandon Sanderson 7 | Mistborn 8 | Fantasy 9 | 50 10 | 2006-12-17T09:30:47.0Z 11 | 12 | Wonderful 13 | I love the plot twitst and the new magic 14 | 15 | 16 | Unbeliveable twitst 17 | The best book i ever read 18 | 19 | 10 20 | 21 | 22 | Dan Brown 23 | Angels and Demons 24 | Mystery Thriller 25 | 52 26 | 2002-12-17T09:30:47.0Z 27 | 28 | Good Thriller 29 | 30 | 31 | Fast paced mystery 32 | a good one i would say 33 | 34 | 35 | A2 Angleso and Demenso 36 | 37 | A3 Angleso and Demenso 38 | 39 | www.danbrown.com 40 | 41 | 42 | 43 | Dan Brown 44 | Digital Fortress 45 | Mystery Thriller 46 | 2003-12-17T09:30:47.0Z 47 | 48 | Best SciFi Thriller 49 | 50 | 51 | Best SciFi Thriller2 52 | 53 | 54 | Best SciFi Thriller3 55 | 56 | 57 | Encryto 58 | Italian 59 | French 60 | 61 | 62 | www.danbrown.com 63 | 64 | 23 65 | 66 | 67 | Dan Brown 68 | Digital Fortress 69 | Mystery Thriller 70 | 2003-12-17T09:30:47.0Z 71 | 72 | Best SciFi Thriller 73 | 74 | 75 | Best SciFi Thriller2 76 | 77 | 78 | Best SciFi Thriller3 79 | 80 | 81 | Encryto 82 | Italian 83 | French 84 | 85 | 86 | www.danbrown.com 87 | 88 | 23 89 | 90 | 91 | Dan Brown 92 | Digital Fortress 93 | Mystery Thriller 94 | 2003-12-17T09:30:47.0Z 95 | 96 | Best SciFi Thriller 97 | 98 | 99 | Best SciFi Thriller2 100 | 101 | 102 | Best SciFi Thriller3 103 | 104 | 105 | Encryto 106 | Italian 107 | French 108 | 109 | 110 | www.danbrown.com 111 | 112 | 23 113 | 114 | 115 | Dan Brown 116 | Digital Fortress 117 | Mystery Thriller 118 | 2003-12-17T09:30:47.0Z 119 | 120 | Best SciFi Thriller 121 | 122 | 123 | Best SciFi Thriller2 124 | 125 | 126 | Best SciFi Thriller3 127 | 128 | 129 | Encryto 130 | Italian 131 | French 132 | 133 | 134 | www.danbrown.com 135 | 136 | 23 137 | 138 | 139 | 140 | 
-------------------------------------------------------------------------------- /src/test/resources/books.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "record", 3 | "name" : "type2", 4 | "fields" : [ { 5 | "name" : "books", 6 | "type" : [ "null", { 7 | "type" : "record", 8 | "name" : "BooksForm", 9 | "fields" : [ { 10 | "name" : "book", 11 | "type" : { 12 | "type" : "array", 13 | "items" : { 14 | "type" : "record", 15 | "name" : "BookForm", 16 | "fields" : [ { 17 | "name" : "id", 18 | "type" : [ "null", "string" ], 19 | "source" : "attribute id" 20 | }, { 21 | "name" : "others", 22 | "type" : { 23 | "type" : "map", 24 | "values" : "string" 25 | } 26 | }, { 27 | "name" : "author", 28 | "type" : "string", 29 | "source" : "element author" 30 | }, { 31 | "name" : "title", 32 | "type" : "string", 33 | "source" : "element title" 34 | }, { 35 | "name" : "genre", 36 | "type" : "string", 37 | "source" : "element genre" 38 | }, { 39 | "name" : "price", 40 | "type" : [ "null", { 41 | "type" : "array", 42 | "items" : { 43 | "type" : "record", 44 | "name" : "PriceType", 45 | "fields" : [ { 46 | "name" : "currency", 47 | "type" : [ "null", "string" ], 48 | "source" : "attribute currency" 49 | }, { 50 | "name" : "text_value", 51 | "type" : [ "null", "double" ], 52 | "source" : "element text_value" 53 | } ] 54 | } 55 | } ], 56 | "source" : "element price" 57 | }, { 58 | "name" : "pub_date", 59 | "type" : [ "null", "long" ], 60 | "source" : "element pub_date", 61 | "comment" : "timestamp" 62 | }, { 63 | "name" : "review", 64 | "type" : [ "null", { 65 | "type" : "array", 66 | "items" : { 67 | "type" : "record", 68 | "name" : "ReviewType", 69 | "fields" : [ { 70 | "name" : "title", 71 | "type" : "string", 72 | "source" : "element title" 73 | }, { 74 | "name" : "content", 75 | "type" : [ "null", "string" ], 76 | "source" : "element content" 77 | } ] 78 | } 79 | } ], 80 | "source" : "element review" 81 | }, { 82 | "name" : 
"type0", 83 | "type" : { 84 | "type" : "array", 85 | "items" : { 86 | "type" : "record", 87 | "name" : "type1", 88 | "fields" : [ { 89 | "name" : "alias", 90 | "type" : { 91 | "type" : "record", 92 | "name" : "AliasType", 93 | "fields" : [ { 94 | "name" : "title", 95 | "type" : "string", 96 | "source" : "element title" 97 | }, { 98 | "name" : "language", 99 | "type" : [ "null", { 100 | "type" : "array", 101 | "items" : "string" 102 | } ], 103 | "source" : "element language" 104 | } ] 105 | }, 106 | "source" : "element alias" 107 | }, { 108 | "name" : "website", 109 | "type" : { 110 | "type" : "record", 111 | "name" : "WebsiteType", 112 | "fields" : [ { 113 | "name" : "url", 114 | "type" : [ "null", { 115 | "type" : "array", 116 | "items" : "string" 117 | } ], 118 | "source" : "element url" 119 | } ] 120 | }, 121 | "source" : "element website" 122 | } ] 123 | } 124 | } 125 | }, { 126 | "name" : "sold", 127 | "type" : [ "null", { 128 | "type" : "array", 129 | "items" : "string" 130 | } ], 131 | "source" : "element sold" 132 | } ] 133 | } 134 | }, 135 | "source" : "element book" 136 | } ] 137 | } ], 138 | "source" : "element {urn:main}:books" 139 | }, { 140 | "name" : "author", 141 | "type" : [ "null", "string" ], 142 | "source" : "element {http://www.books.com/XML}:author" 143 | } ], 144 | "source" : "document" 145 | } -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/XMLEvents.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro 2 | 3 | import in.dreamlabs.xmlavro 4 | import in.dreamlabs.xmlavro.RichAvro._ 5 | import in.dreamlabs.xmlavro.Utils._ 6 | import org.apache.avro.Schema 7 | import org.apache.avro.Schema.Type._ 8 | import org.apache.avro.Schema.{Field, Type} 9 | import org.apache.avro.generic.GenericData.Record 10 | 11 | import scala.collection.mutable.ListBuffer 12 | import scala.util.control.Breaks.{break, breakable} 13 | 
14 | /** 15 | * Created by Royce on 13/02/2017. 16 | */ 17 | object XMLEvents { 18 | val PRIMITIVES: List[Type] = 19 | List(STRING, INT, LONG, FLOAT, DOUBLE, BOOLEAN, NULL) 20 | val eleStack: ListBuffer[XNode] = ListBuffer[XNode]() 21 | val schemaPath: ListBuffer[AvroPath] = ListBuffer[AvroPath]() 22 | var rootSchema: Schema = _ 23 | var rootRecord: Record = _ 24 | private var lastSchema = rootSchema 25 | 26 | def setSchema(schema: Schema, record: Record): Unit = { 27 | rootSchema = schema 28 | rootRecord = record 29 | lastSchema = rootSchema 30 | eleStack.clear() 31 | schemaPath.clear() 32 | } 33 | 34 | def addElement(node: XNode): Boolean = { 35 | eleStack.insert(0, node) 36 | 37 | var found = false 38 | if (eleStack.length != 1) { 39 | val (field, path, _) = searchField(lastSchema, node) 40 | if (field isDefined) { 41 | schemaPath ++= path.reverse 42 | updatePath(field.get) 43 | found = true 44 | } else 45 | AvroPath.missing(eleStack) 46 | } else found = true 47 | found 48 | } 49 | 50 | def removeElement(node: XNode): Unit = { 51 | if (node.name != eleStack.head.name) 52 | throw ConversionException(s"No. of closing tags is not matching opening tags when closing ${node.name}, contact the developer") 53 | 54 | eleStack.remove(0) 55 | var count = schemaPath.size 56 | if (count != 0) { 57 | val schemaNodeName = if (SchemaBuilder.HIVE_KEYWORDS.contains(node.name.toUpperCase)) 58 | if (schemaPath.last.name != s"${node.name}_value" && Option(lastSchema.getField(s"${node.name}_value")).isEmpty) { 59 | AvroPath.warning(eleStack, s"${node.name} found in the XML is a Hive keyword, " + 60 | s"but the avsc schema is not modified to fix any possible issues, " + 61 | s"please consider updating it to ${node.name}_value or re-create the avsc with latest jar. 
" + 62 | s"If you updated the avsc make sure you update your table schema as well") 63 | node.name 64 | } else 65 | s"${node.name}_value" 66 | else 67 | node.name 68 | 69 | 70 | if (schemaPath.last.name == schemaNodeName && node.name != eleStack.head.name) { //Complex tag closing 71 | count = destroyLastPath() 72 | while (count != 0 && schemaPath.last.virtual) { 73 | count = destroyLastPath() 74 | } 75 | } else if (schemaPath.last.name.startsWith("type")) { 76 | while (count != 0 && schemaPath.last.virtual) { 77 | count = destroyLastPath() 78 | } 79 | } 80 | 81 | lastSchema = rootRecord.at(schemaPath.toList).getSchema 82 | } 83 | } 84 | 85 | private def destroyLastPath(): Int = { 86 | val tempPath = schemaPath.last 87 | schemaPath -= tempPath 88 | schemaPath size 89 | } 90 | 91 | def searchField( 92 | schema: Schema, 93 | node: XNode): (Option[Field], ListBuffer[AvroPath], Schema) = { 94 | var fieldSchema = schema.simplify 95 | var field = schema.deepSchema.field(node) 96 | val path = ListBuffer[AvroPath]() 97 | 98 | // If field is not a direct child in schema, search through all custom fields 99 | if (field isEmpty) 100 | breakable { 101 | for (typeField <- fieldSchema.customTypeFields()) { 102 | val (resultField, resultPath, resultSchema) = 103 | searchField(typeField.fieldSchema, node) 104 | if (resultField isDefined) { 105 | val (tempPath, tempSchema) = getPath(typeField, virtual = true) 106 | resultPath ++= tempPath 107 | path ++= resultPath 108 | field = resultField 109 | fieldSchema = resultSchema 110 | break 111 | } 112 | } 113 | } 114 | if (field isEmpty) 115 | field = schema.wildcard(node.attribute) 116 | (field, path, fieldSchema) 117 | } 118 | 119 | def getPath(field: Field, 120 | virtual: Boolean = false): (ListBuffer[AvroPath], Schema) = { 121 | val path = ListBuffer[AvroPath]() 122 | val name = field name() 123 | if (field isArray) { 124 | if (field.arrayItemType == RECORD) { 125 | path += AvroPath(name, ARRAY, schemaPath ++ path.reverse, virtual) 
126 | return (path, field arraySchema) 127 | } else if (!field.isPrimitiveArray) 128 | warn(s"1 - Unknown type ${field arraySchema} for $name") 129 | } else if (field isRecord) 130 | path += AvroPath(name, RECORD, schemaPath ++ path.reverse, virtual) 131 | else if (!field.isPrimitive && !field.isMap) 132 | throw ConversionException(s"WARNING: 2 - Unknown type ${field.fieldType} for $name") 133 | (path, field fieldSchema) 134 | } 135 | 136 | def updatePath(field: Field, virtual: Boolean = false): Unit = { 137 | val name = field name() 138 | if (field isArray) { 139 | if (field.arrayItemType == RECORD) { 140 | schemaPath += AvroPath(name, ARRAY, schemaPath, virtual) 141 | lastSchema = field.arraySchema 142 | } else if (!field.isPrimitiveArray) 143 | warn(s"1 - Unknown type ${field.arraySchema} for $name") 144 | } else if (field isRecord) { 145 | schemaPath += AvroPath(name, RECORD, schemaPath, virtual) 146 | lastSchema = field.fieldSchema 147 | } else if (!field.isPrimitive && !field.isMap) 148 | throw ConversionException(s"WARNING: 2 - Unknown type ${field.fieldType} for $name") 149 | } 150 | } 151 | 152 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 
13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn () { 37 | echo "$*" 38 | } 39 | 40 | die () { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 
89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? -ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) 
set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /src/test/resources/old_books.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "BooksForm", 4 | "fields": [ 5 | { 6 | "name": "book", 7 | "type": { 8 | "type": "array", 9 | "items": { 10 | "type": "record", 11 | "name": "BookForm", 12 | "fields": [ 13 | { 14 | "name": "id", 15 | "type": [ 16 | "null", 17 | "string" 18 | ], 19 | "source": "attribute id" 20 | }, 21 | { 22 | "name": "others", 23 | "type": { 24 | "type": "map", 25 | "values": "string" 26 | } 27 | }, 28 | { 29 | "name": "author", 30 | "type": 
"string", 31 | "source": "element author" 32 | }, 33 | { 34 | "name": "title", 35 | "type": "string", 36 | "source": "element title" 37 | }, 38 | { 39 | "name": "genre", 40 | "type": "string", 41 | "source": "element genre" 42 | }, 43 | { 44 | "name": "price", 45 | "type": { 46 | "type": "array", 47 | "items": { 48 | "type": "record", 49 | "name": "PriceType", 50 | "fields": [ 51 | { 52 | "name": "currency", 53 | "type": [ 54 | "null", 55 | "string" 56 | ], 57 | "source": "attribute currency" 58 | }, 59 | { 60 | "name": "text_value", 61 | "type": [ 62 | "null", 63 | "double" 64 | ], 65 | "source": "element text_value" 66 | } 67 | ] 68 | } 69 | }, 70 | "source": "element price" 71 | }, 72 | { 73 | "name": "pub_date", 74 | "type": [ 75 | "null", 76 | "long" 77 | ], 78 | "source": "element pub_date", 79 | "comment": "timestamp" 80 | }, 81 | { 82 | "name": "type0", 83 | "type": { 84 | "type": "array", 85 | "items": { 86 | "type": "record", 87 | "name": "type1", 88 | "fields": [ 89 | { 90 | "name": "review", 91 | "type": [ 92 | "null", 93 | { 94 | "type": "record", 95 | "name": "ReviewType", 96 | "fields": [ 97 | { 98 | "name": "title", 99 | "type": "string", 100 | "source": "element title" 101 | }, 102 | { 103 | "name": "content", 104 | "type": [ 105 | "null", 106 | "string" 107 | ], 108 | "source": "element content" 109 | } 110 | ] 111 | } 112 | ], 113 | "source": "element review" 114 | }, 115 | { 116 | "name": "alias", 117 | "type": [ 118 | "null", 119 | { 120 | "type": "record", 121 | "name": "AliasType", 122 | "fields": [ 123 | { 124 | "name": "title", 125 | "type": "string", 126 | "source": "element title" 127 | }, 128 | { 129 | "name": "language", 130 | "type": { 131 | "type": "array", 132 | "items": "string" 133 | }, 134 | "source": "element language" 135 | } 136 | ] 137 | } 138 | ], 139 | "source": "element alias" 140 | }, 141 | { 142 | "name": "website", 143 | "type": [ 144 | "null", 145 | { 146 | "type": "record", 147 | "name": "WebsiteType", 148 | "fields": 
[ 149 | { 150 | "name": "url", 151 | "type": { 152 | "type": "array", 153 | "items": "string" 154 | }, 155 | "source": "element url" 156 | } 157 | ] 158 | } 159 | ], 160 | "source": "element website" 161 | }, 162 | { 163 | "name": "sold", 164 | "type": [ 165 | "null", 166 | "string" 167 | ], 168 | "source": "element sold" 169 | } 170 | ] 171 | } 172 | } 173 | } 174 | ] 175 | } 176 | }, 177 | "source": "element book" 178 | } 179 | ] 180 | } 181 | -------------------------------------------------------------------------------- /src/test/resources/new_books.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "BooksForm", 4 | "fields": [ 5 | { 6 | "name": "book", 7 | "type": { 8 | "type": "array", 9 | "items": { 10 | "type": "record", 11 | "name": "BookForm", 12 | "fields": [ 13 | { 14 | "name": "id", 15 | "type": [ 16 | "null", 17 | "string" 18 | ], 19 | "source": "attribute id" 20 | }, 21 | { 22 | "name": "others", 23 | "type": { 24 | "type": "map", 25 | "values": "string" 26 | } 27 | }, 28 | { 29 | "name": "author", 30 | "type": "string", 31 | "source": "element author" 32 | }, 33 | { 34 | "name": "title", 35 | "type": "string", 36 | "source": "element title" 37 | }, 38 | { 39 | "name": "genre", 40 | "type": "string", 41 | "source": "element genre" 42 | }, 43 | { 44 | "name": "price", 45 | "type": [ 46 | "null", 47 | { 48 | "type": "array", 49 | "items": { 50 | "type": "record", 51 | "name": "PriceType", 52 | "fields": [ 53 | { 54 | "name": "currency", 55 | "type": [ 56 | "null", 57 | "string" 58 | ], 59 | "source": "attribute currency" 60 | }, 61 | { 62 | "name": "text_value", 63 | "type": [ 64 | "null", 65 | "double" 66 | ], 67 | "source": "element text_value" 68 | } 69 | ] 70 | } 71 | } 72 | ], 73 | "source": "element price" 74 | }, 75 | { 76 | "name": "pub_date", 77 | "type": [ 78 | "null", 79 | "long" 80 | ], 81 | "source": "element pub_date", 82 | "comment": "timestamp" 83 | }, 84 | { 
85 | "name": "review", 86 | "type": [ 87 | "null", 88 | { 89 | "type": "array", 90 | "items": { 91 | "type": "record", 92 | "name": "ReviewType", 93 | "fields": [ 94 | { 95 | "name": "title", 96 | "type": "string", 97 | "source": "element title" 98 | }, 99 | { 100 | "name": "content", 101 | "type": [ 102 | "null", 103 | "string" 104 | ], 105 | "source": "element content" 106 | } 107 | ] 108 | } 109 | } 110 | ], 111 | "source": "element review" 112 | }, 113 | { 114 | "name": "type0", 115 | "type": { 116 | "type": "array", 117 | "items": { 118 | "type": "record", 119 | "name": "type1", 120 | "fields": [ 121 | { 122 | "name": "alias", 123 | "type": { 124 | "type": "record", 125 | "name": "AliasType", 126 | "fields": [ 127 | { 128 | "name": "title", 129 | "type": "string", 130 | "source": "element title" 131 | }, 132 | { 133 | "name": "language", 134 | "type": [ 135 | "null", 136 | { 137 | "type": "array", 138 | "items": "string" 139 | } 140 | ], 141 | "source": "element language" 142 | } 143 | ] 144 | }, 145 | "source": "element alias" 146 | }, 147 | { 148 | "name": "website", 149 | "type": { 150 | "type": "record", 151 | "name": "WebsiteType", 152 | "fields": [ 153 | { 154 | "name": "url", 155 | "type": [ 156 | "null", 157 | { 158 | "type": "array", 159 | "items": "string" 160 | } 161 | ], 162 | "source": "element url" 163 | } 164 | ] 165 | }, 166 | "source": "element website" 167 | } 168 | ] 169 | } 170 | } 171 | }, 172 | { 173 | "name": "sold", 174 | "type": [ 175 | "null", 176 | { 177 | "type": "array", 178 | "items": "string" 179 | } 180 | ], 181 | "source": "element sold" 182 | } 183 | ] 184 | } 185 | }, 186 | "source": "element book" 187 | } 188 | ] 189 | } 190 | -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/XMLDocument.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro 2 | 3 | import java.io.{IOException, PipedReader, 
PipedWriter, PrintWriter} 4 | import javax.xml.XMLConstants 5 | import javax.xml.stream.events.XMLEvent 6 | import javax.xml.stream.{XMLEventFactory, XMLEventWriter, XMLOutputFactory} 7 | import javax.xml.transform.stream.StreamSource 8 | import javax.xml.validation.{Schema, SchemaFactory} 9 | 10 | import in.dreamlabs.xmlavro.Utils.{info, log, warn} 11 | import in.dreamlabs.xmlavro.config.XMLConfig 12 | import org.xml.sax.SAXParseException 13 | 14 | import scala.collection.mutable 15 | import scala.reflect.io.{File, Path} 16 | 17 | /** 18 | * Created by Royce on 06/03/2017. 19 | */ 20 | class XMLDocument(val id: Int, val uniqueKey: Option[String], config: XMLConfig) { 21 | private val events = mutable.ListBuffer[XMLEvent]() 22 | @volatile var error = false 23 | private var exceptionList: mutable.ListBuffer[Exception] = 24 | mutable.ListBuffer() 25 | private var pipeIn: PipedReader = _ 26 | private var pipeOut: PipedWriter = _ 27 | private var eventOut: XMLEventWriter = _ 28 | private var errorDataFile, errorMetaFile: File = _ 29 | private val locker: AnyRef = new AnyRef 30 | val docText = s"document #$id${ 31 | if (uniqueKey.isDefined) 32 | s" with Unique ID: \'${uniqueKey.get.toString}\'" 33 | else "" 34 | }" 35 | 36 | info("Processing " + docText) 37 | 38 | if (config.errorFile isDefined) { 39 | val filePath = config.errorFile.get 40 | val fileName = filePath.stripExtension 41 | val fileSuffix = if (uniqueKey isDefined) s"${id}__${uniqueKey.get}" else s"$id" 42 | val parent = filePath.parent 43 | errorDataFile = Path(s"${fileName}__$fileSuffix") 44 | .toAbsoluteWithRoot(parent) 45 | .addExtension("xml") 46 | .toFile 47 | errorMetaFile = Path(s"${fileName}__$fileSuffix") 48 | .toAbsoluteWithRoot(parent) 49 | .addExtension("MD") 50 | .toFile 51 | } 52 | 53 | private var validationThread = if (config.validationXSD isDefined) { 54 | pipeIn = new PipedReader() 55 | pipeOut = new PipedWriter(pipeIn) 56 | eventOut = 
XMLOutputFactory.newInstance().createXMLEventWriter(pipeOut) 57 | Option(new Thread { 58 | override def run(): Unit = { 59 | val validator = XMLDocument.schema.newValidator() 60 | try validator.validate(new StreamSource(pipeIn)) 61 | catch { 62 | case e: SAXParseException => 63 | val message = s"XSD validation failed - Line: ${e.getLineNumber}, Column: ${e.getColumnNumber}, Message: ${e.getMessage}" 64 | fail(ConversionException(message)) 65 | case e: Exception => 66 | warn("Exception in thread: " + e.getMessage) 67 | fail(e) 68 | } finally { 69 | pipeIn.close() 70 | info(s"Finished xsd validation on " + docText) 71 | } 72 | } 73 | }) 74 | } else None 75 | 76 | if (validationThread isDefined) validationThread.get.start() 77 | 78 | def add(event: XMLEvent): Unit = locker.synchronized { 79 | if (config.errorFile isDefined) events += event 80 | if (validationThread.isDefined && !error) eventOut.add(event) 81 | } 82 | 83 | def fail(exception: Exception, wait: Boolean = false): Unit = { 84 | if (wait) { 85 | var thread: Thread = null 86 | validationThread.synchronized { 87 | if (validationThread.isDefined) 88 | thread = validationThread.get 89 | } 90 | if (Option(thread) isDefined) 91 | thread.join(2000) 92 | } 93 | locker.synchronized { 94 | error = true 95 | exceptionList += exception 96 | validationThread.synchronized { 97 | if (validationThread isDefined) validationThread = None 98 | } 99 | } 100 | } 101 | 102 | def close(): Unit = this.synchronized { 103 | if (error) { 104 | val reasons = { 105 | val builder = StringBuilder.newBuilder 106 | exceptionList.foreach(exc => 107 | builder.append(exc.getMessage).append(", ")) 108 | builder.mkString.stripSuffix(", ") 109 | } 110 | log(config.docErrorLevel, 111 | s"Failed processing $docText with reason '$reasons'") 112 | if (config.errorFile.isDefined) { 113 | info( 114 | s"Saving the failed $docText in '$errorDataFile' with message in '$errorMetaFile'") 115 | val dataOut = XMLOutputFactory 116 | .newInstance() 117 | 
.createXMLEventWriter(errorDataFile.bufferedWriter()) 118 | events += XMLEventFactory.newInstance().createSpace("\n") 119 | events.foreach(dataOut.add) 120 | dataOut.flush() 121 | dataOut.close() 122 | val metaOut = new PrintWriter(errorMetaFile.bufferedWriter()) 123 | metaOut.write(reasons) 124 | metaOut.flush() 125 | metaOut.close() 126 | } 127 | } 128 | 129 | var thread: Thread = null 130 | validationThread.synchronized { 131 | if (validationThread isDefined) thread = validationThread.get 132 | } 133 | if (Option(thread) isDefined) { 134 | try { 135 | eventOut.flush() 136 | pipeOut.flush() 137 | eventOut.close() 138 | pipeOut.close() 139 | info(s"Waiting for xsd validation of $docText to finish") 140 | thread.join(5000) 141 | if (thread.isAlive) { 142 | warn( 143 | s"Schema validation timed out for $docText, ignoring and proceeding further") 144 | pipeIn.close() 145 | } 146 | } catch { 147 | case e: Exception => 148 | warn( 149 | s"Failed to close pipes for $docText with message '${e.getMessage}', ignoring and proceeding further") 150 | } 151 | } 152 | info(s"Closed document #$id") 153 | } 154 | } 155 | 156 | object XMLDocument { 157 | private var schema: Schema = _ 158 | private var count: Int = 0 159 | var config: XMLConfig = _ 160 | 161 | def apply(uniqueKey: Option[String]): XMLDocument = { 162 | if (count == 0 && config.errorFile.isDefined) { 163 | config.errorFile.get.delete() 164 | } 165 | count += 1 166 | if (Option(schema).isEmpty && config.validationXSD.isDefined) 167 | schema = SchemaFactory 168 | .newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) 169 | .newSchema(config.validationXSD.get.jfile) 170 | new XMLDocument(count, uniqueKey, config) 171 | } 172 | 173 | def closeAll(): Unit = { 174 | if (config.qaDir.isDefined) { 175 | val qaDir = config.qaDir.get 176 | if (!qaDir.exists) 177 | qaDir.jfile.mkdir() 178 | try { 179 | val docCountOut = Path("DOCUMENT_COUNT") 180 | .toAbsoluteWithRoot(qaDir) 181 | .toFile 182 | .bufferedWriter() 183 | 
docCountOut.write(count + "") 184 | docCountOut.close() 185 | } catch { 186 | case e: IOException => 187 | warn("Problem occurred while writing DOCUMENT_COUNT to QA DIR :" + e.getMessage) 188 | } 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/RichAvro.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro 2 | 3 | import java.util 4 | 5 | import in.dreamlabs.xmlavro.RichAvro.{caseSensitive, ignoreCaseFor} 6 | import org.apache.avro.Schema 7 | import org.apache.avro.Schema.Type._ 8 | import org.apache.avro.Schema.{Field, Type} 9 | import org.apache.avro.generic.GenericData 10 | import org.apache.avro.generic.GenericData.Record 11 | 12 | import scala.collection.JavaConverters._ 13 | import scala.collection.mutable 14 | import scala.util.control.Breaks._ 15 | 16 | /** 17 | * Created by Royce on 26/01/2017. 18 | */ 19 | trait RichAvro { 20 | 21 | implicit class RichRecord(record: Record) { 22 | def at(path: List[AvroPath]): Record = { 23 | var resultRecord = record 24 | path.foreach { path => 25 | if (path.pathType == ARRAY) { 26 | var array = 27 | resultRecord.get(path name).asInstanceOf[util.List[AnyRef]] 28 | if (array == null || array.size() - 1 < path.index) { 29 | val arraySchema = 30 | resultRecord.getSchema.getField(path name).arraySchema 31 | if (array == null) { 32 | array = new util.ArrayList[AnyRef]() 33 | resultRecord.put(path name, array) 34 | } 35 | resultRecord = arraySchema.newRecord 36 | array.add(resultRecord) 37 | } else 38 | resultRecord = array.get(path index).asInstanceOf[Record] 39 | } else { 40 | val tempSchema = 41 | resultRecord.getSchema.getField(path name).fieldSchema 42 | var tempRecord = resultRecord.get(path name).asInstanceOf[Record] 43 | if (tempRecord == null) { 44 | tempRecord = tempSchema.newRecord 45 | resultRecord.put(path name, tempRecord) 46 | } 47 | resultRecord = 
tempRecord 48 | } 49 | } 50 | resultRecord 51 | } 52 | 53 | def add(node: XNode, value: String): Unit = { 54 | val schema = record.getSchema 55 | var fieldOp = schema field node 56 | var wildcard = false 57 | //TODO Handle wildcard data properly 58 | 59 | if (fieldOp isEmpty) { 60 | fieldOp = schema wildcard (node attribute) 61 | if (fieldOp isDefined) wildcard = true 62 | else if (value.trim() != "" && node.name!="nil") 63 | AvroPath.missing(XMLEvents.eleStack, node) 64 | } 65 | if (fieldOp isDefined) { 66 | val field = fieldOp.get 67 | 68 | if (wildcard) { 69 | val wildField = 70 | record.get(field name).asInstanceOf[util.Map[String, AnyRef]] 71 | val existingVal = wildField.get(node name) 72 | if (Option(existingVal) isEmpty) 73 | wildField.put(node name, value) 74 | else { 75 | existingVal match { 76 | case existingList: util.ArrayList[AnyRef] => 77 | existingList.add(value) 78 | case _ => 79 | val list = new util.ArrayList[AnyRef]() 80 | list.add(existingVal) 81 | list.add(value) 82 | wildField.put(node name, list) 83 | } 84 | 85 | } 86 | 87 | } else { 88 | if (field isArray) { 89 | var array = record.get(field name).asInstanceOf[util.List[AnyRef]] 90 | if (array == null) { 91 | array = new util.ArrayList[AnyRef]() 92 | record.put(field name, array) 93 | } 94 | array.add(AvroUtils.createValue(field arrayItemType, value)) 95 | } else if (field.fieldType == STRING) { 96 | val currentValue = record.get(field name) 97 | if (currentValue != null) record.put(field name, s"$currentValue$value") 98 | else record put(field name, value) 99 | } else 100 | record.put(field name, 101 | AvroUtils.createValue(field fieldType, value)) 102 | } 103 | } 104 | } 105 | } 106 | 107 | implicit class RichSchema(schema: Schema) { 108 | val PRIMITIVES: List[Type] = 109 | List(STRING, INT, LONG, FLOAT, DOUBLE, BOOLEAN, NULL) 110 | 111 | def wildcard(attribute: Boolean): Option[Field] = 112 | Option(schema.simplify.getField(XNode.WILDCARD)) 113 | 114 | def field(node: XNode): 
Option[Field] = { 115 | var resultField: Option[Field] = None 116 | val tempSchema = schema.simplify 117 | breakable { 118 | tempSchema.getFields.forEach { field => 119 | val sourceField = field.getProp(XNode.SOURCE) 120 | if (Option(sourceField).isEmpty && field.name == XNode.WILDCARD) 121 | break 122 | else if (Option(sourceField).isEmpty && field.name.matches("type\\d+")){ 123 | // Do nothing 124 | } 125 | else if (node sourceMatches(sourceField, caseSensitive, ignoreCaseFor)) { 126 | resultField = Some(field) 127 | break 128 | } 129 | } 130 | } 131 | if (resultField isEmpty) 132 | resultField = Option(tempSchema.getField(XNode.TEXT_VALUE)) 133 | resultField 134 | } 135 | 136 | def simplify: Schema = 137 | if (schema.getType == UNION) schema.getTypes.get(1) else schema 138 | 139 | def customTypeFields(): mutable.Buffer[Field] = 140 | schema.simplify.getFields.asScala.filter(_.name.matches("type\\d+")) 141 | 142 | def deepSchema: Schema = schema getType match { 143 | case UNION => schema.getTypes.get(1) 144 | case ARRAY => 145 | val itemType = schema getElementType() 146 | if (itemType.getType == UNION) 147 | itemType.getTypes.get(1) 148 | else 149 | itemType 150 | case _ => schema 151 | } 152 | 153 | def isArray: Boolean = schema.getType == ARRAY 154 | 155 | def isRecord: Boolean = schema.getType == RECORD 156 | 157 | def isMap: Boolean = schema.getType == MAP 158 | 159 | def isPrimitive: Boolean = PRIMITIVES.contains(schemaType) 160 | 161 | def schemaType: Type = schema.getType 162 | 163 | def arraySchema: Schema = schema.getElementType 164 | 165 | def isPrimitiveArray: Boolean = PRIMITIVES contains arrayItemType 166 | 167 | def arrayItemType: Type = schema.getElementType.getType 168 | 169 | def newRecord: Record = { 170 | val record = new GenericData.Record(schema) 171 | for (field <- record.getSchema.getFields.asScala) { 172 | if (field isArray) 173 | record.put(field.name, new util.ArrayList[AnyRef]()) 174 | if (field.name == XNode.WILDCARD) 175 | 
record.put(field.name, new util.HashMap[String, AnyRef]()) 176 | } 177 | record 178 | } 179 | 180 | } 181 | 182 | implicit class RichField(field: Field) { 183 | 184 | def fieldType: Type = fieldSchema.getType 185 | 186 | def isArray: Boolean = fieldSchema.isArray 187 | 188 | def fieldSchema: Schema = field.schema().simplify 189 | 190 | def isRecord: Boolean = fieldSchema.isRecord 191 | 192 | def isMap: Boolean = fieldSchema.isMap 193 | 194 | def isPrimitive: Boolean = fieldSchema.isPrimitive 195 | 196 | def isWildcard: Boolean = 197 | if (field.name() == XNode.WILDCARD && field.isMap && Option(field.getProp(XNode.SOURCE)).isEmpty) true else false 198 | 199 | def arraySchema: Schema = fieldSchema.arraySchema 200 | 201 | def arrayItemType: Type = fieldSchema.arrayItemType 202 | 203 | def isPrimitiveArray: Boolean = fieldSchema.isPrimitiveArray 204 | 205 | } 206 | 207 | } 208 | 209 | object RichAvro extends RichAvro { 210 | var ignoreMissing = false 211 | var caseSensitive = true 212 | var ignoreCaseFor: List[String] = _ 213 | var suppressWarnings = false 214 | } 215 | -------------------------------------------------------------------------------- /src/test/resources/new_books2.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "BooksForm", 4 | "fields": [ 5 | { 6 | "name": "book", 7 | "type": { 8 | "type": "array", 9 | "items": { 10 | "type": "record", 11 | "name": "BookForm", 12 | "fields": [ 13 | { 14 | "name": "id", 15 | "type": [ 16 | "null", 17 | "string" 18 | ], 19 | "source": "attribute id" 20 | }, 21 | { 22 | "name": "others", 23 | "type": { 24 | "type": "map", 25 | "values": "string" 26 | } 27 | }, 28 | { 29 | "name": "author", 30 | "type": "string", 31 | "source": "element author" 32 | }, 33 | { 34 | "name": "title", 35 | "type": "string", 36 | "source": "element title" 37 | }, 38 | { 39 | "name": "genre", 40 | "type": "string", 41 | "source": "element genre" 42 | }, 43 | { 44 | 
"name": "price", 45 | "type": [ 46 | "null", 47 | { 48 | "type": "array", 49 | "items": { 50 | "type": "record", 51 | "name": "PriceType", 52 | "fields": [ 53 | { 54 | "name": "currency", 55 | "type": [ 56 | "null", 57 | "string" 58 | ], 59 | "source": "attribute currency" 60 | }, 61 | { 62 | "name": "text_value", 63 | "type": [ 64 | "null", 65 | "double" 66 | ], 67 | "source": "element text_value" 68 | } 69 | ] 70 | } 71 | } 72 | ], 73 | "source": "element price" 74 | }, 75 | { 76 | "name": "pub_date", 77 | "type": [ 78 | "null", 79 | "long" 80 | ], 81 | "source": "element pub_date", 82 | "comment": "timestamp" 83 | }, 84 | { 85 | "name": "review", 86 | "type": [ 87 | "null", 88 | { 89 | "type": "array", 90 | "items": { 91 | "type": "record", 92 | "name": "ReviewType", 93 | "fields": [ 94 | { 95 | "name": "title", 96 | "type": "string", 97 | "source": "element title" 98 | }, 99 | { 100 | "name": "content", 101 | "type": [ 102 | "null", 103 | "string" 104 | ], 105 | "source": "element content" 106 | } 107 | ] 108 | } 109 | } 110 | ], 111 | "source": "element review" 112 | }, 113 | { 114 | "name": "type0", 115 | "type": { 116 | "type": "array", 117 | "items": { 118 | "type": "record", 119 | "name": "type1", 120 | "fields": [ 121 | { 122 | "name": "alias", 123 | "type": { 124 | "type": "record", 125 | "name": "AliasType", 126 | "fields": [ 127 | { 128 | "name": "title", 129 | "type": "string", 130 | "source": "element title" 131 | }, 132 | { 133 | "name": "language", 134 | "type": [ 135 | "null", 136 | { 137 | "type": "array", 138 | "items": "string" 139 | } 140 | ], 141 | "source": "element language" 142 | } 143 | ] 144 | }, 145 | "source": "element alias" 146 | }, 147 | { 148 | "name": "website", 149 | "type": { 150 | "type": "record", 151 | "name": "WebsiteType", 152 | "fields": [ 153 | { 154 | "name": "url", 155 | "type": [ 156 | "null", 157 | { 158 | "type": "array", 159 | "items": "string" 160 | } 161 | ], 162 | "source": "element url" 163 | } 164 | ] 165 | }, 
166 | "source": "element website" 167 | } 168 | ] 169 | } 170 | } 171 | }, 172 | { 173 | "name": "type2", 174 | "type": { 175 | "type": "array", 176 | "items": { 177 | "type": "record", 178 | "name": "type3", 179 | "fields": [ 180 | { 181 | "name": "type4", 182 | "type": { 183 | "type": "array", 184 | "items": { 185 | "type": "record", 186 | "name": "type5", 187 | "fields": [ 188 | { 189 | "name": "alias2", 190 | "type": "AliasType", 191 | "source": "element alias2" 192 | }, 193 | { 194 | "name": "website2", 195 | "type": "WebsiteType", 196 | "source": "element website2" 197 | } 198 | ] 199 | } 200 | } 201 | } 202 | ] 203 | } 204 | } 205 | }, 206 | { 207 | "name": "type6", 208 | "type": { 209 | "type": "record", 210 | "name": "type7", 211 | "fields": [ 212 | { 213 | "name": "alias3", 214 | "type": "string", 215 | "source": "element alias3" 216 | } 217 | ] 218 | } 219 | }, 220 | { 221 | "name": "sold", 222 | "type": [ 223 | "null", 224 | { 225 | "type": "array", 226 | "items": "string" 227 | } 228 | ], 229 | "source": "element sold" 230 | } 231 | ] 232 | } 233 | }, 234 | "source": "element book" 235 | } 236 | ] 237 | } 238 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XSD => Avsc & XML => Avro 2 | No longer maintained actively 3 | 4 | **** 5 | This project was initially a fork of [xml-avro-elodina](https://github.com/elodina/xml-avro). 6 | Later evolved to separate project with lotsss of bug fixes, memory & performance improvements, options, re-coded in Scala 7 | **** 8 | - Converts any XSD to a proper usable Avro schema (Avsc) 9 | - Converts any XML to avro using the provided schema. What can it do? See the list below. 
10 | - Handle any large size XML (even in GigaBytes), as it streams the xml 11 | - Read xml from stdin and output to stdout 12 | - Validate the XML with XSD 13 | - Split the data at any specified element (can have any no.of splits) 14 | - Handle multiple documents in single file (useful when streaming continuous data) 15 | - Write out failed documents without killing the whole process 16 | - Completely configurable 17 | 18 | ### Running Project 19 | 1. `git clone` to clone the repository to local 20 | 2. `gradle build` to generate the jar file 21 | 3. `java -jar ./build/libs/xml-avro-all-<version>.jar -c <config file>` to run the code (options as below) 22 | 23 | Check `./example/config.yml` for sample configuration file 24 | 25 | ### Config File 26 | Create yml config file as per the below format 27 | ``` 28 | debug: false # Enable printing of debug messages 29 | baseDir: "files" # Base directory where most files are relative to 30 | namespaces: true # Enable/Disable usage of namespaces in schema/conversion - Optional (default: true) 31 | 32 | XML: # Convert XML 33 | xmlInput: stdin # Source of XML [ stdin | "somefile.xml" ] 34 | avscFile: "books.avsc" # Avsc file to use for conversion - (If not using splits) 35 | avroOutput: stdout # Target location [ stdout | "somefile.avro" ] - Optional (Uses the xmlInput to assume the output) (If not using splits) 36 | documentRootTag: "books" # Root tag of the XML (without namespace) 37 | validationXSD: "books.xsd" # Enable validation with specified xsd 38 | ignoreMissing: true # In case you use a smaller version of avsc (to take only required tags), 39 | # tags in the xml may not exist in the trimmed avsc..
40 | # This option enables to ignore the missing tags instead of failing 41 | suppressWarnings: true # In case of a lot of missing fields don't print them as warnings 42 | split: # Split the avro records based on specified list 43 | - 44 | by: "bookName" # Split tag name 45 | avscFile: "name.avsc" # Avsc File for the split part 46 | avroFile: "name.avro" # Avro file name to save to 47 | - 48 | by: "bookPublisher" 49 | avscFile: "publisher.avsc" 50 | avroFile: "publisher.avro" 51 | qaDir: "some path" # Writes some count details 52 | caseSensitive: true # Tags matching xml & avsc are case sensitive - Optional (default: true) 53 | ignoreCaseFor: # Ignore case sensitivity for the below list 54 | - "SomeTag" 55 | docErrorLevel: "WARNING" # Use this level to log in case of error in a document 56 | errorFile: "failures.xml" # Writes the failed documents to this file 57 | useAvroInput: true # Read xml data from inside an avro file 58 | inputAvroMappings: # Set of mappings from source field name to target, use "xmlInput" as target to mark it as the xml data, use "unique_id" as target to mark the value as unique key 59 | "headers" : "avroHeader" 60 | "body" : "xmlInput" 61 | "headers.unique_id" : "unique_id" 62 | XSD: 63 | xsdFile: "somefile.xsd" # Source of XSD 64 | avscFile: "books.avsc" # Avsc file to save as - Optional (Uses the xsdFile to assume the output) 65 | stringTimestamp: true # Represent timestamp as string instead of long. Defaults to false. Setting this value to "true" overrides XSD.logicalTypes.xsDateTime to "string". 66 | attributePrefix: "_" # Optional, will assign the specified prefix for attributes in the avsc schema 67 | 68 | ignoreHiveKeywords: true # Do not suffix field name with `_value` when matching Hive keywords. Default value is false. 69 | rootElementQName: "{ns}name" # Only generate schema for root element matching this QName 70 | logicalTypes: 71 | xsDateTime: "long" # Configures the Avro mapping of xs:dateTime XML types.
[ long | string | timestamp-micros | timestamp-millis ] 72 | # "long" (the default) maps xs:dateTime types to regular Avro "long". Same as the default mapping for xs:dateTime in older xml-avro versions. 73 | # "string" maps xs:dateTime types to Avro "string" 74 | # "timestamp-micros" maps xs:dateTime types to Avro "timestamp-micros" logical type annotating a "long". 75 | # "timestamp-millis" maps xs:dateTime types to Avro "timestamp-millis" logical type annotating a "long". 76 | # Note: Setting the stringTimestamp will override this config value to "string" for backward compatibility reasons. 77 | xsDate: "string" # Configures the Avro mapping of xs:date XML types. [ string | date ]. 78 | # "string" (the default) maps xs:date types to Avro "string" 79 | # "date" maps xs:date types to Avro "date" logical type annotating an "int". 80 | xsTime: "string" # Configures the Avro mapping of xs:time XML types. [ string | time-micros | time-millis ] 81 | # "string" (the default) maps xs:time types to Avro "string". 82 | # "time-micros" maps xs:time types to Avro "time-micros" logical type annotating a "long". 83 | # "time-millis" maps xs:time types to Avro "time-millis" logical type annotating a "long". 84 | # 85 | xsDecimal: # Configurations controlling the mapping of xs:decimal XML types 86 | # 87 | avroType: "decimal" # Configures the Avro type mapping of xs:Decimal derived xml types. 88 | # Possible values are: [ double | string | decimal ] 89 | # - "double" (the default) maps xs:decimal types to Avro "double". 90 | # - "string" maps xs:decimal types to Avro "string". 91 | # - "decimal" maps xs:decimal types to Avro "decimal" logical types annotating "bytes". 92 | # When using the "decimal" option, the mandatory precision and scale properties of the Avro 93 | # decimal type are picked up from any xs:totalDigits and xs:fractionDigits restriction facets, if any. 
94 | # In the absence of these restriction facets, the mapping will instead fall back to using a backup strategy defined 95 | # by a combination of the fallbackType, fallbackPrecision and fallbackScale configurations. 96 | # 97 | fallbackType: "string" # Configures a fallback type mapping for xs:decimal types with unrestricted precision and scale. (i.e. types without 98 | # declared xs:totalDigits and xs:fractionDigits restriction facets). This configuration is ignored, unless the 99 | # avroType setting is configured to "decimal". 100 | # The possible values are: [ string | double | decimal ] 101 | # All options are identical to those described under the avroType configuration, the only exception being 102 | # "decimal" that uses the fallbackPrecision and fallbackScale configurations as defaults for missing 103 | # precision and scale information. 104 | # 105 | fallbackPrecision: 5 # Configures the fallback precision for decimal types without declared xs:totalDigits 106 | # and restriction. Required when fallbackType is set to "decimal". 107 | # 108 | fallbackScale: 3 # Configures the fallback scale for decimal types without declared xs:fractionDigits restriction. 109 | # Required when fallbackType is set to "decimal". 110 | ``` 111 | 112 | ## Docker 113 | 114 | ### Build docker image 115 | 116 | 117 | ```sh 118 | docker build -t xml-avro:v1.8.2 --build-arg VERSION=1.8.2 . 
119 | ``` 120 | 121 | ### Run with docker 122 | 123 | ```sh 124 | docker run --rm -v $(pwd)/example:/app/example -v $(pwd)/example/config.yml:/app/config.yml xml-avro:latest 125 | ``` 126 | -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/Supporters.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro 2 | 3 | import java.util.{Calendar, TimeZone} 4 | import javax.xml.bind.DatatypeConverter 5 | 6 | import in.dreamlabs.xmlavro.RichAvro.{ignoreMissing, suppressWarnings} 7 | import in.dreamlabs.xmlavro.Utils._ 8 | import org.apache.avro.Schema.Type 9 | import AvroPath.countsMap 10 | import org.apache.avro.Schema.Type._ 11 | import org.apache.xerces.xni.XNIException 12 | import org.apache.xerces.xni.parser.{XMLErrorHandler, XMLParseException} 13 | import org.apache.xerces.xs.XSObject 14 | import org.w3c.dom.{DOMError, DOMErrorHandler} 15 | import org.xml.sax.{ErrorHandler, SAXParseException} 16 | 17 | import scala.collection.mutable 18 | import scala.collection.mutable.ListBuffer 19 | 20 | /** 21 | * Created by Royce on 20/01/2017. 
22 | */ 23 | /* Generic conversion failure; the auxiliary constructor wraps a bare cause with no message. */ case class ConversionException(message: String = null, cause: Throwable = null) 24 | extends RuntimeException(message, cause) { 25 | def this(cause: Throwable) = this(null, cause) 26 | } 27 | 28 | /* Records only the FIRST parse exception / DOM error reported while processing an XSD; callers invoke check() afterwards to rethrow it as a ConversionException. */ class XSDErrorHandler extends XMLErrorHandler with DOMErrorHandler { 29 | private var exception: Option[XMLParseException] = None 30 | private var error: Option[DOMError] = None 31 | 32 | @throws[XNIException] 33 | /* warning/error/fatalError below are identical: keep the first XMLParseException seen, ignore the rest. */ def warning(domain: String, 34 | key: String, 35 | exception: XMLParseException): Unit = 36 | if (this.exception isEmpty) this.exception = Option(exception) 37 | 38 | @throws[XNIException] 39 | def error(domain: String, key: String, exception: XMLParseException): Unit = 40 | if (this.exception isEmpty) this.exception = Option(exception) 41 | 42 | @throws[XNIException] 43 | def fatalError(domain: String, 44 | key: String, 45 | exception: XMLParseException): Unit = 46 | if (this.exception isEmpty) this.exception = Option(exception) 47 | 48 | /* DOMErrorHandler callback: record the first DOMError; returning false asks the processor not to continue. */ def handleError(error: DOMError): Boolean = { 49 | if (this.error isEmpty) this.error = Option(error) 50 | false 51 | } 52 | 53 | /* Rethrow the first recorded problem, preferring the parse exception; a DOMError is reported with its URI/line/column location, or via its related exception when present. */ def check(): Unit = { 54 | if (exception isDefined) throw new ConversionException(exception.get) 55 | if (error isDefined) { 56 | error.get.getRelatedException match { 57 | case cause: Throwable => throw new ConversionException(cause) 58 | case _ => 59 | } 60 | val locator = error.get.getLocation 61 | val location = "at:" + locator.getUri + ", line:" + locator.getLineNumber + ", char:" + locator.getColumnNumber 62 | throw ConversionException(location + " " + error.get.getMessage) 63 | } 64 | } 65 | } 66 | 67 | /* SAX ErrorHandler used during XML validation; warnings, errors and fatal errors are all funnelled into handle(), which fails the owning XMLDocument. */ class ValidationErrorHandler(var xml: XMLDocument) extends ErrorHandler { 68 | def warning(exception: SAXParseException): Unit = { 69 | handle(exception) 70 | } 71 | 72 | def error(exception: SAXParseException): Unit = { 73 | handle(exception) 74 | } 75 | 76 | def fatalError(exception: SAXParseException): Unit = { 77 | handle(exception) 78 | } 79 | 80 | private def
handle(exception: SAXParseException): Unit = xml.fail(exception) 81 | } 82 | 83 | case class XNode(name: String, 84 | nsURI: String, 85 | nsName: String, 86 | attribute: Boolean) { 87 | var parentNS: String = _ 88 | val element: Boolean = !attribute 89 | 90 | def sourceMatches(sourceTag: String, 91 | caseSensitive: Boolean, 92 | ignoreList: List[String]): Boolean = { 93 | val matches = 94 | if (caseSensitive) 95 | if (ignoreList contains sourceTag.toLowerCase) 96 | source.equalsIgnoreCase(sourceTag) || parentNSSource 97 | .equalsIgnoreCase(sourceTag) 98 | else 99 | source == sourceTag || parentNSSource == sourceTag 100 | else 101 | source.equalsIgnoreCase(sourceTag) || parentNSSource.equalsIgnoreCase( 102 | sourceTag) 103 | matches 104 | } 105 | 106 | def source: String = 107 | (if (attribute) "attribute" else "element") + s" ${fullName()}" 108 | 109 | def parentNSSource: String = 110 | (if (attribute) "attribute" else "element") + s" ${fullName(other = true)}" 111 | 112 | def fullName(other: Boolean = false): String = 113 | if (other) 114 | s"${if (option(parentNS) isDefined) parentNS + ":" else ""}$name" 115 | else 116 | s"${if (option(nsURI) isDefined) nsURI + ":" else ""}$name" 117 | 118 | override def toString: String = 119 | s"${if (option(nsName) isDefined) nsName + ":" else ""}$name" 120 | } 121 | 122 | object XNode { 123 | val SOURCE = "source" 124 | val DOCUMENT = "document" 125 | val WILDCARD = "others" 126 | val TEXT_VALUE = "text_value" 127 | var namespaces = true 128 | 129 | def apply(ele: XSObject, attribute: Boolean = false): XNode = 130 | new XNode(ele.getName, ele.getNamespace, null, attribute) 131 | 132 | def apply(parentNode: XNode, 133 | name: String, 134 | nsURI: String, 135 | nsName: String, 136 | attribute: Boolean): XNode = { 137 | val node = new XNode(name, nsURI, nsName, attribute) 138 | if (option(nsURI) isEmpty) 139 | if (option(parentNode.nsURI) isDefined) node.parentNS = parentNode.nsURI 140 | else node.parentNS = parentNode.parentNS 
141 | node 142 | } 143 | 144 | /* XNode representing an element's text content (field name "text_value"). */ def textNode: XNode = new XNode(TEXT_VALUE, null, null, attribute = false) 145 | 146 | /* XNode representing wildcard content (field name "others"), as element or attribute. */ def wildNode(attribute: Boolean): XNode = 147 | new XNode(WILDCARD, null, null, attribute) 148 | } 149 | 150 | /* One step of a path into an Avro record. Construction has a side effect: it registers this path occurrence in the shared AvroPath.countsMap (keyed by name + rendered currentPath), so repeated array items get increasing indices; destroy() undoes that. */ class AvroPath(val name: String, 151 | val pathType: Type, 152 | currentPath: ListBuffer[AvroPath], 153 | val virtual: Boolean = false) { 154 | 155 | /* Key into countsMap: this node's name followed by the string form of every path element in currentPath. */ private val innerName = { 156 | val builder = StringBuilder.newBuilder 157 | builder append s"$name" 158 | currentPath.foreach(path => 159 | builder append path.toString) 160 | builder.mkString 161 | } 162 | 163 | /* Zero-based occurrence index of this path; incrementing the shared counter happens here, at construction time. */ val index: Int = 164 | if (countsMap contains innerName) { 165 | var currentIndex = countsMap(innerName) 166 | currentIndex += 1 167 | countsMap += (innerName -> currentIndex) 168 | currentIndex 169 | } else { 170 | countsMap += (innerName -> 0) 171 | 0 172 | } 173 | 174 | /* Remove this path's contribution from the shared occurrence counter. */ def destroy(): Unit = { 175 | var currentIndex = countsMap(innerName) 176 | currentIndex -= 1 177 | countsMap += (innerName -> currentIndex) 178 | } 179 | 180 | override def toString: String = 181 | if (pathType == ARRAY) s"$name[$index]" else name 182 | } 183 | 184 | object AvroPath { 185 | /* Shared mutable state: per-path occurrence counters, plus the messages already warned about. NOTE(review): reset() clears only countsMap, not warnedNodes — confirm that is intentional. */ val countsMap: mutable.Map[String, Int] = mutable.Map[String, Int]() 186 | val warnedNodes: ListBuffer[String] = ListBuffer[String]() 187 | 188 | def apply(name: String, 189 | pathType: Type, 190 | currentPath: ListBuffer[AvroPath], 191 | virtual: Boolean = false) = 192 | new AvroPath(name, pathType, currentPath, virtual) 193 | 194 | def reset(): Unit = countsMap.clear() 195 | 196 | /* Report an XML node with no matching field in the Avro schema. When node is null, the head of eleStack is treated as the missing node and the rest as its path; names in ignoreList are never reported. */ def missing(eleStack: ListBuffer[XNode], node: XNode = null): Unit = { 197 | 198 | val builder = StringBuilder.newBuilder 199 | var missingStack = eleStack 200 | var missingNode = node 201 | if (Option(node) isEmpty) { 202 | missingStack = eleStack.tail 203 | missingNode = eleStack.head 204 | } 205 | 206 | val ignoreList = List("noNamespaceSchemaLocation") 207 | if (!ignoreList.contains(missingNode.name)) { 208 | missingStack.reverse.foreach(ele =>
builder append s"$ele/") 209 | builder.append(s"${if (missingNode attribute) "@" else ""}${missingNode name}") 210 | val fullNode = builder.mkString 211 | if (!warnedNodes.contains(fullNode)) { 212 | warnedNodes += fullNode 213 | val message = s"$fullNode is not found in Schema (even as a wildcard)" 214 | if (ignoreMissing && !suppressWarnings) 215 | warn(message) 216 | else if (!ignoreMissing) 217 | throw ConversionException(message) 218 | } 219 | } 220 | } 221 | 222 | def warning(eleStack: ListBuffer[XNode], message: String):Unit={ 223 | val builder = StringBuilder.newBuilder 224 | builder.append("In path ") 225 | eleStack.reverse.foreach(ele => builder append s"$ele/") 226 | builder.append(", ") 227 | builder.append(message) 228 | val finalMessage = builder.mkString 229 | if (!warnedNodes.contains(finalMessage)) { 230 | warnedNodes += finalMessage 231 | if (!suppressWarnings) 232 | warn(finalMessage) 233 | else 234 | throw ConversionException(finalMessage) 235 | } 236 | } 237 | } 238 | 239 | 240 | object AvroUtils { 241 | private val TIMESTAMP_PATTERN = 242 | "^(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.*\\d*)Z?$" 243 | 244 | var timeZone: TimeZone = TimeZone.getTimeZone("UTC-0") 245 | 246 | def createValue(nodeType: Type, content: String): AnyRef = { 247 | val result = nodeType match { 248 | case BOOLEAN => content.toLowerCase == "true" || content == "1" 249 | case INT => content.toInt 250 | case LONG => 251 | if (content contains "T") parseDateFrom(content trim) 252 | else content.toLong 253 | case FLOAT => content.toFloat 254 | case DOUBLE => content.toDouble 255 | case STRING => content 256 | case other => throw ConversionException(s"Unsupported type $other") 257 | } 258 | result.asInstanceOf[AnyRef] 259 | } 260 | 261 | private def parseDateFrom(text: String): Long = { 262 | var cal = DatatypeConverter.parseDateTime(text) 263 | if (text matches TIMESTAMP_PATTERN) 264 | cal.setTimeZone(timeZone) 265 | cal.getTimeInMillis 266 | //Local 267 | val tsp = 268 | 
if (!text.matches(TIMESTAMP_PATTERN)) text.substring(0, 19) 269 | else text 270 | cal = DatatypeConverter.parseDateTime(tsp) 271 | cal.setTimeZone(timeZone) 272 | cal.getTimeInMillis 273 | } 274 | } 275 | 276 | object Utils { 277 | var debugEnabled = false 278 | 279 | def option(text: String): Option[String] = { 280 | if (Option(text) isDefined) 281 | if (text.trim == "") None else Option(text) 282 | else None 283 | } 284 | 285 | def debug(text: String): Unit = if (debugEnabled) log("DEBUG", text) 286 | 287 | def info(text: String): Unit = log("INFO", text) 288 | 289 | def warn(text: String): Unit = log("WARNING", text) 290 | 291 | def log(level: String, text: String, duplicates:Boolean = true): Unit = { 292 | System.err.println(s"${Calendar.getInstance().getTime} ${level.toUpperCase}: $text") 293 | } 294 | 295 | def profile(tag: String)(op: => Unit): Unit = { 296 | val start = Calendar.getInstance().getTimeInMillis 297 | op 298 | val end = Calendar.getInstance().getTimeInMillis 299 | info(s"$tag took: ${(end - start) / 1000.0} seconds") 300 | } 301 | } 302 | -------------------------------------------------------------------------------- /src/main/python/avsc_fix.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from collections import OrderedDict 4 | 5 | import os 6 | 7 | 8 | class AvroSchema: 9 | def __init__(self, file_path): 10 | with open(file_path) as file_in: 11 | root = json.load(file_in) 12 | self._root_name = root.get('name') 13 | self._level = 0 14 | self._known_types_list = [] 15 | 16 | file_name = os.path.basename(file_path) 17 | self._root_prefix = file_name.split('.')[0] 18 | self._prefix = self._root_prefix 19 | self._base_fields = [] 20 | 21 | for node in root.get('fields'): 22 | self._base_fields.append(Node(node)) 23 | self._fields = self._base_fields 24 | 25 | # Recreate the schema splitting with specified element and save to new file 26 | def recreate_schema(self, 
split_by=None, new_file=None): 27 | self._known_types_list = [] 28 | if split_by: 29 | if split_by != self._root_name: 30 | search_res = self._search(self._base_fields, split_by) 31 | if search_res: 32 | search_res = search_res.content 33 | search_res.name = split_by 34 | self._fields = search_res 35 | else: 36 | split_by = None 37 | 38 | if not self._fields: 39 | print 'Split element {} not found'.format(split_by) 40 | exit(1) 41 | schema = self._generate_schema(self._fields) 42 | 43 | if not split_by: 44 | schema['source'] = 'document' 45 | if new_file: 46 | with open(new_file, 'w') as file_out: 47 | json.dump(schema, file_out, indent=2) 48 | return schema 49 | 50 | def _generate_schema(self, node): 51 | schema = OrderedDict() 52 | # Generate schema for list of nodes 53 | if type(node) is list: 54 | inner_schema = [] 55 | schema['type'] = 'record' 56 | schema['name'] = self._root_name 57 | for inner_node in node: 58 | inner_schema.append(self._generate_schema(inner_node)) 59 | schema['fields'] = inner_schema 60 | else: 61 | # Generate schema for primitive types 62 | if node.node_type == Node.primitive_type: 63 | schema['name'] = node.name 64 | if node.optional: 65 | schema['type'] = ['null', node.source_type] 66 | else: 67 | schema['type'] = node.source_type 68 | schema['source'] = node.source 69 | # Generate schema for complex types 70 | elif node.node_type == Node.complex_type: 71 | sql_type = node.sql_type 72 | inner_type = OrderedDict() 73 | # primitive complexes 74 | if type(node.content) is str: 75 | if sql_type == 'ARRAY': 76 | inner_type['type'] = 'array' 77 | if node.original_type: 78 | inner_type['items'] = node.original_type 79 | else: 80 | inner_type['items'] = node.content 81 | elif sql_type == 'MAP': 82 | inner_type['type'] = 'map' 83 | if node.original_type: 84 | inner_type['values'] = \ 85 | node.original_type.split(',')[ 86 | 1].strip() 87 | else: 88 | inner_type['values'] = node.content.split(',')[ 89 | 1].strip() 90 | else: 91 | 
inner_type['type'] = 'record' 92 | inner_type['fields'] = node.content 93 | schema['name'] = node.name 94 | if node.optional: 95 | schema['type'] = ['null', inner_type] 96 | else: 97 | schema['type'] = inner_type 98 | if node.name != 'others': 99 | schema['source'] = node.source 100 | # custom complexes 101 | else: 102 | # Array 103 | if sql_type == 'ARRAY': 104 | inner_type['type'] = 'array' 105 | inner_type['items'] = self._generate_schema( 106 | node.content) 107 | schema['name'] = node.name 108 | if node.optional: 109 | schema['type'] = ['null', inner_type] 110 | else: 111 | schema['type'] = inner_type 112 | schema['source'] = node.source 113 | # Map 114 | elif sql_type == 'MAP': 115 | inner_type['type'] = 'map' 116 | inner_type['values'] = self._generate_schema( 117 | node.content) 118 | schema['name'] = node.name 119 | if node.optional: 120 | schema['type'] = ['null', inner_type] 121 | else: 122 | schema['type'] = inner_type 123 | schema['source'] = node.source 124 | # Struct 125 | else: 126 | schema['name'] = node.name 127 | if node.optional: 128 | schema['type'] = ['null', self._generate_schema( 129 | node.content)] 130 | else: 131 | schema['type'] = self._generate_schema(node.content) 132 | schema['source'] = node.source 133 | # Generate schema for custom defined types 134 | else: 135 | if node.name not in self._known_types_list: 136 | schema = self._generate_schema(node.content) 137 | self._known_types_list.append(node.name) 138 | schema['name'] = node.name 139 | else: 140 | schema = node.name 141 | return schema 142 | 143 | def _search(self, node, key): 144 | if type(node) is list: 145 | for inner_node in node: 146 | search_res = self._search(inner_node, key) 147 | if search_res: 148 | break 149 | else: 150 | if node.node_type == Node.primitive_type: 151 | search_res = node if node.name == key else None 152 | elif node.node_type == Node.complex_type: 153 | # primitive complexes 154 | if type(node.content) is str: 155 | search_res = node if node.name == 
key else None 156 | # custom complexes 157 | else: 158 | if node.name == key: 159 | search_res = node 160 | else: 161 | search_res = self._search(node.content, key) 162 | else: 163 | search_res = self._search(node.content, key) 164 | return search_res 165 | 166 | 167 | class Node: 168 | primitive_type = 'PRIMITIVE' 169 | complex_type = 'COMPLEX' 170 | custom_type = 'CUSTOM' 171 | 172 | primitives_map = {'int': 'int', 'long': 'bigint', 'float': 'float', 173 | 'double': 'double', 'bytes': 'string', 174 | 'string': 'string', 'boolean': 'boolean'} 175 | type_dict = {} 176 | 177 | def __init__(self, node): 178 | self.sql_type = None 179 | self.content = None 180 | self.optional = False 181 | self.source = None 182 | self.comment = None 183 | self.original_type = None 184 | self.name = str(node.get('name')) 185 | node_type = node.get('type') 186 | 187 | # Parsing union - complex type and 188 | # take valid complex/primitive type in the union 189 | if type(node_type) is list: 190 | node_type = node_type[1] 191 | self.optional = True 192 | 193 | # Detect Primitives 194 | if node_type in Node.primitives_map.keys(): 195 | self.node_type = Node.primitive_type 196 | self.sql_type = Node.primitives_map[node_type] 197 | self.source = str(node.get('source')) 198 | if 'comment' in node: 199 | self.comment = node['comment'] 200 | self.source_type = node_type 201 | 202 | # Parse the inner record 203 | elif node_type == 'record': 204 | self.node_type = self.custom_type 205 | self.content = self._parse_list(node.get('fields')) 206 | self.source = str(node.get('source')) 207 | Node.type_dict[self.name] = self 208 | 209 | # Parse complex types 210 | else: 211 | self.node_type = self.complex_type 212 | self.sql_type, self.content = self._parse_complex_type(node_type) 213 | self.source = str(node.get('source')) 214 | 215 | # Parse a list of nodes 216 | @staticmethod 217 | def _parse_list(element_list): 218 | node_list = [] 219 | for node in element_list: 220 | node_list.append(Node(node)) 
221 | return node_list 222 | 223 | # Parse a complex datatype 224 | def _parse_complex_type(self, node_type): 225 | if type(node_type) is dict: 226 | temp_type = node_type.get('type') 227 | # Parse array complex type 228 | if temp_type == 'array': 229 | items = node_type.get('items') 230 | 231 | # Array of primitives 232 | if items in Node.primitives_map.keys(): 233 | self.original_type = items 234 | return 'ARRAY', Node.primitives_map[items] 235 | 236 | # Array of known custom types 237 | elif items in Node.type_dict.keys(): 238 | return 'ARRAY', Node.type_dict[items] 239 | 240 | # Array of new custom types 241 | else: 242 | return 'ARRAY', Node(items) 243 | 244 | # Parse map complex type 245 | elif temp_type == 'map': 246 | value_type = node_type.get('values') 247 | 248 | # Map of primitives 249 | if value_type in Node.primitives_map.keys(): 250 | self.original_type = '{}, {}'.format('STRING', value_type) 251 | return 'MAP', '{}, {}'.format('STRING', Node.primitives_map[ 252 | value_type]) 253 | 254 | # Map of custom types 255 | else: 256 | if value_type in Node.type_dict.keys(): 257 | value_type = self.type_dict[node_type] 258 | else: 259 | print '1 - {} type not found in the schema'.format( 260 | node_type) 261 | exit(1) 262 | return 'MAP', '{}, {}'.format('STRING', value_type) 263 | 264 | # Parse other struct types 265 | else: 266 | return 'STRUCT', Node(node_type) 267 | 268 | elif node_type in Node.type_dict.keys(): 269 | return 'STRUCT', self.type_dict[node_type] 270 | else: 271 | print '2 - {} type not found in the schema'.format(node_type) 272 | exit(1) 273 | 274 | def __repr__(self): 275 | optional = 'Optional' if self.optional else 'Mandatory' 276 | return '{}, {}, {}, {}, {}'.format(self.name, self.node_type, 277 | self.sql_type, optional, self.source) 278 | 279 | 280 | file_path = sys.argv[1] 281 | split_by = sys.argv[2] 282 | temp = AvroSchema(file_path) 283 | temp.recreate_schema(split_by=split_by, new_file=file_path) 284 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/main/scala/in/dreamlabs/xmlavro/config/Config.scala: -------------------------------------------------------------------------------- 1 | package in.dreamlabs.xmlavro.config 2 | 3 | import java.util 4 | 5 | import in.dreamlabs.xmlavro.ConversionException 6 | import in.dreamlabs.xmlavro.Utils.option 7 | import javax.xml.namespace.QName 8 | 9 | import scala.beans.BeanProperty 10 | import scala.collection.JavaConverters._ 11 | import scala.reflect.io.Path 12 | 13 | /** 14 | * Created by Royce on 01/02/2017. 
15 | */ 16 | class Config() { 17 | @BeanProperty var dynamic: Boolean = false 18 | @BeanProperty var dynamicSource: String = "" 19 | @BeanProperty var debug: Boolean = false 20 | var baseDir: Option[Path] = None 21 | @BeanProperty var namespaces: Boolean = true 22 | var XSD: Option[XSDConfig] = None 23 | var XML: Option[XMLConfig] = None 24 | 25 | def getBaseDir: String = if (baseDir isDefined) baseDir.get.path else null 26 | 27 | def setBaseDir(value: String): Unit = 28 | baseDir = Option(Path(value).toAbsolute) 29 | 30 | def getXSD: XSDConfig = XSD.orNull 31 | 32 | def setXSD(value: XSDConfig): Unit = XSD = Option(value) 33 | 34 | def getXML: XMLConfig = XML.orNull 35 | 36 | def setXML(value: XMLConfig): Unit = XML = Option(value) 37 | 38 | def validate(): Unit = { 39 | if (XSD isDefined) { 40 | XSD.get.namespaces = namespaces 41 | XSD.get.debug = debug 42 | XSD.get.baseDir = baseDir 43 | XSD.get.validate() 44 | } 45 | if (XML isDefined) { 46 | XML.get.namespaces = namespaces 47 | XML.get.debug = debug 48 | XML.get.baseDir = baseDir 49 | XML.get.validate(XSD) 50 | } 51 | } 52 | } 53 | 54 | class XSDConfig { 55 | var namespaces: Boolean = _ 56 | var debug: Boolean = _ 57 | var baseDir: Option[Path] = _ 58 | var xsdFile: Path = _ 59 | var avscFile: Path = _ 60 | 61 | @BeanProperty var logicalTypes: LogicalTypesConfig = _ 62 | @BeanProperty var rebuildChoice: Boolean = true 63 | @BeanProperty var stringTimestamp: Boolean = false 64 | @BeanProperty var ignoreHiveKeywords: Boolean = false 65 | @BeanProperty var rootElementQName: Option[QName] = None 66 | @BeanProperty var attributePrefix: String = "" 67 | 68 | def getXsdFile: String = xsdFile.path 69 | 70 | def setXsdFile(value: String): Unit = xsdFile = Path(value) 71 | 72 | def getAvscFile: String = avscFile.path 73 | 74 | def setAvscFile(value: String): Unit = avscFile = Path(value) 75 | 76 | def validate(): Unit = { 77 | if (baseDir.isDefined) { 78 | xsdFile = xsdFile toAbsoluteWithRoot baseDir.get 79 | if 
(Option(avscFile) isDefined) 80 | avscFile = avscFile toAbsoluteWithRoot baseDir.get 81 | else 82 | avscFile = xsdFile changeExtension "avsc" 83 | } 84 | logicalTypes = Option(logicalTypes) getOrElse new LogicalTypesConfig 85 | logicalTypes.validate() 86 | if (stringTimestamp) { 87 | logicalTypes.xsDateTime = LogicalType.STRING 88 | } 89 | } 90 | } 91 | 92 | object LogicalType { 93 | /** 94 | * Logical type "timestamp-millis" annotating a long type. 95 | */ 96 | val TIMESTAMP_MILLIS = "timestamp-millis" 97 | /** 98 | * Logical type "timestamp-micros" annotating a long type. 99 | */ 100 | val TIMESTAMP_MICROS = "timestamp-micros" 101 | 102 | /** 103 | * Logical type "times-millis" annotating a long type. 104 | */ 105 | val TIME_MILLIS = "time-millis" 106 | 107 | /** 108 | * Logical type "times-micros" annotating a long type. 109 | */ 110 | val TIME_MICROS = "time-micros" 111 | 112 | /** 113 | * Logical type "date" annotating an int type. 114 | */ 115 | val DATE = "date" 116 | 117 | /** 118 | * Dummy logical type for handling values as string without indicating a logicalType. 119 | */ 120 | val STRING = "string" 121 | 122 | /** 123 | * Dummy logical type for handling values as long without indicating a logicalType. 
124 | */ 125 | val LONG = "long" 126 | 127 | } 128 | 129 | class LogicalTypesConfig { 130 | 131 | @BeanProperty 132 | var xsDateTime: String = LogicalType.LONG 133 | @BeanProperty 134 | var xsTime: String = LogicalType.STRING 135 | @BeanProperty 136 | var xsDate: String = LogicalType.STRING 137 | @BeanProperty 138 | var xsDecimal: XSDecimalConfig = new XSDecimalConfig 139 | 140 | def validate(): Unit = { 141 | xsDateTime = Option(xsDateTime) getOrElse "" 142 | xsDateTime match { 143 | case LogicalType.LONG 144 | | LogicalType.STRING 145 | | LogicalType.TIMESTAMP_MILLIS 146 | | LogicalType.TIMESTAMP_MICROS => /* accept */ 147 | case _ => 148 | throw new IllegalArgumentException("Invalid configuration for xs:dateTime logical type.") 149 | } 150 | 151 | xsTime = Option(xsTime) getOrElse "" 152 | xsTime match { 153 | case LogicalType.STRING 154 | | LogicalType.TIME_MILLIS 155 | | LogicalType.TIME_MICROS => /* accept */ 156 | case _ => 157 | throw new IllegalArgumentException("Invalid configuration for xs:time logical type.") 158 | } 159 | 160 | xsDate = Option(xsDate) getOrElse "" 161 | xsDate match { 162 | case LogicalType.STRING | LogicalType.DATE => /* accept */ 163 | case _ => 164 | throw new IllegalArgumentException("Invalid configuration for xs:date logical type.") 165 | } 166 | 167 | xsDecimal = Option(xsDecimal) getOrElse new XSDecimalConfig 168 | xsDecimal.validate() 169 | } 170 | 171 | } 172 | 173 | object XSDecimalConfigLogicalType { 174 | val DOUBLE = "double" 175 | 176 | val STRING = "string" 177 | 178 | val DECIMAL = "decimal" 179 | } 180 | 181 | 182 | class XSDecimalConfig { 183 | @BeanProperty 184 | var avroType = XSDecimalConfigLogicalType.DOUBLE 185 | @BeanProperty 186 | var fallbackType = XSDecimalConfigLogicalType.STRING 187 | @BeanProperty 188 | var fallbackPrecision : Integer = null 189 | @BeanProperty 190 | var fallbackScale : Integer = 0 191 | 192 | def validate(): Unit = { 193 | 194 | val acceptedAvroTypes = List( 195 | 
XSDecimalConfigLogicalType.DECIMAL, 196 | XSDecimalConfigLogicalType.DOUBLE, 197 | XSDecimalConfigLogicalType.STRING 198 | ) 199 | 200 | if (!acceptedAvroTypes.contains(avroType)) { 201 | throw new IllegalArgumentException(s"Invalid configuration value '$avroType' for xsDecimal avroType.") 202 | } 203 | 204 | if (!acceptedAvroTypes.contains(fallbackType)) { 205 | throw new IllegalArgumentException(s"Invalid configuration value '$fallbackType' for xsDecimal fallbackType.") 206 | } 207 | 208 | if (fallbackType == XSDecimalConfigLogicalType.DECIMAL) { 209 | if (Option(fallbackPrecision) isEmpty) { 210 | throw new IllegalArgumentException(s"Missing xsDecimal fallbackPrecision " + 211 | s"configuration for '$fallbackType' fallback type.") 212 | } 213 | if (Option(fallbackScale) isEmpty) { 214 | throw new IllegalArgumentException(s"Missing xsDecimal fallbackScale " + 215 | s"configuration for '$fallbackType' fallback type.") 216 | } 217 | if (fallbackPrecision <= 0) { 218 | throw new IllegalArgumentException(s"Invalid configuration value $fallbackPrecision for xsDecimal fallbackPrecision.") 219 | } 220 | if (fallbackScale <= 0 || fallbackScale > fallbackPrecision) { 221 | throw new IllegalArgumentException(s"Invalid configuration value $fallbackScale for xsDecimal fallbackScale.") 222 | } 223 | } 224 | 225 | } 226 | } 227 | 228 | class XMLConfig { 229 | var namespaces: Boolean = _ 230 | var debug: Boolean = _ 231 | var baseDir: Option[Path] = None 232 | var qaDir: Option[Path] = None 233 | var xmlFile: Path = _ 234 | var streamingInput, streamingOutput: Boolean = false 235 | var validationXSD: Option[Path] = None 236 | var splitBy: String = "" 237 | var avscFile: Path = _ 238 | var avroFile: Path = _ 239 | var errorFile: Option[Path] = None 240 | 241 | @BeanProperty var documentRootTag: String = _ 242 | @BeanProperty var ignoreMissing: Boolean = false 243 | @BeanProperty var suppressWarnings: Boolean = false 244 | @BeanProperty var xmlInput: String = _ 245 | 
@BeanProperty var avroOutput: String = _ 246 | @BeanProperty var docErrorLevel: String = "WARNING" 247 | @BeanProperty var split: util.List[AvroSplit] = 248 | new util.ArrayList[AvroSplit]() 249 | @BeanProperty var caseSensitive: Boolean = true 250 | @BeanProperty var ignoreCaseFor: util.List[String] = 251 | new util.ArrayList[String] 252 | 253 | @BeanProperty var useAvroInput: Boolean = false 254 | var inputAvroMappings: Map[String, String] = _ 255 | var inputAvroKey: String = _ 256 | var inputAvroUniqueKey: Option[String] = None 257 | 258 | 259 | def getQaDir: String = if (qaDir isDefined) qaDir.get.path else null 260 | 261 | def setQaDir(value: String): Unit = qaDir = Option(Path(value)) 262 | 263 | def getValidationXSD: String = 264 | if (validationXSD isDefined) validationXSD.get.path else null 265 | 266 | def setValidationXSD(value: String): Unit = 267 | validationXSD = Option(Path(value)) 268 | 269 | def getErrorFile: String = 270 | if (errorFile isDefined) errorFile.get.path else null 271 | 272 | def setErrorFile(value: String): Unit = 273 | errorFile = Option(Path(value)) 274 | 275 | def getAvscFile: String = avscFile.path 276 | 277 | def setAvscFile(value: String): Unit = avscFile = Path(value) 278 | 279 | def getAvroFile: String = avroFile.path 280 | 281 | def setAvroFile(value: String): Unit = avroFile = Path(value) 282 | 283 | def getInputAvroMappings: util.Map[String, String] = 284 | if (Option(inputAvroMappings) isDefined) inputAvroMappings.asJava else null 285 | 286 | def setInputAvroMappings(value: util.Map[String, String]): Unit = 287 | inputAvroMappings = value.asScala.toMap 288 | 289 | def validate(xsdConfig: Option[XSDConfig]): Unit = { 290 | if (Option(xmlInput) isDefined) 291 | if (xmlInput == "stdin") { 292 | streamingInput = true 293 | if (Option(avroOutput).isEmpty || avroOutput == "stdout") 294 | streamingOutput = true 295 | else avroFile = Path(avroOutput) 296 | } else { 297 | xmlFile = Path(xmlInput) 298 | if (Option(avroOutput) 
isDefined) avroFile = Path(avroOutput) 299 | else avroFile = xmlFile changeExtension "avro" 300 | } else 301 | throw ConversionException("XML Input is not specified in the config") 302 | 303 | if (baseDir.isDefined && !streamingInput) 304 | xmlFile = xmlFile toAbsoluteWithRoot baseDir.get 305 | 306 | if (baseDir.isDefined && !streamingOutput) 307 | avroFile = avroFile toAbsoluteWithRoot baseDir.get 308 | 309 | if (Option(avscFile).isDefined) { 310 | if (baseDir.isDefined) 311 | avscFile = avscFile toAbsoluteWithRoot baseDir.get 312 | } else if (xsdConfig.isDefined) 313 | avscFile = xsdConfig.get.xsdFile changeExtension "avsc" 314 | 315 | if (baseDir.isDefined && validationXSD.isDefined) 316 | validationXSD = Option(validationXSD.get.toAbsoluteWithRoot(baseDir.get)) 317 | 318 | if (baseDir.isDefined && qaDir.isDefined) 319 | qaDir = Option(qaDir.get.toAbsoluteWithRoot(baseDir.get)) 320 | 321 | if (baseDir.isDefined && errorFile.isDefined) 322 | errorFile = Option(errorFile.get.toAbsoluteWithRoot(baseDir.get)) 323 | 324 | if (Option(documentRootTag) isEmpty) 325 | throw ConversionException("Document Root Tag is not specified in the config") 326 | 327 | if (option(splitBy) isEmpty) 328 | splitBy = documentRootTag 329 | 330 | if (split isEmpty) { 331 | val tempSplit = new AvroSplit 332 | tempSplit.avscFile = avscFile 333 | tempSplit.avroFile = avroFile 334 | tempSplit.stream = streamingOutput 335 | tempSplit.by = splitBy 336 | split.add(tempSplit) 337 | } 338 | 339 | split.forEach(item => item.validate(baseDir)) 340 | 341 | if (useAvroInput) { 342 | inputAvroMappings.foreach { 343 | case (key, value) => 344 | if (value == "xmlInput") inputAvroKey = key 345 | else if (value == "unique_id") inputAvroUniqueKey = Option(key) 346 | } 347 | 348 | if (Option(inputAvroKey) isEmpty) 349 | throw ConversionException("No xmlInput specified in inputAvroMappings") 350 | } 351 | } 352 | } 353 | 354 | class AvroSplit { 355 | @BeanProperty var by: String = "" 356 | var avscFile: Path = 
_
  var avroFile: Path = _
  var stream: Boolean = false

  def getAvscFile: String = avscFile.path

  def setAvscFile(value: String): Unit = avscFile = Path(value)

  def getAvroFile: String = avroFile.path

  def setAvroFile(value: String): Unit = avroFile = Path(value)

  /**
    * Validates this split entry and resolves its paths against baseDir.
    * FIX: the original constructed ConversionException values without
    * throwing them, so misconfigured splits passed validation silently.
    */
  def validate(baseDir: Option[Path]): Unit = {
    if (option(by) isEmpty)
      throw ConversionException("Split by is not specified in the config")

    if (Option(avroFile) isEmpty)
      throw ConversionException(
        s"Avro Output is not specified in the config for tag $by")
    else if (baseDir isDefined)
      avroFile = avroFile toAbsoluteWithRoot baseDir.get

    if (Option(avscFile) isEmpty)
      throw ConversionException(
        s"Avsc Schema is not specified in the config for tag $by")
    else if (baseDir isDefined)
      avscFile = avscFile toAbsoluteWithRoot baseDir.get
  }
}
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/AvroBuilder.scala:
--------------------------------------------------------------------------------
package in.dreamlabs.xmlavro

import java.io._
import java.nio.ByteBuffer
import java.util
import javax.xml.stream.XMLInputFactory
import javax.xml.stream.XMLStreamConstants._
import javax.xml.stream.events.{Attribute, EndElement, StartElement, XMLEvent}

import in.dreamlabs.xmlavro.AvroBuilder.unknown
import in.dreamlabs.xmlavro.RichAvro._
import in.dreamlabs.xmlavro.XMLEvents.{addElement, eleStack, removeElement}
import in.dreamlabs.xmlavro.config.XMLConfig
import org.apache.avro.Schema
import org.apache.avro.file.{CodecFactory, DataFileStream, DataFileWriter}
import org.apache.avro.generic.GenericData.Record
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.specific.SpecificDatumWriter
import
import in.dreamlabs.xmlavro.Utils.info
import scala.collection.JavaConverters._
import scala.collection.mutable
import org.apache.commons.io.input.CountingInputStream

/**
  * Created by Royce on 25/01/2017.
  *
  * Streams XML (from a file, stdin, or XML payloads embedded in Avro input
  * records) through a StAX event reader and writes snappy-compressed Avro
  * records, one writer per configured "split" tag.
  */
class AvroBuilder(config: XMLConfig) {
  // Propagate per-run configuration into the global/companion state used by
  // the rest of the pipeline (RichAvro, XNode, XMLDocument read these).
  Utils.debugEnabled = config.debug
  RichAvro.caseSensitive = config.caseSensitive
  RichAvro.ignoreCaseFor =
    config.ignoreCaseFor.asScala.toList.map(element => element.toLowerCase)
  RichAvro.ignoreMissing = config.ignoreMissing
  RichAvro.suppressWarnings = config.suppressWarnings
  XNode.namespaces = config.namespaces
  XMLDocument.config = config

  // One Avro writer + schema per split tag, keyed by the split's element name
  // (an empty-string key acts as a wildcard, rebound to the first element seen
  // in createFromXML). `streams` tracks every output stream for final close.
  private val writers = mutable.Map[String, DataFileWriter[Record]]()
  private val schemas = mutable.Map[String, Schema]()
  private val streams = mutable.ListBuffer[OutputStream]()

  /**
    * Entry point: opens one Avro container file (or stdout) per configured
    * split, then feeds the XML source — either the raw XML input, or each
    * XML blob extracted from an input Avro stream — through createFromXML,
    * and finally flushes and closes every writer and stream.
    */
  def createDatums(): Unit = {
    config.split.forEach { split =>
      val schema = new Schema.Parser().parse(split.avscFile.jfile)
      val datumWriter = new SpecificDatumWriter[Record](schema)
      val fileWriter = new DataFileWriter[Record](datumWriter)
      fileWriter setCodec (CodecFactory snappyCodec)
      // Split output goes to stdout when streaming, else to its own file.
      val avroOut =
        if (split stream) new BufferedOutputStream(System.out)
        else split.avroFile.toFile.bufferedOutput()
      fileWriter create(schema, avroOut)
      streams += avroOut
      writers += split.by -> fileWriter
      schemas += split.by -> schema
    }

    val sourceInput =
      if (config.streamingInput) new BufferedInputStream(System.in)
      else config.xmlFile.toFile.bufferedInput()

    if (config.useAvroInput) {
      // Avro-in mode: each input record carries an XML payload (as bytes)
      // under config.inputAvroKey; convert each payload independently.
      val avroReader = new DataFileStream[GenericRecord](
        sourceInput,
        new GenericDatumReader[GenericRecord]())
      var avroCount = 0
      avroReader.forEach { record =>
        val xmlIn = new BufferedInputStream(
          new ByteArrayInputStream(
            record.get(config.inputAvroKey).asInstanceOf[ByteBuffer].array()))
        // Optional unique id for logging/tracing, addressed as "field.key"
        // into a map-typed field of the input record.
        var uniqueKey = if (config.inputAvroUniqueKey isDefined) {
          val keys = config.inputAvroUniqueKey.get.split('.')
          val valueMap =
            record.get(keys(0)).asInstanceOf[util.HashMap[AnyRef, AnyRef]]
          var found: Option[String] = None
          valueMap.forEach {
            case (key, value) =>
              if (key.toString.equals(keys(1))) {
                found = Some(value.toString)
              }
          }
          found
        } else None
        avroCount += 1
        info(s"Loading avro record #$avroCount for Unique ID: ${uniqueKey}")
        createFromXML(xmlIn, Some(record), uniqueKey)
        info(s"Finished avro record #$avroCount for Unique ID: ${uniqueKey}")
      }
      avroReader.close()
      sourceInput.close()
    } else {
      createFromXML(sourceInput)
    }

    XMLDocument.closeAll()

    writers.values.foreach { writer =>
      writer.flush()
      writer.close()
    }

    streams.foreach(_.close())
  }

  /**
    * Converts one XML stream into Avro records via StAX events.
    *
    * @param xmlIn      the XML byte stream to convert (closed on exit)
    * @param sourceAvro when converting from an Avro input record, the record
    *                   whose mapped fields are copied into each split record
    * @param uniqueKey  optional identifier attached to the XMLDocument for
    *                   error reporting
    */
  def createFromXML(xmlIn: InputStream,
                    sourceAvro: Option[GenericRecord] = None,
                    uniqueKey: Option[String] = None): Unit = {
    // CountingInputStream only feeds the "Processed N Mb" debug progress log.
    val countingStream = new CountingInputStream(xmlIn)
    val reader = XMLInputFactory.newInstance.createXMLEventReader(countingStream)
    var splitRecord: Record = null
    // splitFound: currently inside a split tag; documentFound: inside the
    // configured document root. NOTE(review): documentFound is written but
    // never read in this method — possibly dead state; confirm before removal.
    var splitFound, documentFound: Boolean = false
    // proceed gates record building; it is cleared when a push() is rejected
    // or an exception occurs, and restored at the matching END_ELEMENT.
    var proceed: Boolean = false
    var parentEle: String = ""
    var currentDoc: Option[XMLDocument] = None
    var prevEvent: XMLEvent = null
    var lastPrintMB: Long = 0

    while (reader.hasNext) {
      var event: XMLEvent = null
      try {
        event = reader.nextEvent
        if (Utils.debugEnabled){
          val currentMB = countingStream.getByteCount/1024/1024
          if (currentMB > lastPrintMB){
            Utils.debug(s"Processed ${currentMB} Mb")
            lastPrintMB = currentMB
          }
        }
      } catch {
        // Parse failure from the StAX reader: abort this stream. If a
        // document was in progress, mark it failed first.
        case e: Exception =>
          currentDoc match {
            case None =>
              Utils.log(config.docErrorLevel,
                        s"No XML data received, ${e.getMessage} ")
              return
            case Some(doc) =>
              doc.fail(
                ConversionException(s"Invalid XML received, ${e.getMessage} ",
                                    e),
                wait = true)
              documentFound = false
              currentDoc.get close()
              currentDoc = None
              return
          }
      }
      if (Option(event) isDefined) {
        try {
          // Every event inside a document is also buffered on the document
          // (used for error reporting / replay by XMLDocument).
          if (currentDoc isDefined)
            currentDoc.get add event
          event getEventType match {
            case START_DOCUMENT | END_DOCUMENT => //Ignore
            case START_ELEMENT =>
              // Wildcard split (empty key): bind it to the first element seen.
              if (writers contains "") {
                writers += event.name -> writers("")
                schemas += event.name -> schemas("")
                writers remove ""
                schemas remove ""
              }
              if (config.documentRootTag == event.name) {
                documentFound = true
                proceed = true
                splitFound = false
                currentDoc = Some(XMLDocument(uniqueKey))
                currentDoc.get add event
              }

              if (currentDoc.isDefined && !currentDoc.get.error) {
                if (writers.contains(event.name)) {
                  // NOTE(review): this ConversionException is constructed but
                  // never thrown — nested splits are silently tolerated.
                  // Looks like a missing `throw`; confirm intent.
                  if (splitFound)
                    ConversionException(
                      "Splits cannot be inside each other, they should be completely separated tags")
                  splitFound = true
                  splitRecord = schemas(event name).newRecord
                  XMLEvents.setSchema(schemas(event name), splitRecord)
                  AvroPath.reset()
                  proceed = true
                }

                if (splitFound && proceed) {
                  // push() returns false for elements not present in the
                  // schema, which suspends building until the matching end tag.
                  proceed = event push()
                  parentEle = event.fullName

                  if (event.hasAttributes && proceed) {
                    val record = splitRecord.at(event path)
                    event.attributes foreach {
                      case (xEle, value) =>
                        record.add(xEle, value)
                    }
                  }
                }
              }
            case CHARACTERS =>
              if (splitFound && proceed && currentDoc.isDefined && !currentDoc.get.error && event.hasText) {
                val record = splitRecord.at(event path)
                record.add(event element, event text)
              }
            case END_ELEMENT =>
              // An end tag directly after a start tag (<a></a>) records an
              // empty-string value, unless the path already ends at this tag.
              if (splitFound && proceed && currentDoc.isDefined && !currentDoc.get.error && prevEvent.isStartElement) {
                if (event.path.nonEmpty) {
                  val path = event.path.last.name
                  if (path != event.name) {
                    val record = splitRecord.at(event path)
                    record.add(event element, "")
                  }
                }
              }
              if (currentDoc.isDefined && !currentDoc.get.error) {
                // Resume building when leaving the element that caused the
                // suspension (fullName matches parentEle).
                if (splitFound && (proceed || event.fullName == parentEle)) {
                  proceed = true
                  event pop()
                  if (writers.contains(event.name)) {
                    // Closing a split tag: copy mapped fields from the source
                    // Avro record (if any), then append the finished record.
                    if (sourceAvro isDefined) {
                      config.inputAvroMappings.foreach {
                        case (source, target) =>
                          if ((source != config.inputAvroKey) && !config.inputAvroUniqueKey
                                .contains(source)) {
                            splitRecord.put(target, sourceAvro.get.get(source))
                          }
                      }
                    }
                    val writer = writers(event name)
                    writer append splitRecord
                    Utils.info(
                      s"Writing avro record for ${currentDoc.get.docText} split at ${event.name}")
                    splitFound = false
                  }
                }
              }
            case COMMENT => // Do nothing
            case other => unknown(other.toString, event)
          }
        } catch {
          // Conversion failure for a single event: fail the current document
          // (if any) but keep consuming the stream; without a document the
          // error is fatal.
          case e: Exception =>
            currentDoc match {
              case None => throw new ConversionException(e)
              case Some(doc) =>
                var innerMessage =
                  s"'${event.toString}' after ${prevEvent.toString} at Line: ${event.getLocation.getLineNumber}, Column: ${event.getLocation.getColumnNumber}"
                // NOTE(review): the `if (config.debug) ...` interpolation has
                // no else branch, so it renders as "()" when debug is off —
                // probably wants `else ""`; confirm.
                val message =
                  s"${e.toString}${if (config.debug) "\n" + e.getStackTrace.mkString("\n")} occurred while processing $innerMessage"
                doc.fail(ConversionException(message), wait = true)
            }
            proceed = false
        } finally {
          // Close out the document when its root element ends, whether or
          // not this iteration failed.
          if (event.isEndElement && config.documentRootTag == event.name) {
            documentFound = false
            currentDoc.get close()
            currentDoc = None
          }
          prevEvent = event
        }
      }
    }
    xmlIn.close()
  }

  /**
    * Enrichment of StAX XMLEvent with the name/namespace/attribute/path
    * accessors the conversion loop needs, plus push/pop onto the shared
    * XMLEvents element stack.
    */
  implicit class RichXMLEvent(event: XMLEvent) {

    private val startEle: Option[StartElement] =
      if (event isStartElement)
        Some(event.asStartElement())
      else
        None

    private val endEle: Option[EndElement] =
      if (event isEndElement)
        Some(event.asEndElement())
      else
        None

    // Attributes of a start element in document order, keyed as attribute
    // XNodes; xsi:schemaLocation-style attributes are skipped.
    val attributes: mutable.LinkedHashMap[XNode, String] = {
      val attrMap = mutable.LinkedHashMap.empty[XNode, String]
      if (startEle isDefined) {
        val attrs = startEle.get.getAttributes
        while (attrs.hasNext) {
          val attr = attrs.next().asInstanceOf[Attribute]
          val name = attr.getName
          if (name.getLocalPart.toLowerCase() != "schemalocation")
            attrMap += XNode(name.getLocalPart,
                             name.getNamespaceURI,
                             name.getPrefix,
                             attribute = true) -> attr.getValue
        }
      }
      attrMap
    }

    // Current schema path maintained by XMLEvents for this element.
    def path: List[AvroPath] = XMLEvents.schemaPath.toList

    def hasAttributes: Boolean = attributes nonEmpty

    // Push this element onto the shared stack; returns false when the
    // element has no place in the schema (caller suspends building).
    def push(): Boolean = {
      if (eleStack.isEmpty)
        addElement(XNode(name, nsURI, nsName, attribute = false))
      else addElement(XNode(element, name, nsURI, nsName, attribute = false))
    }

    private def nsURI: String =
      if (startEle isDefined) startEle.get.getName.getNamespaceURI
      else if (endEle isDefined) endEle.get.getName.getNamespaceURI
      else element.nsURI

    private def nsName: String =
      if (startEle isDefined) startEle.get.getName.getPrefix
      else if (endEle isDefined) endEle.get.getName.getPrefix
      else element.nsName

    // Innermost element on the shared stack (used for CHARACTERS events,
    // which carry no name of their own).
    def element: XNode = eleStack.head

    def name: String =
      if (startEle isDefined) startEle.get.getName.getLocalPart
      else if (endEle isDefined) endEle.get.getName.getLocalPart
      else element.name

    def fullName: String = {
      XNode(name, nsURI, nsName, attribute = false).fullName()
    }

    def pop(): Unit =
      removeElement(XNode(name, nsURI, nsName, attribute = false))

    def text: String = event.asCharacters().getData

    // True for non-blank text, or runs of plain spaces (kept so significant
    // whitespace-only content is not dropped).
    def hasText: Boolean = text.trim() != "" || text.matches(" +")
  }

}
| 328 | object AvroBuilder { 329 | private def unknown(message: String, event: XMLEvent) = 330 | Utils.warn(s"WARNING: Unknown $message: $event") 331 | } 332 | -------------------------------------------------------------------------------- /src/test/resources/xml/iam/SAML_response.asvc: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "record", 3 | "name" : "ArtifactResponseType", 4 | "fields" : [ { 5 | "name" : "ID", 6 | "type" : "string", 7 | "source" : "attribute ID" 8 | }, { 9 | "name" : "InResponseTo", 10 | "type" : [ "string", "null" ], 11 | "source" : "attribute InResponseTo" 12 | }, { 13 | "name" : "Version", 14 | "type" : "string", 15 | "source" : "attribute Version" 16 | }, { 17 | "name" : "IssueInstant", 18 | "type" : "string", 19 | "source" : "attribute IssueInstant" 20 | }, { 21 | "name" : "Destination", 22 | "type" : [ "string", "null" ], 23 | "source" : "attribute Destination" 24 | }, { 25 | "name" : "Consent", 26 | "type" : [ "string", "null" ], 27 | "source" : "attribute Consent" 28 | }, { 29 | "name" : "Issuer", 30 | "type" : [ { 31 | "type" : "record", 32 | "name" : "NameIDType", 33 | "fields" : [ { 34 | "name" : "NameQualifier", 35 | "type" : [ "string", "null" ], 36 | "source" : "attribute NameQualifier" 37 | }, { 38 | "name" : "SPNameQualifier", 39 | "type" : [ "string", "null" ], 40 | "source" : "attribute SPNameQualifier" 41 | }, { 42 | "name" : "Format", 43 | "type" : [ "string", "null" ], 44 | "source" : "attribute Format" 45 | }, { 46 | "name" : "SPProvidedID", 47 | "type" : [ "string", "null" ], 48 | "source" : "attribute SPProvidedID" 49 | } ] 50 | }, "null" ], 51 | "source" : "element Issuer" 52 | }, { 53 | "name" : "Signature", 54 | "type" : [ { 55 | "type" : "record", 56 | "name" : "SignatureType", 57 | "fields" : [ { 58 | "name" : "Id", 59 | "type" : [ "string", "null" ], 60 | "source" : "attribute Id" 61 | }, { 62 | "name" : "SignedInfo", 63 | "type" : { 64 | "type" : "record", 65 
| "name" : "SignedInfoType", 66 | "fields" : [ { 67 | "name" : "Id", 68 | "type" : [ "string", "null" ], 69 | "source" : "attribute Id" 70 | }, { 71 | "name" : "CanonicalizationMethod", 72 | "type" : { 73 | "type" : "record", 74 | "name" : "CanonicalizationMethodType", 75 | "fields" : [ { 76 | "name" : "Algorithm", 77 | "type" : "string", 78 | "source" : "attribute Algorithm" 79 | }, { 80 | "name" : "others", 81 | "type" : { 82 | "type" : "map", 83 | "values" : "string" 84 | } 85 | } ] 86 | }, 87 | "source" : "element CanonicalizationMethod" 88 | }, { 89 | "name" : "SignatureMethod", 90 | "type" : { 91 | "type" : "record", 92 | "name" : "SignatureMethodType", 93 | "fields" : [ { 94 | "name" : "Algorithm", 95 | "type" : "string", 96 | "source" : "attribute Algorithm" 97 | }, { 98 | "name" : "HMACOutputLength", 99 | "type" : [ "string", "null" ], 100 | "source" : "element HMACOutputLength" 101 | }, { 102 | "name" : "others", 103 | "type" : { 104 | "type" : "map", 105 | "values" : "string" 106 | } 107 | } ] 108 | }, 109 | "source" : "element SignatureMethod" 110 | }, { 111 | "name" : "Reference", 112 | "type" : { 113 | "type" : "array", 114 | "items" : { 115 | "type" : "record", 116 | "name" : "ReferenceType", 117 | "fields" : [ { 118 | "name" : "Id", 119 | "type" : [ "string", "null" ], 120 | "source" : "attribute Id" 121 | }, { 122 | "name" : "URI", 123 | "type" : [ "string", "null" ], 124 | "source" : "attribute URI" 125 | }, { 126 | "name" : "Type", 127 | "type" : [ "string", "null" ], 128 | "source" : "attribute Type" 129 | }, { 130 | "name" : "Transforms", 131 | "type" : [ { 132 | "type" : "record", 133 | "name" : "TransformsType", 134 | "fields" : [ { 135 | "name" : "Transform", 136 | "type" : { 137 | "type" : "array", 138 | "items" : { 139 | "type" : "record", 140 | "name" : "TransformType", 141 | "fields" : [ { 142 | "name" : "Algorithm", 143 | "type" : "string", 144 | "source" : "attribute Algorithm" 145 | }, { 146 | "name" : "others", 147 | "type" : { 148 | 
"type" : "map", 149 | "values" : "string" 150 | } 151 | }, { 152 | "name" : "XPath", 153 | "type" : [ "string", "null" ], 154 | "source" : "element XPath" 155 | } ] 156 | } 157 | }, 158 | "source" : "element Transform" 159 | } ] 160 | }, "null" ], 161 | "source" : "element Transforms" 162 | }, { 163 | "name" : "DigestMethod", 164 | "type" : { 165 | "type" : "record", 166 | "name" : "DigestMethodType", 167 | "fields" : [ { 168 | "name" : "Algorithm", 169 | "type" : "string", 170 | "source" : "attribute Algorithm" 171 | }, { 172 | "name" : "others", 173 | "type" : { 174 | "type" : "map", 175 | "values" : "string" 176 | } 177 | } ] 178 | }, 179 | "source" : "element DigestMethod" 180 | }, { 181 | "name" : "DigestValue", 182 | "type" : "string", 183 | "source" : "element DigestValue" 184 | } ] 185 | } 186 | }, 187 | "source" : "element Reference" 188 | } ] 189 | }, 190 | "source" : "element SignedInfo" 191 | }, { 192 | "name" : "SignatureValue", 193 | "type" : { 194 | "type" : "record", 195 | "name" : "SignatureValueType", 196 | "fields" : [ { 197 | "name" : "Id", 198 | "type" : [ "string", "null" ], 199 | "source" : "attribute Id" 200 | } ] 201 | }, 202 | "source" : "element SignatureValue" 203 | }, { 204 | "name" : "KeyInfo", 205 | "type" : [ { 206 | "type" : "record", 207 | "name" : "KeyInfoType", 208 | "fields" : [ { 209 | "name" : "Id", 210 | "type" : [ "string", "null" ], 211 | "source" : "attribute Id" 212 | }, { 213 | "name" : "KeyName", 214 | "type" : [ "string", "null" ], 215 | "source" : "element KeyName" 216 | }, { 217 | "name" : "KeyValue", 218 | "type" : [ { 219 | "type" : "record", 220 | "name" : "KeyValueType", 221 | "fields" : [ { 222 | "name" : "DSAKeyValue", 223 | "type" : [ { 224 | "type" : "record", 225 | "name" : "DSAKeyValueType", 226 | "fields" : [ { 227 | "name" : "P", 228 | "type" : "string", 229 | "source" : "element P" 230 | }, { 231 | "name" : "Q", 232 | "type" : "string", 233 | "source" : "element Q" 234 | }, { 235 | "name" : "G", 236 | 
"type" : [ "string", "null" ], 237 | "source" : "element G" 238 | }, { 239 | "name" : "Y", 240 | "type" : "string", 241 | "source" : "element Y" 242 | }, { 243 | "name" : "J", 244 | "type" : [ "string", "null" ], 245 | "source" : "element J" 246 | }, { 247 | "name" : "Seed", 248 | "type" : "string", 249 | "source" : "element Seed" 250 | }, { 251 | "name" : "PgenCounter", 252 | "type" : "string", 253 | "source" : "element PgenCounter" 254 | } ] 255 | }, "null" ], 256 | "source" : "element DSAKeyValue" 257 | }, { 258 | "name" : "RSAKeyValue", 259 | "type" : [ { 260 | "type" : "record", 261 | "name" : "RSAKeyValueType", 262 | "fields" : [ { 263 | "name" : "Modulus", 264 | "type" : "string", 265 | "source" : "element Modulus" 266 | }, { 267 | "name" : "Exponent", 268 | "type" : "string", 269 | "source" : "element Exponent" 270 | } ] 271 | }, "null" ], 272 | "source" : "element RSAKeyValue" 273 | }, { 274 | "name" : "others", 275 | "type" : { 276 | "type" : "map", 277 | "values" : "string" 278 | } 279 | } ] 280 | }, "null" ], 281 | "source" : "element KeyValue" 282 | }, { 283 | "name" : "RetrievalMethod", 284 | "type" : [ { 285 | "type" : "record", 286 | "name" : "RetrievalMethodType", 287 | "fields" : [ { 288 | "name" : "URI", 289 | "type" : [ "string", "null" ], 290 | "source" : "attribute URI" 291 | }, { 292 | "name" : "Type", 293 | "type" : [ "string", "null" ], 294 | "source" : "attribute Type" 295 | }, { 296 | "name" : "Transforms", 297 | "type" : [ "TransformsType", "null" ], 298 | "source" : "element Transforms" 299 | } ] 300 | }, "null" ], 301 | "source" : "element RetrievalMethod" 302 | }, { 303 | "name" : "X509Data", 304 | "type" : [ { 305 | "type" : "record", 306 | "name" : "X509DataType", 307 | "fields" : [ { 308 | "name" : "X509IssuerSerial", 309 | "type" : [ { 310 | "type" : "record", 311 | "name" : "X509IssuerSerialType", 312 | "fields" : [ { 313 | "name" : "X509IssuerName", 314 | "type" : "string", 315 | "source" : "element X509IssuerName" 316 | }, { 
317 | "name" : "X509SerialNumber", 318 | "type" : "string", 319 | "source" : "element X509SerialNumber" 320 | } ] 321 | }, "null" ], 322 | "source" : "element X509IssuerSerial" 323 | }, { 324 | "name" : "X509SKI", 325 | "type" : [ "string", "null" ], 326 | "source" : "element X509SKI" 327 | }, { 328 | "name" : "X509SubjectName", 329 | "type" : [ "string", "null" ], 330 | "source" : "element X509SubjectName" 331 | }, { 332 | "name" : "X509Certificate", 333 | "type" : [ "string", "null" ], 334 | "source" : "element X509Certificate" 335 | }, { 336 | "name" : "X509CRL", 337 | "type" : [ "string", "null" ], 338 | "source" : "element X509CRL" 339 | }, { 340 | "name" : "others", 341 | "type" : { 342 | "type" : "map", 343 | "values" : "string" 344 | } 345 | } ] 346 | }, "null" ], 347 | "source" : "element X509Data" 348 | }, { 349 | "name" : "PGPData", 350 | "type" : [ { 351 | "type" : "record", 352 | "name" : "PGPDataType", 353 | "fields" : [ { 354 | "name" : "PGPKeyID", 355 | "type" : [ "string", "null" ], 356 | "source" : "element PGPKeyID" 357 | }, { 358 | "name" : "PGPKeyPacket0", 359 | "type" : [ "string", "null" ], 360 | "source" : "element PGPKeyPacket" 361 | }, { 362 | "name" : "others", 363 | "type" : { 364 | "type" : "map", 365 | "values" : "string" 366 | } 367 | } ] 368 | }, "null" ], 369 | "source" : "element PGPData" 370 | }, { 371 | "name" : "SPKIData", 372 | "type" : [ { 373 | "type" : "record", 374 | "name" : "SPKIDataType", 375 | "fields" : [ { 376 | "name" : "SPKISexp", 377 | "type" : "string", 378 | "source" : "element SPKISexp" 379 | }, { 380 | "name" : "others", 381 | "type" : { 382 | "type" : "map", 383 | "values" : "string" 384 | } 385 | } ] 386 | }, "null" ], 387 | "source" : "element SPKIData" 388 | }, { 389 | "name" : "MgmtData", 390 | "type" : [ "string", "null" ], 391 | "source" : "element MgmtData" 392 | }, { 393 | "name" : "others", 394 | "type" : { 395 | "type" : "map", 396 | "values" : "string" 397 | } 398 | } ] 399 | }, "null" ], 400 | 
"source" : "element KeyInfo" 401 | }, { 402 | "name" : "Object", 403 | "type" : { 404 | "type" : "array", 405 | "items" : { 406 | "type" : "record", 407 | "name" : "ObjectType", 408 | "fields" : [ { 409 | "name" : "Id", 410 | "type" : [ "string", "null" ], 411 | "source" : "attribute Id" 412 | }, { 413 | "name" : "MimeType", 414 | "type" : [ "string", "null" ], 415 | "source" : "attribute MimeType" 416 | }, { 417 | "name" : "Encoding", 418 | "type" : [ "string", "null" ], 419 | "source" : "attribute Encoding" 420 | }, { 421 | "name" : "others", 422 | "type" : { 423 | "type" : "map", 424 | "values" : "string" 425 | } 426 | } ] 427 | } 428 | }, 429 | "source" : "element Object" 430 | } ] 431 | }, "null" ], 432 | "source" : "element Signature" 433 | }, { 434 | "name" : "Extensions", 435 | "type" : [ { 436 | "type" : "record", 437 | "name" : "ExtensionsType", 438 | "fields" : [ { 439 | "name" : "others", 440 | "type" : { 441 | "type" : "map", 442 | "values" : "string" 443 | } 444 | } ] 445 | }, "null" ], 446 | "source" : "element Extensions" 447 | }, { 448 | "name" : "Status", 449 | "type" : { 450 | "type" : "record", 451 | "name" : "StatusType", 452 | "fields" : [ { 453 | "name" : "StatusCode", 454 | "type" : { 455 | "type" : "record", 456 | "name" : "StatusCodeType", 457 | "fields" : [ { 458 | "name" : "Value", 459 | "type" : "string", 460 | "source" : "attribute Value" 461 | }, { 462 | "name" : "StatusCode", 463 | "type" : [ "StatusCodeType", "null" ], 464 | "source" : "element StatusCode" 465 | } ] 466 | }, 467 | "source" : "element StatusCode" 468 | }, { 469 | "name" : "StatusMessage", 470 | "type" : [ "string", "null" ], 471 | "source" : "element StatusMessage" 472 | }, { 473 | "name" : "StatusDetail", 474 | "type" : [ { 475 | "type" : "record", 476 | "name" : "StatusDetailType", 477 | "fields" : [ { 478 | "name" : "others", 479 | "type" : { 480 | "type" : "map", 481 | "values" : "string" 482 | } 483 | } ] 484 | }, "null" ], 485 | "source" : "element 
StatusDetail" 486 | } ] 487 | }, 488 | "source" : "element Status" 489 | }, { 490 | "name" : "others", 491 | "type" : { 492 | "type" : "map", 493 | "values" : "string" 494 | } 495 | } ] 496 | } -------------------------------------------------------------------------------- /src/test/resources/temp: -------------------------------------------------------------------------------- 1 | 2 | CY_NAV_00211 3 | CY_20150901145144 4 | 2015-09-01T14:51:44+02:00 5 | CY_NAV 6 | I2029 7 | International Basket Level Sales 8 | POSLog 9 | 1 10 | 1 11 | 54.00 12 | 13 | 9482 14 |
AKROPOLIS 15 | 16 |
17 |
18 | 19 | 20 | 19 21 | 34 22 | 2015-08-27T20:07:22+02:00 23 | 2015-08-27T20:07:06+02:00 24 | 2015-08-27 25 | 101 26 | EUR 27 | 000000012 28 | 0 29 | 2015-08-27T20:07:06+02:00 30 | 20318178 31 | T17/NU 32 | 20318178 33 | 34 | 30.00 35 | 27.00 36 | 54.00 37 | 2.00000 38 | 0 39 | 6.00 40 | 1 41 | 10.00000 42 | 43 | 13:5 44 | 1 45 | 2 46 | 60.00 47 | 2.00000 48 | 49 | 50 | 0 51 | 54.00 52 | 8.62 53 | 19.00000 54 | S 55 | CYS 56 | CYVAT 57 | 58 | 59 | 54.00 60 | 54.00 61 | False 62 | 1 63 | 0 64 | 11:3 65 | 2607 66 | 2801 67 | 68 | 69 | 70 | 1 71 | 2015-08-27T20:07:06+02:00 72 | 1 73 | 54.00 74 | 8.62 75 | 19.00000 76 | S 77 | CYS 78 | CYVAT 79 | 80 | 81 | 82 | 2 83 | 2015-08-27T20:07:06+02:00 84 | 138 85 | 54.00 86 | 15 87 | 88 | 89 | 90 | 54.00 91 | 45.38 92 | 8.62 93 | 94 | 95 | 2015-08-27T20:07:06+02:00 96 | 97 | 98 |
99 |
100 |
101 |
102 | 103 | CY_NAV_00211 104 | CY_20150901145144 105 | 2015-09-01T14:51:44+02:00 106 | CY_NAV 107 | I2029 108 | International Basket Level Sales 109 | POSLog 110 | 1 111 | 1 112 | 54.00 113 | 114 | 9482 115 |
AKROPOLIS 116 | 117 |
118 |
119 | 120 | 121 | 19 122 | 34 123 | 2015-08-27T20:07:22+02:00 124 | 2015-08-27T20:07:06+02:00 125 | 2015-08-27 126 | 101 127 | EUR 128 | 000000012 129 | 0 130 | 2015-08-27T20:07:06+02:00 131 | 20318178 132 | T17/NU 133 | 20318178 134 | 135 | 30.00 136 | 27.00 137 | 54.00 138 | 2.00000 139 | 0 140 | 6.00 141 | 1 142 | 10.00000 143 | 144 | 13:5 145 | 1 146 | 2 147 | 60.00 148 | 2.00000 149 | 150 | 151 | 0 152 | 54.00 153 | 8.62 154 | 19.00000 155 | S 156 | CYS 157 | CYVAT 158 | 159 | 160 | 54.00 161 | 54.00 162 | False 163 | 1 164 | 0 165 | 11:3 166 | 2607 167 | 2801 168 | 169 | 170 | 171 | 1 172 | 2015-08-27T20:07:06+02:00 173 | 1 174 | 54.00 175 | 8.62 176 | 19.00000 177 | S 178 | CYS 179 | CYVAT 180 | 181 | 182 | 183 | 2 184 | 2015-08-27T20:07:06+02:00 185 | 138 186 | 54.00 187 | 15 188 | 189 | 190 | 191 | 54.00 192 | 45.38 193 | 8.62 194 | 195 | 196 | 2015-08-27T20:07:06+02:00 197 | 198 | 199 |
200 |
201 |
202 |
203 | 204 | CY_NAV_00211 205 | CY_20150901145144 206 | 2015-09-01T14:51:44+02:00 207 | CY_NAV 208 | I2029 209 | International Basket Level Sales 210 | POSLog 211 | 1 212 | 1 213 | 54.00 214 | 215 | 9482 216 |
AKROPOLIS 217 | 218 |
219 |
220 | 221 | 222 | 19 223 | 34 224 | 2015-08-27T20:07:22+02:00 225 | 2015-08-27T20:07:06+02:00 226 | 2015-08-27 227 | 101 228 | EUR 229 | 000000012 230 | 0 231 | 2015-08-27T20:07:06+02:00 232 | 20318178 233 | T17/NU 234 | 20318178 235 | 236 | 30.00 237 | 27.00 238 | 54.00 239 | 2.00000 240 | 0 241 | 6.00 242 | 1 243 | 10.00000 244 | 245 | 13:5 246 | 1 247 | 2 248 | 60.00 249 | 2.00000 250 | 251 | 252 | 0 253 | 54.00 254 | 8.62 255 | 19.00000 256 | S 257 | CYS 258 | CYVAT 259 | 260 | 261 | 54.00 262 | 54.00 263 | False 264 | 1 265 | 0 266 | 11:3 267 | 2607 268 | 2801 269 | 270 | 271 | 272 | 1 273 | 2015-08-27T20:07:06+02:00 274 | 1 275 | 54.00 276 | 8.62 277 | 19.00000 278 | S 279 | CYS 280 | CYVAT 281 | 282 | 283 | 284 | 2 285 | 2015-08-27T20:07:06+02:00 286 | 138 287 | 54.00 288 | 15 289 | 290 | 291 | 292 | 54.00 293 | 45.38 294 | 8.62 295 | 296 | 297 | 2015-08-27T20:07:06+02:00 298 | 299 | 300 |
301 |
302 |
303 |
304 | 305 | CY_NAV_00211 306 | CY_20150901145144 307 | 2015-09-01T14:51:44+02:00 308 | CY_NAV 309 | I2029 310 | International Basket Level Sales 311 | POSLog 312 | 1 313 | 1 314 | 54.00 315 | 316 | 9482 317 |
AKROPOLIS 318 | 319 |
320 |
321 | 322 | 323 | 19 324 | 34 325 | 2015-08-27T20:07:22+02:00 326 | 2015-08-27T20:07:06+02:00 327 | 2015-08-27 328 | 101 329 | EUR 330 | 000000012 331 | 0 332 | 2015-08-27T20:07:06+02:00 333 | 20318178 334 | T17/NU 335 | 20318178 336 | 337 | 30.00 338 | 27.00 339 | 54.00 340 | 2.00000 341 | 0 342 | 6.00 343 | 1 344 | 10.00000 345 | 346 | 13:5 347 | 1 348 | 2 349 | 60.00 350 | 2.00000 351 | 352 | 353 | 0 354 | 54.00 355 | 8.62 356 | 19.00000 357 | S 358 | CYS 359 | CYVAT 360 | 361 | 362 | 54.00 363 | 54.00 364 | False 365 | 1 366 | 0 367 | 11:3 368 | 2607 369 | 2801 370 | 371 | 372 | 373 | 1 374 | 2015-08-27T20:07:06+02:00 375 | 1 376 | 54.00 377 | 8.62 378 | 19.00000 379 | S 380 | CYS 381 | CYVAT 382 | 383 | 384 | 385 | 2 386 | 2015-08-27T20:07:06+02:00 387 | 138 388 | 54.00 389 | 15 390 | 391 | 392 | 393 | 54.00 394 | 45.38 395 | 8.62 396 | 397 | 398 | 2015-08-27T20:07:06+02:00 399 | 400 | 401 |
402 |
403 |
404 |
405 | --------------------------------------------------------------------------------