├── src
├── test
│ └── resources
│ │ ├── books.avro
│ │ ├── xml
│ │ └── iam
│ │ │ ├── ArtifactResponse.avro
│ │ │ ├── AuthnRequest.avro
│ │ │ ├── AuthnRequest.xml
│ │ │ ├── ArtifactResponse.xml
│ │ │ └── SAML_response.asvc
│ │ ├── avsc.yml
│ │ ├── books1.xsd
│ │ ├── avro.yml
│ │ ├── both.yml
│ │ ├── old_books.json
│ │ ├── books.xml.bkp
│ │ ├── books.xsd
│ │ ├── new_books.json
│ │ ├── book.avsc
│ │ ├── books.xml
│ │ ├── books.avsc
│ │ ├── old_books.avsc
│ │ ├── new_books.avsc
│ │ ├── new_books2.avsc
│ │ └── temp
└── main
│ ├── scala
│ └── in
│ │ └── dreamlabs
│ │ └── xmlavro
│ │ ├── Validator.scala
│ │ ├── Converter.scala
│ │ ├── config
│ │ ├── ArgParse.scala
│ │ ├── ConfigParser.scala
│ │ └── Config.scala
│ │ ├── XMLEvents.scala
│ │ ├── XMLDocument.scala
│ │ ├── RichAvro.scala
│ │ ├── Supporters.scala
│ │ └── AvroBuilder.scala
│ └── python
│ └── avsc_fix.py
├── settings.gradle
├── .gitignore
├── gradle
└── wrapper
│ └── gradle-wrapper.properties
├── example
├── config.yml
├── books.xml
├── books.xsd
└── books.avsc
├── Dockerfile
├── gradlew.bat
├── gradlew
├── README.md
└── LICENSE
/src/test/resources/books.avro:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'xml-avro'
--------------------------------------------------------------------------------
/src/test/resources/xml/iam/ArtifactResponse.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeethanadhP/xml-avro/HEAD/src/test/resources/xml/iam/ArtifactResponse.avro
--------------------------------------------------------------------------------
/src/test/resources/xml/iam/AuthnRequest.avro:
--------------------------------------------------------------------------------
1 | 0 0Haaf23196-1773-2113-474a-fe114412ab722.0&2004-12-05T09:21:59 furn:oasis:names:tc:SAML:2.0:nameid-format:transient
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | *.class
3 | *.iml
4 | *.idea
5 | tmp
6 |
7 | # Package Files #
8 | *.jar
9 | *.war
10 | *.ear
11 | /xsd/
12 | /bin/
13 | /.gradle/
14 | /build/
15 | /out/
16 |
17 | example/books\.avro
18 | /.project
19 |
--------------------------------------------------------------------------------
/src/test/resources/avsc.yml:
--------------------------------------------------------------------------------
1 | dynamic: true
2 | dynamicSource: ENVIRONMENT
3 | debug: true
4 | baseDir: "src/test/resources"
5 | namespaces: true
6 | XSD:
7 | xsdFile: "books.xsd"
8 | avscFile: "books.avsc"
9 | rebuildChoice: true
10 |
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/src/test/resources/books1.xsd:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Mon Jul 22 11:43:36 CEST 2019
2 | distributionBase=GRADLE_USER_HOME
3 | distributionPath=wrapper/dists
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.3-all.zip
7 |
--------------------------------------------------------------------------------
/example/config.yml:
--------------------------------------------------------------------------------
1 | baseDir: "example"
2 | debug: false
3 | XSD:
4 | xsdFile: "books.xsd"
5 | avscFile: "books.avsc"
6 | stringTimestamp: true
7 | attributePrefix: "_"
8 |
9 | XML:
10 | xmlInput: "books.xml"
11 | avroOutput: "books.avro"
12 | documentRootTag: "books"
13 | avscFile: "books.avsc"
14 | validationXSD: "books.xsd"
15 | ignoreMissing: false
16 | caseSensitive: true
--------------------------------------------------------------------------------
/src/test/resources/avro.yml:
--------------------------------------------------------------------------------
1 | dynamic: true
2 | dynamicSource: ENVIRONMENT
3 | debug: false
4 | baseDir: "test/resources"
5 | namespaces: true
6 |
7 | XML:
8 | xmlFile: "books.xml"
9 | avscFile: "books.avsc"
10 | avroFile: "books.avro"
11 | validationXSD: "books.xsd"
12 | splitBy: "book"
13 | ignoreWarnings: true
14 | streamingInput: true
15 | caseSensitive: true
16 | ignoreCaseFor:
17 | - "Something"
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/src/test/resources/both.yml:
--------------------------------------------------------------------------------
1 | dynamic: true
2 | dynamicSource: ENVIRONMENT
3 | debug: false
4 | baseDir: "test/resources"
5 | namespaces: true
6 | XSD:
7 | xsdFile: "books.xsd"
8 | avscFile: "books.avsc"
9 | rebuildChoice: true
10 |
11 | XML:
12 | xmlFile: "books.xml"
13 | avscFile: "books.avsc"
14 | avroFile: "books.avro"
15 | validationXSD: "books.xsd"
16 | splitBy: "book"
17 | ignoreWarnings: true
18 | streamingInput: true
19 | caseSensitive: true
20 | ignoreCaseFor:
21 | - "Something"
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/test/resources/xml/iam/AuthnRequest.xml:
--------------------------------------------------------------------------------
1 |
9 | https://sp.example.com/SAML2
10 |
13 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Stage 1: build the xml-avro fat jar with Gradle from a tagged GitHub release.
FROM gradle:6.3.0 AS build
ARG VERSION
ENV VERSION ${VERSION:-1.8.2}

# Fetch and unpack the tagged source archive into the Gradle user's home.
# The sed rewrites plain-http repository URLs in build.gradle to https
# (presumably to satisfy repositories that reject http — confirm still needed).
RUN cd /tmp && \
    wget https://github.com/GeethanadhP/xml-avro/archive/${VERSION}.zip && \
    unzip ${VERSION}.zip && \
    cp -a /tmp/xml-avro-${VERSION}/* /home/gradle/ && \
    sed -i 's/http:/https:/g' /home/gradle/build.gradle && \
    chown -R gradle:gradle /home/gradle


WORKDIR /home/gradle
RUN gradle build --no-daemon

# Stage 2: run the built jar on a slim JRE-only image.
FROM openjdk:8-jre-slim
ARG VERSION
ENV VERSION ${VERSION:-1.8.2}

RUN mkdir /app
WORKDIR /app

# Copy only the shaded ("all") jar out of the build stage.
COPY --from=build /home/gradle/build/libs/xml-avro-all-${VERSION}.jar /app/xml-avro.jar

# Expects a config.yml in the working directory at runtime.
CMD ["java", "-XX:+UnlockExperimentalVMOptions", "-XX:+UseCGroupMemoryLimitForHeap", "-Djava.security.egd=file:/dev/./urandom","-jar","/app/xml-avro.jar", "-c", "config.yml"]
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/Validator.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro
2 |
3 | import javax.xml.XMLConstants
4 | import javax.xml.transform.stream.StreamSource
5 | import javax.xml.validation.SchemaFactory
6 | import org.xml.sax.SAXException
7 |
object Validator {

  /**
    * Validates `xmlFile` against the W3C XML Schema in `xsdFile`.
    *
    * @param xmlFile path of the XML document to validate
    * @param xsdFile path of the XSD to validate against
    * @return false when validation raises a SAXException; true otherwise.
    *         NOTE(review): non-SAX failures (e.g. IO errors) are printed but
    *         still yield true — confirm that best-effort behaviour is intended.
    */
  def validate(xmlFile: String, xsdFile: String): Boolean = {
    try {
      val schema = SchemaFactory
        .newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
        // BUG FIX: was hard-coded to "i180.xsd", silently ignoring xsdFile.
        .newSchema(new StreamSource(xsdFile))
      val validator = schema.newValidator()
      validator.validate(new StreamSource(xmlFile))
    } catch {
      case ex: SAXException => ex.printStackTrace(); return false
      case ex: Exception => ex.printStackTrace()
    }
    true
  }

  def main(args: Array[String]) {
    println(validate("i180.xml", "i180.xsd"))
  }
}
--------------------------------------------------------------------------------
/example/books.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 | Brandon Sanderson
6 | Mistborn
7 | Fantasy
8 | 50
9 | 2006-12-17T09:30:47.0Z
10 |
11 | Wonderful
12 | I love the plot twist and the new magic
13 |
14 |
15 | Unbelievable twist
16 | The best book i ever read
17 |
18 | 10
19 |
20 |
21 | Brandon Sanderson
22 | Way of Kings
23 | Fantasy
24 | 50
25 | 2006-12-17T09:30:47.0Z
26 |
27 |
28 |
29 |
30 |
31 |
32 | 10
33 |
34 |
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/Converter.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro
2 |
3 | import java.io._
4 |
5 | import in.dreamlabs.xmlavro.config.{Config, ConfigParser, XMLConfig, XSDConfig}
6 |
7 | /**
8 | * Created by Royce on 22/12/2016.
9 | */
class Converter(val config: Config) {

  // Constructor-time pipeline: run the XSD -> AVSC conversion first (when
  // configured), then the XML -> Avro conversion.
  config.XSD.foreach(runSchemaConversion)
  config.XML.foreach { xmlConf =>
    if (!xmlConf.streamingInput)
      Utils.info(s"Converting: ${xmlConf.xmlFile} -> ${xmlConf.avroFile}")
    runDatumConversion(xmlConf)
  }

  /** Builds the Avro schema (.avsc) from the configured XSD. */
  @throws[IOException]
  private def runSchemaConversion(xsdConf: XSDConfig): Unit = {
    Utils.info(s"Converting: ${xsdConf.xsdFile} -> ${xsdConf.avscFile}")
    SchemaBuilder(xsdConf).createSchema()
  }

  /** Converts the configured XML input into Avro datums, timed via Utils.profile. */
  private def runDatumConversion(xmlConf: XMLConfig): Unit =
    Utils.profile("Avro Conversion") {
      new AvroBuilder(xmlConf).createDatums()
    }
}
35 |
object Converter {

  /**
    * CLI entry point: parses the arguments into a ConfigParser and runs the
    * conversion. Exits with status 1 (after printing usage) on bad arguments.
    */
  @throws[IOException]
  def main(args: Array[String]): Unit = {
    val parser: ConfigParser =
      try {
        if (args.isEmpty)
          throw new IllegalArgumentException("No Arguments specified")
        else ConfigParser(args)
      } catch {
        case e: IllegalArgumentException =>
          Utils.log("ERROR",
            "XML Avro converter\nError: " + e.getMessage + "\n\n" + ConfigParser.USAGE + "\n")
          // sys.exit returns Nothing, so `parser` is soundly typed as
          // ConfigParser — the previous System.exit(1) left this val typed
          // Any and required an unsafe asInstanceOf cast below.
          sys.exit(1)
      }
    Converter(parser)
  }

  def apply(config: ConfigParser): Converter = new Converter(config.config)
}
54 |
--------------------------------------------------------------------------------
/src/test/resources/xml/iam/ArtifactResponse.xml:
--------------------------------------------------------------------------------
1 |
7 |
8 |
9 |
10 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | Hello, World!
21 |
22 |
23 |
25 |
26 |
35 | https://sp.example.com/SAML2
36 |
39 |
40 |
--------------------------------------------------------------------------------
/src/test/resources/old_books.json:
--------------------------------------------------------------------------------
1 | {
2 | "book": [
3 | {
4 | "id": "b001",
5 | "others": {
6 | "ot:random": "true"
7 | },
8 | "author": "Brandon Sanderson",
9 | "title": "Mistborn",
10 | "genre": "Fantasy",
11 | "price": [],
12 | "pub_date": 1166347847000,
13 | "review": [
14 | {
15 | "title": "Unbeliveable twitst",
16 | "content": null
17 | },
18 | {
19 | "title": null,
20 | "content": "The best book i ever read"
21 | }
22 | ],
23 | "type0": [],
24 | "type2": [],
25 | "type6": null,
26 | "sold": []
27 | },
28 | {
29 | "id": "b002",
30 | "others": {
31 | "alias3": "A3 Angleso and Demenso",
32 | "alias2": "\r\n
A2 Angleso and Demenso<\/title>\r\n "
33 | },
34 | "author": "Dan Brown",
35 | "title": "Angels and Demons",
36 | "genre": "Mystery Thriller",
37 | "price": [],
38 | "pub_date": 1040117447000,
39 | "review": [
40 | {
41 | "title": "Fast paced mystery",
42 | "content": null
43 | },
44 | {
45 | "title": null,
46 | "content": "a good one i would say"
47 | }
48 | ],
49 | "type0": [
50 | {
51 | "alias": null,
52 | "website": {
53 | "url": []
54 | }
55 | }
56 | ],
57 | "type2": [],
58 | "type6": null,
59 | "sold": null
60 | },
61 | {
62 | "id": "b003",
63 | "others": {},
64 | "author": "Dan Brown",
65 | "title": "Digital Fortress",
66 | "genre": "Mystery Thriller",
67 | "price": null,
68 | "pub_date": 1071653447000,
69 | "review": [
70 | {
71 | "title": "Best SciFi Thriller3",
72 | "content": null
73 | }
74 | ],
75 | "type0": [
76 | {
77 | "alias": {
78 | "title": "Encryto",
79 | "language": []
80 | },
81 | "website": null
82 | },
83 | {
84 | "alias": null,
85 | "website": {
86 | "url": []
87 | }
88 | }
89 | ],
90 | "type2": [],
91 | "type6": null,
92 | "sold": []
93 | }
94 | ]
95 | }
--------------------------------------------------------------------------------
/example/books.xsd:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/src/test/resources/books.xml.bkp:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | Brandon Sanderson
7 | Mistborn
8 | Fantasy
9 | 50
10 | 2006-12-17T09:30:47.0Z
11 |
12 | Wonderful
13 | I love the plot twitst and the new magic
14 |
15 |
16 | Unbeliveable twitst
17 | The best book i ever read
18 |
19 | 10
20 |
21 |
22 | Dan Brown
23 | Angels and Demons
24 | Mystery Thriller
25 | 52
26 | 2002-12-17T09:30:47.0Z
27 |
28 | Good Thriller
29 |
30 |
31 | Fast paced mystery
32 | a good one i would say
33 |
34 |
35 | A2 Angleso and Demenso
36 |
37 | A3 Angleso and Demenso
38 |
39 | www.danbrown.com
40 |
41 |
42 |
43 | Dan Brown
44 | Digital Fortress
45 | Mystery Thriller
46 | 2003-12-17T09:30:47.0Z
47 |
48 | Best SciFi Thriller
49 |
50 |
51 | Best SciFi Thriller2
52 |
53 |
54 | Best SciFi Thriller3
55 |
56 |
57 | Encryto
58 | Italian
59 | French
60 |
61 |
62 | www.danbrown.com
63 |
64 | 23
65 |
66 |
67 |
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem NOTE: Gradle-generated wrapper script — regenerate via the `wrapper`
@rem task rather than editing by hand.

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto init

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:init
@rem Get command-line arguments, handling Windows variants

if not "%OS%" == "Windows_NT" goto win9xME_args

:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2

:win9xME_args_slurp
if "x%~1" == "x" goto execute

set CMD_LINE_ARGS=%*

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
--------------------------------------------------------------------------------
/src/test/resources/books.xsd:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/src/test/resources/new_books.json:
--------------------------------------------------------------------------------
1 | {
2 | "book": [
3 | {
4 | "id": "b001",
5 | "others": {
6 | "ot:random": "true"
7 | },
8 | "author": "Brandon Sanderson",
9 | "title": "Mistborn",
10 | "genre": "Fantasy",
11 | "price": [
12 | {
13 | "currency": null,
14 | "text_value": 50.0
15 | }
16 | ],
17 | "pub_date": 1166347847000,
18 | "review": [
19 | {
20 | "title": "Wonderful",
21 | "content": "I love the plot twitst and the new magic"
22 | },
23 | {
24 | "title": "Unbeliveable twitst",
25 | "content": "The best book i ever read"
26 | }
27 | ],
28 | "type0": [],
29 | "type2": [],
30 | "type6": null,
31 | "sold": "10"
32 | },
33 | {
34 | "id": "b002",
35 | "others": {},
36 | "author": "Dan Brown",
37 | "title": "Angels and Demons",
38 | "genre": "Mystery Thriller",
39 | "price": [
40 | {
41 | "currency": "EUR",
42 | "text_value": null
43 | },
44 | {
45 | "currency": null,
46 | "text_value": 52.0
47 | }
48 | ],
49 | "pub_date": 1040117447000,
50 | "review": [
51 | {
52 | "title": "Good Thriller",
53 | "content": null
54 | },
55 | {
56 | "title": "Fast paced mystery",
57 | "content": null
58 | },
59 | {
60 | "title": null,
61 | "content": "a good one i would say"
62 | }
63 | ],
64 | "type0": [
65 | {
66 | "alias": null,
67 | "website": {
68 | "url": "www.danbrown.com"
69 | }
70 | }
71 | ],
72 | "type2": [
73 | {
74 | "type4": [
75 | {
76 | "alias2": {
77 | "title": "A2 Angleso and Demenso",
78 | "language": []
79 | },
80 | "website2": null
81 | }
82 | ]
83 | }
84 | ],
85 | "type6": {
86 | "alias3": "A3 Angleso and Demenso"
87 | },
88 | "sold": []
89 | },
90 | {
91 | "id": "b003",
92 | "others": {},
93 | "author": "Dan Brown",
94 | "title": "Digital Fortress",
95 | "genre": "Mystery Thriller",
96 | "price": [],
97 | "pub_date": 1071653447000,
98 | "review": [
99 | {
100 | "title": "Best SciFi Thriller",
101 | "content": null
102 | },
103 | {
104 | "title": "Best SciFi Thriller2",
105 | "content": null
106 | },
107 | {
108 | "title": "Best SciFi Thriller3",
109 | "content": null
110 | }
111 | ],
112 | "type0": [
113 | {
114 | "alias": {
115 | "title": "Encryto",
116 | "language": []
117 | },
118 | "website": null
119 | },
120 | {
121 | "alias": {
122 | "title": null,
123 | "language": "French"
124 | },
125 | "website": null
126 | },
127 | {
128 | "alias": null,
129 | "website": {
130 | "url": "www.danbrown.com"
131 | }
132 | }
133 | ],
134 | "type2": [],
135 | "type6": null,
136 | "sold": "23"
137 | }
138 | ]
139 | }
140 |
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/config/ArgParse.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro.config
2 |
3 | import javax.xml.namespace.QName
4 |
5 | import scala.collection.immutable.List
6 | import scala.collection.mutable
7 | import scala.reflect.io.Path
8 | import scala.reflect.runtime.universe._
9 |
10 | /**
11 | * Created by Royce on 02/02/2017.
12 | */
class ArgParse(args: Seq[String]) {

  // Maps each option name (long form without leading dashes, or a one-char
  // short form) to the list of value tokens that followed it.
  private val argsMap = {
    val map = mutable.Map[String, List[String]]()
    val len = args.length
    var i = 0
    while (i < len) {
      val arg = args(i)
      // An option marker is either "--long" or exactly two characters "-x".
      if (arg.startsWith("--") || (arg.startsWith("-") && arg.length == 2)) {
        val name = arg stripPrefix "-" stripPrefix "-"
        val values = mutable.ListBuffer[String]()
        // Greedily consume following tokens until the next option marker.
        // NOTE(review): a negative numeric value (e.g. "-5") would be taken
        // as an option marker here — confirm no option expects one.
        while (i + 1 < len && !args(i + 1).startsWith("-")) {
          i += 1
          values += args(i)
        }
        map += (name -> values.toList)
      }
      i += 1
    }
    map
  }

  /**
    * Looks up option `name` (or its short form) and converts its value(s) to `T`.
    *
    * @return Some(converted value) when the option was supplied, None otherwise
    * @throws IllegalArgumentException on a wrong value count or unparsable value
    */
  def opt[T: TypeTag](name: String, short: Char): Option[T] = {
    if (argsMap.contains(name) || argsMap.contains(short + "")) {
      val key = if (argsMap contains name) name else short + ""
      val values = argsMap(key)
      try
        Some(value[T](values))
      catch {
        case e: IllegalArgumentException => throw new IllegalArgumentException(s"${e.getMessage} for $key", e)
      }
    } else None
  }

  // Converts an option's raw value list to the requested target type.
  private def value[T: TypeTag](original: List[String]): T = {
    typeOf[T] match {
      case t if t =:= typeOf[String] => original.fetch().asInstanceOf[T]
      case t if t =:= typeOf[Int] => original.fetch().toInt.asInstanceOf[T]
      case t if t =:= typeOf[Double] => original.fetch().toDouble.asInstanceOf[T]
      case t if t =:= typeOf[Boolean] => original.fetch().toBoolean.asInstanceOf[T]
      case t if t =:= typeOf[Path] => Path(original.fetch()).asInstanceOf[T]
      case t if t =:= typeOf[List[String]] => original.validate().asInstanceOf[T]
      case t if t =:= typeOf[List[Int]] => original.validate().map(value => value.toInt).asInstanceOf[T]
      case t if t =:= typeOf[List[Double]] => original.validate().map(value => value.toDouble).asInstanceOf[T]
      case t if t =:= typeOf[List[Boolean]] => original.validate().map(value => value.toBoolean).asInstanceOf[T]
      case t if t =:= typeOf[List[Path]] => original.validate().map(value => Path(value)).asInstanceOf[T]
      case t if t =:= typeOf[QName] => QName.valueOf(original.fetch()).asInstanceOf[T]
      case other => throw new IllegalArgumentException(s"Type $other is not yet supported")
    }
  }

  /**
    * Looks up a boolean flag that takes no values.
    *
    * @return Some(true) when the flag was supplied, None otherwise
    * @throws IllegalArgumentException when values were supplied to the flag
    */
  def toggle(name: String, short: Char): Option[Boolean] = {
    if (argsMap.contains(name) || argsMap.contains(short + "")) {
      val key = if (argsMap contains name) name else short + ""
      val values = argsMap(key)
      if (values.nonEmpty)
        throw new IllegalArgumentException(s"Too many values provided for $key")
      else
        Some(true)
    } else None
  }

  // Helper operations on an option's raw value list.
  // FIX: previously declared as `implicit class MyList[String](...)`, where
  // `String` was a *type parameter* shadowing scala.String; the spurious
  // parameter is removed — behaviour is unchanged, intent is now explicit.
  implicit class MyList(list: List[String]) {
    // Returns the single value, rejecting zero or multiple values.
    def fetch(): String = {
      if (list.length > 1)
        throw new IllegalArgumentException(s"Too many values provided")
      else if (list.isEmpty)
        throw new IllegalArgumentException(s"Too less values provided")
      else list.head
    }

    // Returns the non-empty value list, rejecting an empty one.
    def validate(): List[String] = {
      if (list.isEmpty)
        throw new IllegalArgumentException(s"Too less values provided")
      else list
    }
  }

}
91 |
--------------------------------------------------------------------------------
/example/books.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "type" : "record",
3 | "name" : "BooksForm",
4 | "fields" : [ {
5 | "name" : "book",
6 | "type" : {
7 | "type" : "array",
8 | "items" : {
9 | "type" : "record",
10 | "name" : "BookForm",
11 | "fields" : [ {
12 | "name" : "author",
13 | "type" : "string",
14 | "source" : "element author"
15 | }, {
16 | "name" : "title",
17 | "type" : "string",
18 | "source" : "element title"
19 | }, {
20 | "name" : "genre",
21 | "type" : "string",
22 | "source" : "element genre"
23 | }, {
24 | "name" : "price",
25 | "type" : [ "null", {
26 | "type" : "array",
27 | "items" : {
28 | "type" : "record",
29 | "name" : "PriceType",
30 | "fields" : [ {
31 | "name" : "text_value",
32 | "type" : [ "null", "double" ],
33 | "source" : "element text_value"
34 | }, {
35 | "name" : "_currency",
36 | "type" : [ "null", "string" ],
37 | "default" : null,
38 | "source" : "attribute currency"
39 | } ]
40 | }
41 | } ],
42 | "default" : null,
43 | "source" : "element price"
44 | }, {
45 | "name" : "pub_date",
46 | "type" : [ "null", "string" ],
47 | "default" : null,
48 | "source" : "element pub_date"
49 | }, {
50 | "name" : "review",
51 | "type" : [ "null", {
52 | "type" : "array",
53 | "items" : {
54 | "type" : "record",
55 | "name" : "ReviewType",
56 | "fields" : [ {
57 | "name" : "title",
58 | "type" : "string",
59 | "source" : "element title"
60 | }, {
61 | "name" : "content",
62 | "type" : [ "null", "string" ],
63 | "default" : null,
64 | "source" : "element content"
65 | } ]
66 | }
67 | } ],
68 | "default" : null,
69 | "source" : "element review"
70 | }, {
71 | "name" : "type0",
72 | "type" : {
73 | "type" : "array",
74 | "items" : {
75 | "type" : "record",
76 | "name" : "type1",
77 | "fields" : [ {
78 | "name" : "alias",
79 | "type" : {
80 | "type" : "record",
81 | "name" : "AliasType",
82 | "fields" : [ {
83 | "name" : "title",
84 | "type" : "string",
85 | "source" : "element title"
86 | }, {
87 | "name" : "language",
88 | "type" : [ "null", {
89 | "type" : "array",
90 | "items" : "string"
91 | } ],
92 | "default" : null,
93 | "source" : "element language"
94 | } ]
95 | },
96 | "source" : "element alias"
97 | }, {
98 | "name" : "website",
99 | "type" : {
100 | "type" : "record",
101 | "name" : "WebsiteType",
102 | "fields" : [ {
103 | "name" : "url",
104 | "type" : [ "null", {
105 | "type" : "array",
106 | "items" : "string"
107 | } ],
108 | "default" : null,
109 | "source" : "element url"
110 | } ]
111 | },
112 | "source" : "element website"
113 | } ]
114 | }
115 | }
116 | }, {
117 | "name" : "sold",
118 | "type" : [ "null", {
119 | "type" : "array",
120 | "items" : "string"
121 | } ],
122 | "default" : null,
123 | "source" : "element sold"
124 | }, {
125 | "name" : "others",
126 | "type" : {
127 | "type" : "map",
128 | "values" : "string"
129 | }
130 | }, {
131 | "name" : "_id",
132 | "type" : [ "null", "string" ],
133 | "default" : null,
134 | "source" : "attribute id"
135 | } ]
136 | }
137 | },
138 | "source" : "element book"
139 | } ]
140 | }
--------------------------------------------------------------------------------
/src/test/resources/book.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "type": "record",
3 | "name": "BookForm",
4 | "fields": [
5 | {
6 | "name": "id",
7 | "type": [
8 | "null",
9 | "string"
10 | ],
11 | "source": "attribute id"
12 | },
13 | {
14 | "name": "others",
15 | "type": {
16 | "type": "map",
17 | "values": "string"
18 | }
19 | },
20 | {
21 | "name": "author",
22 | "type": "string",
23 | "source": "element author"
24 | },
25 | {
26 | "name": "title",
27 | "type": "string",
28 | "source": "element title"
29 | },
30 | {
31 | "name": "genre",
32 | "type": "string",
33 | "source": "element genre"
34 | },
35 | {
36 | "name": "price",
37 | "type": {
38 | "type": "array",
39 | "items": {
40 | "type": "record",
41 | "name": "PriceType",
42 | "fields": [
43 | {
44 | "name": "currency",
45 | "type": [
46 | "null",
47 | "string"
48 | ],
49 | "source": "attribute currency"
50 | },
51 | {
52 | "name": "text_value",
53 | "type": [
54 | "null",
55 | "double"
56 | ],
57 | "source": "element text_value"
58 | }
59 | ]
60 | }
61 | },
62 | "source": "element price"
63 | },
64 | {
65 | "name": "pub_date",
66 | "type": [
67 | "null",
68 | "long"
69 | ],
70 | "source": "element pub_date",
71 | "comment": "timestamp"
72 | },
73 | {
74 | "name": "type0",
75 | "type": {
76 | "type": "array",
77 | "items": {
78 | "type": "record",
79 | "name": "type1",
80 | "fields": [
81 | {
82 | "name": "review",
83 | "type": [
84 | "null",
85 | {
86 | "type": "record",
87 | "name": "ReviewType",
88 | "fields": [
89 | {
90 | "name": "title",
91 | "type": "string",
92 | "source": "element title"
93 | },
94 | {
95 | "name": "content",
96 | "type": [
97 | "null",
98 | "string"
99 | ],
100 | "source": "element content"
101 | }
102 | ]
103 | }
104 | ],
105 | "source": "element review"
106 | },
107 | {
108 | "name": "alias",
109 | "type": [
110 | "null",
111 | {
112 | "type": "record",
113 | "name": "AliasType",
114 | "fields": [
115 | {
116 | "name": "title",
117 | "type": "string",
118 | "source": "element title"
119 | },
120 | {
121 | "name": "language",
122 | "type": {
123 | "type": "array",
124 | "items": "string"
125 | },
126 | "source": "element language"
127 | }
128 | ]
129 | }
130 | ],
131 | "source": "element alias"
132 | },
133 | {
134 | "name": "website",
135 | "type": [
136 | "null",
137 | {
138 | "type": "record",
139 | "name": "WebsiteType",
140 | "fields": [
141 | {
142 | "name": "url",
143 | "type": {
144 | "type": "array",
145 | "items": "string"
146 | },
147 | "source": "element url"
148 | }
149 | ]
150 | }
151 | ],
152 | "source": "element website"
153 | },
154 | {
155 | "name": "sold",
156 | "type": [
157 | "null",
158 | "string"
159 | ],
160 | "source": "element sold"
161 | }
162 | ]
163 | }
164 | }
165 | }
166 | ]
167 | }
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/config/ConfigParser.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro.config
2 |
3 | import in.dreamlabs.xmlavro.ConversionException
4 | import javax.xml.namespace.QName
5 | import org.yaml.snakeyaml.Yaml
6 | import org.yaml.snakeyaml.constructor.Constructor
7 |
8 | import scala.collection.mutable
9 | import scala.reflect.io.Path
10 |
11 | /**
12 | * Created by Royce on 21/12/2016.
13 | */
class ConfigParser(args: Seq[String]) extends ArgParse(args) {

  // Configuration loaded from the YAML file passed via --config/-c,
  // or a fresh default Config when no file was given.
  val config: Config = {
    val configFile = opt[Path]("config", 'c')
    if (configFile isDefined) {
      fetchConfig(configFile get)
    } else {
      new Config
    }
  }

  processArgs()
  config.validate()

  // Overlays command-line options on top of the (possibly file-based) config.
  private def processArgs(): Unit = {
    val debug = toggle("debug", 'd')
    val baseDir = opt[Path]("baseDir", 'b')
    val stream = toggle("stream", 's')
    // NOTE(review): short 'd' collides with the "debug" toggle above —
    // passing "-d <file>" makes toggle("debug", 'd') throw "Too many values".
    // Confirm the intended short option before changing it.
    val xsd = opt[List[Path]]("toAvsc", 'd')
    val xml = opt[List[Path]]("toAvro", 'x')
    val splitBy = opt[String]("splitBy", 'y')
    val ignoreMissing = toggle("ignoreMissing", 'i')
    val validateSchema = opt[Path]("validateSchema", 'v')
    val ignoreHiveKeywords = toggle("ignoreHiveKeywords", 'h')
    val rootElementQName = opt[QName]("rootElementQName", 'r')

    if (debug isDefined) config.debug = debug.get
    if (baseDir isDefined) config.baseDir = baseDir
    if (xsd isDefined) {
      // Reuse the XSD section from the config file when present, otherwise
      // create a fresh one so CLI-only runs work.
      val tempConfig =
        if (config.XSD isDefined) config.XSD.get
        else {
          val temp = new XSDConfig
          config.XSD = Option(temp)
          temp
        }
      if (ignoreHiveKeywords isDefined) tempConfig.ignoreHiveKeywords = ignoreHiveKeywords.get
      tempConfig.rootElementQName = rootElementQName
      // Positional values: <xsdFile> [<avscFile>]
      val temp = xsd.get
      tempConfig.xsdFile = temp.head
      if (temp.length > 1) tempConfig.avscFile = temp(1)
      if (temp.length > 2)
        throw new IllegalArgumentException(
          "Too many values provided for xsd option")
    }
    if (xml isDefined) {
      val tempConfig =
        if (config.XML isDefined) config.XML.get
        else {
          val temp = new XMLConfig
          config.XML = Option(temp)
          temp
        }
      // Positional values: <avscFile> [<xmlFile> [<avroFile>]], unless
      // streaming mode wires stdin/stdout instead.
      val temp = xml.get
      tempConfig.avscFile = temp.head
      if (stream.isDefined && stream.get) {
        tempConfig.xmlInput = "stdin"
        tempConfig.avroOutput = "stdout"
      } else {
        if (temp.length > 1) tempConfig.xmlFile = temp(1)
        if (temp.length > 2) tempConfig.avroFile = temp(2)
        if (temp.length > 3)
          throw new IllegalArgumentException(
            "Too many values provided for xml option")
      }
      tempConfig.documentRootTag = ""
      if (splitBy isDefined) tempConfig.splitBy = splitBy.get
      if (ignoreMissing isDefined) tempConfig.ignoreMissing = ignoreMissing.get
      if (validateSchema isDefined)
        tempConfig.validationXSD = validateSchema
    }
  }

  /**
    * Reads the YAML config file, expanding `${VAR}` references from the
    * environment, and deserializes the result into a Config.
    *
    * @throws ConversionException when a referenced variable is not set
    */
  private def fetchConfig(configFile: Path): Config = {
    val configReader = configFile.toFile.bufferedReader()
    try {
      val configData = StringBuilder.newBuilder
      var line = configReader.readLine()
      val pattern = "\\$\\{(.+?)\\}".r
      while (line != null) {
        val matches = pattern.findAllMatchIn(line)
        matches.foreach {
          tempMatch =>
            try line = line.replace(tempMatch.matched, sys.env(tempMatch.group(1)))
            catch {
              case _: NoSuchElementException => throw ConversionException(tempMatch.group(1) + " is not found in the environment variables")
            }
        }
        configData append line + "\n"
        line = configReader.readLine()
      }
      val obj = new Yaml(new Constructor(classOf[Config])) load configData.mkString
      obj.asInstanceOf[Config]
    } finally configReader.close() // FIX: the reader was previously never closed
  }
}
108 |
/** Companion object holding the usage strings shown on the command line. */
object ConfigParser {
  // Usage for converting an XSD into an Avro schema (avsc) only
  val USAGE1 =
    "{-d|--debug} {-b|--baseDir } -xsd|--toAvsc {} {-h|--ignoreHiveKeywords} {-r|rootElementQName }"
  // Usage for converting XML data into Avro only
  val USAGE2 =
    "{-b|--baseDir } {-s|--stream|--stdout} -xml|--toAvro {} {} {-sb|--splitby } {-i|--ignoreMissing} {-v|--validateSchema }"
  // Usage for performing both conversions in a single invocation
  val USAGE3 =
    "{-d|--debug} {-b|--baseDir } {-xsd|--toAvsc {} {-h|--ignoreHiveKeywords} {-r|rootElementQName }} {-s|--stream|--stdout} {-xml|--toAvro {} {} {-sb|--splitby }} {-i|--ignoreMissing}"
  // Combined help text covering all three modes
  val USAGE: String =
    s"XSD to AVSC Usage : $USAGE1\nXML to AVRO Usage : $USAGE2\nMixed Usage : $USAGE3"

  /** Builds a parser over the given command-line arguments. */
  def apply(args: Array[String]): ConfigParser = new ConfigParser(args)
}
120 |
--------------------------------------------------------------------------------
/src/test/resources/books.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | Brandon Sanderson
7 | Mistborn
8 | Fantasy
9 | 50
10 | 2006-12-17T09:30:47.0Z
11 |
12 | Wonderful
13 | I love the plot twitst and the new magic
14 |
15 |
16 | Unbeliveable twitst
17 | The best book i ever read
18 |
19 | 10
20 |
21 |
22 | Dan Brown
23 | Angels and Demons
24 | Mystery Thriller
25 | 52
26 | 2002-12-17T09:30:47.0Z
27 |
28 | Good Thriller
29 |
30 |
31 | Fast paced mystery
32 | a good one i would say
33 |
34 |
35 | A2 Angleso and Demenso
36 |
37 | A3 Angleso and Demenso
38 |
39 | www.danbrown.com
40 |
41 |
42 |
43 | Dan Brown
44 | Digital Fortress
45 | Mystery Thriller
46 | 2003-12-17T09:30:47.0Z
47 |
48 | Best SciFi Thriller
49 |
50 |
51 | Best SciFi Thriller2
52 |
53 |
54 | Best SciFi Thriller3
55 |
56 |
57 | Encryto
58 | Italian
59 | French
60 |
61 |
62 | www.danbrown.com
63 |
64 | 23
65 |
66 |
67 | Dan Brown
68 | Digital Fortress
69 | Mystery Thriller
70 | 2003-12-17T09:30:47.0Z
71 |
72 | Best SciFi Thriller
73 |
74 |
75 | Best SciFi Thriller2
76 |
77 |
78 | Best SciFi Thriller3
79 |
80 |
81 | Encryto
82 | Italian
83 | French
84 |
85 |
86 | www.danbrown.com
87 |
88 | 23
89 |
90 |
91 | Dan Brown
92 | Digital Fortress
93 | Mystery Thriller
94 | 2003-12-17T09:30:47.0Z
95 |
96 | Best SciFi Thriller
97 |
98 |
99 | Best SciFi Thriller2
100 |
101 |
102 | Best SciFi Thriller3
103 |
104 |
105 | Encryto
106 | Italian
107 | French
108 |
109 |
110 | www.danbrown.com
111 |
112 | 23
113 |
114 |
115 | Dan Brown
116 | Digital Fortress
117 | Mystery Thriller
118 | 2003-12-17T09:30:47.0Z
119 |
120 | Best SciFi Thriller
121 |
122 |
123 | Best SciFi Thriller2
124 |
125 |
126 | Best SciFi Thriller3
127 |
128 |
129 | Encryto
130 | Italian
131 | French
132 |
133 |
134 | www.danbrown.com
135 |
136 | 23
137 |
138 |
139 |
140 |
--------------------------------------------------------------------------------
/src/test/resources/books.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "type" : "record",
3 | "name" : "type2",
4 | "fields" : [ {
5 | "name" : "books",
6 | "type" : [ "null", {
7 | "type" : "record",
8 | "name" : "BooksForm",
9 | "fields" : [ {
10 | "name" : "book",
11 | "type" : {
12 | "type" : "array",
13 | "items" : {
14 | "type" : "record",
15 | "name" : "BookForm",
16 | "fields" : [ {
17 | "name" : "id",
18 | "type" : [ "null", "string" ],
19 | "source" : "attribute id"
20 | }, {
21 | "name" : "others",
22 | "type" : {
23 | "type" : "map",
24 | "values" : "string"
25 | }
26 | }, {
27 | "name" : "author",
28 | "type" : "string",
29 | "source" : "element author"
30 | }, {
31 | "name" : "title",
32 | "type" : "string",
33 | "source" : "element title"
34 | }, {
35 | "name" : "genre",
36 | "type" : "string",
37 | "source" : "element genre"
38 | }, {
39 | "name" : "price",
40 | "type" : [ "null", {
41 | "type" : "array",
42 | "items" : {
43 | "type" : "record",
44 | "name" : "PriceType",
45 | "fields" : [ {
46 | "name" : "currency",
47 | "type" : [ "null", "string" ],
48 | "source" : "attribute currency"
49 | }, {
50 | "name" : "text_value",
51 | "type" : [ "null", "double" ],
52 | "source" : "element text_value"
53 | } ]
54 | }
55 | } ],
56 | "source" : "element price"
57 | }, {
58 | "name" : "pub_date",
59 | "type" : [ "null", "long" ],
60 | "source" : "element pub_date",
61 | "comment" : "timestamp"
62 | }, {
63 | "name" : "review",
64 | "type" : [ "null", {
65 | "type" : "array",
66 | "items" : {
67 | "type" : "record",
68 | "name" : "ReviewType",
69 | "fields" : [ {
70 | "name" : "title",
71 | "type" : "string",
72 | "source" : "element title"
73 | }, {
74 | "name" : "content",
75 | "type" : [ "null", "string" ],
76 | "source" : "element content"
77 | } ]
78 | }
79 | } ],
80 | "source" : "element review"
81 | }, {
82 | "name" : "type0",
83 | "type" : {
84 | "type" : "array",
85 | "items" : {
86 | "type" : "record",
87 | "name" : "type1",
88 | "fields" : [ {
89 | "name" : "alias",
90 | "type" : {
91 | "type" : "record",
92 | "name" : "AliasType",
93 | "fields" : [ {
94 | "name" : "title",
95 | "type" : "string",
96 | "source" : "element title"
97 | }, {
98 | "name" : "language",
99 | "type" : [ "null", {
100 | "type" : "array",
101 | "items" : "string"
102 | } ],
103 | "source" : "element language"
104 | } ]
105 | },
106 | "source" : "element alias"
107 | }, {
108 | "name" : "website",
109 | "type" : {
110 | "type" : "record",
111 | "name" : "WebsiteType",
112 | "fields" : [ {
113 | "name" : "url",
114 | "type" : [ "null", {
115 | "type" : "array",
116 | "items" : "string"
117 | } ],
118 | "source" : "element url"
119 | } ]
120 | },
121 | "source" : "element website"
122 | } ]
123 | }
124 | }
125 | }, {
126 | "name" : "sold",
127 | "type" : [ "null", {
128 | "type" : "array",
129 | "items" : "string"
130 | } ],
131 | "source" : "element sold"
132 | } ]
133 | }
134 | },
135 | "source" : "element book"
136 | } ]
137 | } ],
138 | "source" : "element {urn:main}:books"
139 | }, {
140 | "name" : "author",
141 | "type" : [ "null", "string" ],
142 | "source" : "element {http://www.books.com/XML}:author"
143 | } ],
144 | "source" : "document"
145 | }
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/XMLEvents.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro
2 |
3 | import in.dreamlabs.xmlavro
4 | import in.dreamlabs.xmlavro.RichAvro._
5 | import in.dreamlabs.xmlavro.Utils._
6 | import org.apache.avro.Schema
7 | import org.apache.avro.Schema.Type._
8 | import org.apache.avro.Schema.{Field, Type}
9 | import org.apache.avro.generic.GenericData.Record
10 |
11 | import scala.collection.mutable.ListBuffer
12 | import scala.util.control.Breaks.{break, breakable}
13 |
14 | /**
15 | * Created by Royce on 13/02/2017.
16 | */
17 | object XMLEvents {
18 | val PRIMITIVES: List[Type] =
19 | List(STRING, INT, LONG, FLOAT, DOUBLE, BOOLEAN, NULL)
20 | val eleStack: ListBuffer[XNode] = ListBuffer[XNode]()
21 | val schemaPath: ListBuffer[AvroPath] = ListBuffer[AvroPath]()
22 | var rootSchema: Schema = _
23 | var rootRecord: Record = _
24 | private var lastSchema = rootSchema
25 |
26 | def setSchema(schema: Schema, record: Record): Unit = {
27 | rootSchema = schema
28 | rootRecord = record
29 | lastSchema = rootSchema
30 | eleStack.clear()
31 | schemaPath.clear()
32 | }
33 |
34 | def addElement(node: XNode): Boolean = {
35 | eleStack.insert(0, node)
36 |
37 | var found = false
38 | if (eleStack.length != 1) {
39 | val (field, path, _) = searchField(lastSchema, node)
40 | if (field isDefined) {
41 | schemaPath ++= path.reverse
42 | updatePath(field.get)
43 | found = true
44 | } else
45 | AvroPath.missing(eleStack)
46 | } else found = true
47 | found
48 | }
49 |
50 | def removeElement(node: XNode): Unit = {
51 | if (node.name != eleStack.head.name)
52 | throw ConversionException(s"No. of closing tags is not matching opening tags when closing ${node.name}, contact the developer")
53 |
54 | eleStack.remove(0)
55 | var count = schemaPath.size
56 | if (count != 0) {
57 | val schemaNodeName = if (SchemaBuilder.HIVE_KEYWORDS.contains(node.name.toUpperCase))
58 | if (schemaPath.last.name != s"${node.name}_value" && Option(lastSchema.getField(s"${node.name}_value")).isEmpty) {
59 | AvroPath.warning(eleStack, s"${node.name} found in the XML is a Hive keyword, " +
60 | s"but the avsc schema is not modified to fix any possible issues, " +
61 | s"please consider updating it to ${node.name}_value or re-create the avsc with latest jar. " +
62 | s"If you updated the avsc make sure you update your table schema as well")
63 | node.name
64 | } else
65 | s"${node.name}_value"
66 | else
67 | node.name
68 |
69 |
70 | if (schemaPath.last.name == schemaNodeName && node.name != eleStack.head.name) { //Complex tag closing
71 | count = destroyLastPath()
72 | while (count != 0 && schemaPath.last.virtual) {
73 | count = destroyLastPath()
74 | }
75 | } else if (schemaPath.last.name.startsWith("type")) {
76 | while (count != 0 && schemaPath.last.virtual) {
77 | count = destroyLastPath()
78 | }
79 | }
80 |
81 | lastSchema = rootRecord.at(schemaPath.toList).getSchema
82 | }
83 | }
84 |
85 | private def destroyLastPath(): Int = {
86 | val tempPath = schemaPath.last
87 | schemaPath -= tempPath
88 | schemaPath size
89 | }
90 |
91 | def searchField(
92 | schema: Schema,
93 | node: XNode): (Option[Field], ListBuffer[AvroPath], Schema) = {
94 | var fieldSchema = schema.simplify
95 | var field = schema.deepSchema.field(node)
96 | val path = ListBuffer[AvroPath]()
97 |
98 | // If field is not a direct child in schema, search through all custom fields
99 | if (field isEmpty)
100 | breakable {
101 | for (typeField <- fieldSchema.customTypeFields()) {
102 | val (resultField, resultPath, resultSchema) =
103 | searchField(typeField.fieldSchema, node)
104 | if (resultField isDefined) {
105 | val (tempPath, tempSchema) = getPath(typeField, virtual = true)
106 | resultPath ++= tempPath
107 | path ++= resultPath
108 | field = resultField
109 | fieldSchema = resultSchema
110 | break
111 | }
112 | }
113 | }
114 | if (field isEmpty)
115 | field = schema.wildcard(node.attribute)
116 | (field, path, fieldSchema)
117 | }
118 |
119 | def getPath(field: Field,
120 | virtual: Boolean = false): (ListBuffer[AvroPath], Schema) = {
121 | val path = ListBuffer[AvroPath]()
122 | val name = field name()
123 | if (field isArray) {
124 | if (field.arrayItemType == RECORD) {
125 | path += AvroPath(name, ARRAY, schemaPath ++ path.reverse, virtual)
126 | return (path, field arraySchema)
127 | } else if (!field.isPrimitiveArray)
128 | warn(s"1 - Unknown type ${field arraySchema} for $name")
129 | } else if (field isRecord)
130 | path += AvroPath(name, RECORD, schemaPath ++ path.reverse, virtual)
131 | else if (!field.isPrimitive && !field.isMap)
132 | throw ConversionException(s"WARNING: 2 - Unknown type ${field.fieldType} for $name")
133 | (path, field fieldSchema)
134 | }
135 |
136 | def updatePath(field: Field, virtual: Boolean = false): Unit = {
137 | val name = field name()
138 | if (field isArray) {
139 | if (field.arrayItemType == RECORD) {
140 | schemaPath += AvroPath(name, ARRAY, schemaPath, virtual)
141 | lastSchema = field.arraySchema
142 | } else if (!field.isPrimitiveArray)
143 | warn(s"1 - Unknown type ${field.arraySchema} for $name")
144 | } else if (field isRecord) {
145 | schemaPath += AvroPath(name, RECORD, schemaPath, virtual)
146 | lastSchema = field.fieldSchema
147 | } else if (!field.isPrimitive && !field.isMap)
148 | throw ConversionException(s"WARNING: 2 - Unknown type ${field.fieldType} for $name")
149 | }
150 | }
151 |
152 |
--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
#!/usr/bin/env sh

##############################################################################
##
##  Gradle start up script for UN*X
##
##############################################################################

# NOTE: this is the standard generated Gradle wrapper script; avoid editing
# by hand beyond DEFAULT_JVM_OPTS — regenerate it via the `wrapper` task.

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
    ls=`ls -ld "$PRG"`
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        PRG="$link"
    else
        PRG=`dirname "$PRG"`"/$link"
    fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null

APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"

warn () {
    echo "$*"
}

die () {
    echo
    echo "$*"
    echo
    exit 1
}

# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
  CYGWIN* )
    cygwin=true
    ;;
  Darwin* )
    darwin=true
    ;;
  MINGW* )
    msys=true
    ;;
  NONSTOP* )
    nonstop=true
    ;;
esac

CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar

# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD="$JAVA_HOME/jre/sh/java"
    else
        JAVACMD="$JAVA_HOME/bin/java"
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    JAVACMD="java"
    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi

# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
    MAX_FD_LIMIT=`ulimit -H -n`
    if [ $? -eq 0 ] ; then
        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
            MAX_FD="$MAX_FD_LIMIT"
        fi
        ulimit -n $MAX_FD
        if [ $? -ne 0 ] ; then
            warn "Could not set maximum file descriptor limit: $MAX_FD"
        fi
    else
        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
    fi
fi

# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi

# For Cygwin, switch paths to Windows format before running java
if $cygwin ; then
    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
    JAVACMD=`cygpath --unix "$JAVACMD"`

    # We build the pattern for arguments to be converted via cygpath
    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
    SEP=""
    for dir in $ROOTDIRSRAW ; do
        ROOTDIRS="$ROOTDIRS$SEP$dir"
        SEP="|"
    done
    OURCYGPATTERN="(^($ROOTDIRS))"
    # Add a user-defined pattern to the cygpath arguments
    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
    fi
    # Now convert the arguments - kludge to limit ourselves to /bin/sh
    i=0
    for arg in "$@" ; do
        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option

        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
        else
            eval `echo args$i`="\"$arg\""
        fi
        i=$((i+1))
    done
    case $i in
        (0) set -- ;;
        (1) set -- "$args0" ;;
        (2) set -- "$args0" "$args1" ;;
        (3) set -- "$args0" "$args1" "$args2" ;;
        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
    esac
fi

# Escape application args
save () {
    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
    echo " "
}
APP_ARGS=$(save "$@")

# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"

# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
  cd "$(dirname "$0")"
fi

exec "$JAVACMD" "$@"
--------------------------------------------------------------------------------
/src/test/resources/old_books.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "type": "record",
3 | "name": "BooksForm",
4 | "fields": [
5 | {
6 | "name": "book",
7 | "type": {
8 | "type": "array",
9 | "items": {
10 | "type": "record",
11 | "name": "BookForm",
12 | "fields": [
13 | {
14 | "name": "id",
15 | "type": [
16 | "null",
17 | "string"
18 | ],
19 | "source": "attribute id"
20 | },
21 | {
22 | "name": "others",
23 | "type": {
24 | "type": "map",
25 | "values": "string"
26 | }
27 | },
28 | {
29 | "name": "author",
30 | "type": "string",
31 | "source": "element author"
32 | },
33 | {
34 | "name": "title",
35 | "type": "string",
36 | "source": "element title"
37 | },
38 | {
39 | "name": "genre",
40 | "type": "string",
41 | "source": "element genre"
42 | },
43 | {
44 | "name": "price",
45 | "type": {
46 | "type": "array",
47 | "items": {
48 | "type": "record",
49 | "name": "PriceType",
50 | "fields": [
51 | {
52 | "name": "currency",
53 | "type": [
54 | "null",
55 | "string"
56 | ],
57 | "source": "attribute currency"
58 | },
59 | {
60 | "name": "text_value",
61 | "type": [
62 | "null",
63 | "double"
64 | ],
65 | "source": "element text_value"
66 | }
67 | ]
68 | }
69 | },
70 | "source": "element price"
71 | },
72 | {
73 | "name": "pub_date",
74 | "type": [
75 | "null",
76 | "long"
77 | ],
78 | "source": "element pub_date",
79 | "comment": "timestamp"
80 | },
81 | {
82 | "name": "type0",
83 | "type": {
84 | "type": "array",
85 | "items": {
86 | "type": "record",
87 | "name": "type1",
88 | "fields": [
89 | {
90 | "name": "review",
91 | "type": [
92 | "null",
93 | {
94 | "type": "record",
95 | "name": "ReviewType",
96 | "fields": [
97 | {
98 | "name": "title",
99 | "type": "string",
100 | "source": "element title"
101 | },
102 | {
103 | "name": "content",
104 | "type": [
105 | "null",
106 | "string"
107 | ],
108 | "source": "element content"
109 | }
110 | ]
111 | }
112 | ],
113 | "source": "element review"
114 | },
115 | {
116 | "name": "alias",
117 | "type": [
118 | "null",
119 | {
120 | "type": "record",
121 | "name": "AliasType",
122 | "fields": [
123 | {
124 | "name": "title",
125 | "type": "string",
126 | "source": "element title"
127 | },
128 | {
129 | "name": "language",
130 | "type": {
131 | "type": "array",
132 | "items": "string"
133 | },
134 | "source": "element language"
135 | }
136 | ]
137 | }
138 | ],
139 | "source": "element alias"
140 | },
141 | {
142 | "name": "website",
143 | "type": [
144 | "null",
145 | {
146 | "type": "record",
147 | "name": "WebsiteType",
148 | "fields": [
149 | {
150 | "name": "url",
151 | "type": {
152 | "type": "array",
153 | "items": "string"
154 | },
155 | "source": "element url"
156 | }
157 | ]
158 | }
159 | ],
160 | "source": "element website"
161 | },
162 | {
163 | "name": "sold",
164 | "type": [
165 | "null",
166 | "string"
167 | ],
168 | "source": "element sold"
169 | }
170 | ]
171 | }
172 | }
173 | }
174 | ]
175 | }
176 | },
177 | "source": "element book"
178 | }
179 | ]
180 | }
181 |
--------------------------------------------------------------------------------
/src/test/resources/new_books.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "type": "record",
3 | "name": "BooksForm",
4 | "fields": [
5 | {
6 | "name": "book",
7 | "type": {
8 | "type": "array",
9 | "items": {
10 | "type": "record",
11 | "name": "BookForm",
12 | "fields": [
13 | {
14 | "name": "id",
15 | "type": [
16 | "null",
17 | "string"
18 | ],
19 | "source": "attribute id"
20 | },
21 | {
22 | "name": "others",
23 | "type": {
24 | "type": "map",
25 | "values": "string"
26 | }
27 | },
28 | {
29 | "name": "author",
30 | "type": "string",
31 | "source": "element author"
32 | },
33 | {
34 | "name": "title",
35 | "type": "string",
36 | "source": "element title"
37 | },
38 | {
39 | "name": "genre",
40 | "type": "string",
41 | "source": "element genre"
42 | },
43 | {
44 | "name": "price",
45 | "type": [
46 | "null",
47 | {
48 | "type": "array",
49 | "items": {
50 | "type": "record",
51 | "name": "PriceType",
52 | "fields": [
53 | {
54 | "name": "currency",
55 | "type": [
56 | "null",
57 | "string"
58 | ],
59 | "source": "attribute currency"
60 | },
61 | {
62 | "name": "text_value",
63 | "type": [
64 | "null",
65 | "double"
66 | ],
67 | "source": "element text_value"
68 | }
69 | ]
70 | }
71 | }
72 | ],
73 | "source": "element price"
74 | },
75 | {
76 | "name": "pub_date",
77 | "type": [
78 | "null",
79 | "long"
80 | ],
81 | "source": "element pub_date",
82 | "comment": "timestamp"
83 | },
84 | {
85 | "name": "review",
86 | "type": [
87 | "null",
88 | {
89 | "type": "array",
90 | "items": {
91 | "type": "record",
92 | "name": "ReviewType",
93 | "fields": [
94 | {
95 | "name": "title",
96 | "type": "string",
97 | "source": "element title"
98 | },
99 | {
100 | "name": "content",
101 | "type": [
102 | "null",
103 | "string"
104 | ],
105 | "source": "element content"
106 | }
107 | ]
108 | }
109 | }
110 | ],
111 | "source": "element review"
112 | },
113 | {
114 | "name": "type0",
115 | "type": {
116 | "type": "array",
117 | "items": {
118 | "type": "record",
119 | "name": "type1",
120 | "fields": [
121 | {
122 | "name": "alias",
123 | "type": {
124 | "type": "record",
125 | "name": "AliasType",
126 | "fields": [
127 | {
128 | "name": "title",
129 | "type": "string",
130 | "source": "element title"
131 | },
132 | {
133 | "name": "language",
134 | "type": [
135 | "null",
136 | {
137 | "type": "array",
138 | "items": "string"
139 | }
140 | ],
141 | "source": "element language"
142 | }
143 | ]
144 | },
145 | "source": "element alias"
146 | },
147 | {
148 | "name": "website",
149 | "type": {
150 | "type": "record",
151 | "name": "WebsiteType",
152 | "fields": [
153 | {
154 | "name": "url",
155 | "type": [
156 | "null",
157 | {
158 | "type": "array",
159 | "items": "string"
160 | }
161 | ],
162 | "source": "element url"
163 | }
164 | ]
165 | },
166 | "source": "element website"
167 | }
168 | ]
169 | }
170 | }
171 | },
172 | {
173 | "name": "sold",
174 | "type": [
175 | "null",
176 | {
177 | "type": "array",
178 | "items": "string"
179 | }
180 | ],
181 | "source": "element sold"
182 | }
183 | ]
184 | }
185 | },
186 | "source": "element book"
187 | }
188 | ]
189 | }
190 |
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/XMLDocument.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro
2 |
3 | import java.io.{IOException, PipedReader, PipedWriter, PrintWriter}
4 | import javax.xml.XMLConstants
5 | import javax.xml.stream.events.XMLEvent
6 | import javax.xml.stream.{XMLEventFactory, XMLEventWriter, XMLOutputFactory}
7 | import javax.xml.transform.stream.StreamSource
8 | import javax.xml.validation.{Schema, SchemaFactory}
9 |
10 | import in.dreamlabs.xmlavro.Utils.{info, log, warn}
11 | import in.dreamlabs.xmlavro.config.XMLConfig
12 | import org.xml.sax.SAXParseException
13 |
14 | import scala.collection.mutable
15 | import scala.reflect.io.{File, Path}
16 |
17 | /**
18 | * Created by Royce on 06/03/2017.
19 | */
20 | class XMLDocument(val id: Int, val uniqueKey: Option[String], config: XMLConfig) {
21 | private val events = mutable.ListBuffer[XMLEvent]()
22 | @volatile var error = false
23 | private var exceptionList: mutable.ListBuffer[Exception] =
24 | mutable.ListBuffer()
25 | private var pipeIn: PipedReader = _
26 | private var pipeOut: PipedWriter = _
27 | private var eventOut: XMLEventWriter = _
28 | private var errorDataFile, errorMetaFile: File = _
29 | private val locker: AnyRef = new AnyRef
30 | val docText = s"document #$id${
31 | if (uniqueKey.isDefined)
32 | s" with Unique ID: \'${uniqueKey.get.toString}\'"
33 | else ""
34 | }"
35 |
36 | info("Processing " + docText)
37 |
38 | if (config.errorFile isDefined) {
39 | val filePath = config.errorFile.get
40 | val fileName = filePath.stripExtension
41 | val fileSuffix = if (uniqueKey isDefined) s"${id}__${uniqueKey.get}" else s"$id"
42 | val parent = filePath.parent
43 | errorDataFile = Path(s"${fileName}__$fileSuffix")
44 | .toAbsoluteWithRoot(parent)
45 | .addExtension("xml")
46 | .toFile
47 | errorMetaFile = Path(s"${fileName}__$fileSuffix")
48 | .toAbsoluteWithRoot(parent)
49 | .addExtension("MD")
50 | .toFile
51 | }
52 |
53 | private var validationThread = if (config.validationXSD isDefined) {
54 | pipeIn = new PipedReader()
55 | pipeOut = new PipedWriter(pipeIn)
56 | eventOut = XMLOutputFactory.newInstance().createXMLEventWriter(pipeOut)
57 | Option(new Thread {
58 | override def run(): Unit = {
59 | val validator = XMLDocument.schema.newValidator()
60 | try validator.validate(new StreamSource(pipeIn))
61 | catch {
62 | case e: SAXParseException =>
63 | val message = s"XSD validation failed - Line: ${e.getLineNumber}, Column: ${e.getColumnNumber}, Message: ${e.getMessage}"
64 | fail(ConversionException(message))
65 | case e: Exception =>
66 | warn("Exception in thread: " + e.getMessage)
67 | fail(e)
68 | } finally {
69 | pipeIn.close()
70 | info(s"Finished xsd validation on " + docText)
71 | }
72 | }
73 | })
74 | } else None
75 |
76 | if (validationThread isDefined) validationThread.get.start()
77 |
78 | def add(event: XMLEvent): Unit = locker.synchronized {
79 | if (config.errorFile isDefined) events += event
80 | if (validationThread.isDefined && !error) eventOut.add(event)
81 | }
82 |
83 | def fail(exception: Exception, wait: Boolean = false): Unit = {
84 | if (wait) {
85 | var thread: Thread = null
86 | validationThread.synchronized {
87 | if (validationThread.isDefined)
88 | thread = validationThread.get
89 | }
90 | if (Option(thread) isDefined)
91 | thread.join(2000)
92 | }
93 | locker.synchronized {
94 | error = true
95 | exceptionList += exception
96 | validationThread.synchronized {
97 | if (validationThread isDefined) validationThread = None
98 | }
99 | }
100 | }
101 |
102 | def close(): Unit = this.synchronized {
103 | if (error) {
104 | val reasons = {
105 | val builder = StringBuilder.newBuilder
106 | exceptionList.foreach(exc =>
107 | builder.append(exc.getMessage).append(", "))
108 | builder.mkString.stripSuffix(", ")
109 | }
110 | log(config.docErrorLevel,
111 | s"Failed processing $docText with reason '$reasons'")
112 | if (config.errorFile.isDefined) {
113 | info(
114 | s"Saving the failed $docText in '$errorDataFile' with message in '$errorMetaFile'")
115 | val dataOut = XMLOutputFactory
116 | .newInstance()
117 | .createXMLEventWriter(errorDataFile.bufferedWriter())
118 | events += XMLEventFactory.newInstance().createSpace("\n")
119 | events.foreach(dataOut.add)
120 | dataOut.flush()
121 | dataOut.close()
122 | val metaOut = new PrintWriter(errorMetaFile.bufferedWriter())
123 | metaOut.write(reasons)
124 | metaOut.flush()
125 | metaOut.close()
126 | }
127 | }
128 |
129 | var thread: Thread = null
130 | validationThread.synchronized {
131 | if (validationThread isDefined) thread = validationThread.get
132 | }
133 | if (Option(thread) isDefined) {
134 | try {
135 | eventOut.flush()
136 | pipeOut.flush()
137 | eventOut.close()
138 | pipeOut.close()
139 | info(s"Waiting for xsd validation of $docText to finish")
140 | thread.join(5000)
141 | if (thread.isAlive) {
142 | warn(
143 | s"Schema validation timed out for $docText, ignoring and proceeding further")
144 | pipeIn.close()
145 | }
146 | } catch {
147 | case e: Exception =>
148 | warn(
149 | s"Failed to close pipes for $docText with message '${e.getMessage}', ignoring and proceeding further")
150 | }
151 | }
152 | info(s"Closed document #$id")
153 | }
154 | }
155 |
/** Factory and run-wide shared state for [[XMLDocument]] instances. */
object XMLDocument {
  // Compiled validation XSD, built lazily on first use
  private var schema: Schema = _
  // Number of documents created so far in this run
  private var count: Int = 0
  var config: XMLConfig = _

  /** Creates a new document with the next sequential id.
    *
    * On the very first call this removes any stale error file left over
    * from a previous run and, when validation is configured, compiles the
    * validation XSD exactly once. */
  def apply(uniqueKey: Option[String]): XMLDocument = {
    if (count == 0 && config.errorFile.isDefined) {
      config.errorFile.get.delete()
    }
    count += 1
    if (Option(schema).isEmpty && config.validationXSD.isDefined)
      schema = SchemaFactory
        .newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
        .newSchema(config.validationXSD.get.jfile)
    new XMLDocument(count, uniqueKey, config)
  }

  /** Writes the processed-document count into the QA directory (if one is
    * configured), creating the directory when necessary. */
  def closeAll(): Unit = {
    if (config.qaDir.isDefined) {
      val qaDir = config.qaDir.get
      if (!qaDir.exists)
        qaDir.jfile.mkdir()
      try {
        val docCountOut = Path("DOCUMENT_COUNT")
          .toAbsoluteWithRoot(qaDir)
          .toFile
          .bufferedWriter()
        // Close in finally so a failed write cannot leak the file handle
        try docCountOut.write(count + "")
        finally docCountOut.close()
      } catch {
        case e: IOException =>
          warn("Problem occurred while writing DOCUMENT_COUNT to QA DIR :" + e.getMessage)
      }
    }
  }
}
192 |
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/RichAvro.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro
2 |
3 | import java.util
4 |
5 | import in.dreamlabs.xmlavro.RichAvro.{caseSensitive, ignoreCaseFor}
6 | import org.apache.avro.Schema
7 | import org.apache.avro.Schema.Type._
8 | import org.apache.avro.Schema.{Field, Type}
9 | import org.apache.avro.generic.GenericData
10 | import org.apache.avro.generic.GenericData.Record
11 |
12 | import scala.collection.JavaConverters._
13 | import scala.collection.mutable
14 | import scala.util.control.Breaks._
15 |
16 | /**
17 | * Created by Royce on 26/01/2017.
18 | */
/**
  * Enrichment classes used while converting XML events into Avro records:
  * navigation/creation of nested records ([[RichRecord]]), source-aware field
  * lookup on schemas ([[RichSchema]]) and convenience accessors on fields
  * ([[RichField]]). Matching behaviour is controlled by the flags on the
  * companion object (caseSensitive, ignoreCaseFor, ...).
  */
trait RichAvro {

  implicit class RichRecord(record: Record) {
    /**
      * Walks `path` down from this record, creating any missing intermediate
      * records (and array elements) on the way, and returns the record at the
      * end of the path. For ARRAY steps, a new element is appended when the
      * requested index does not exist yet.
      */
    def at(path: List[AvroPath]): Record = {
      var resultRecord = record
      path.foreach { path =>
        if (path.pathType == ARRAY) {
          var array =
            resultRecord.get(path name).asInstanceOf[util.List[AnyRef]]
          // Append a fresh element record when the array is absent or too short
          if (array == null || array.size() - 1 < path.index) {
            val arraySchema =
              resultRecord.getSchema.getField(path name).arraySchema
            if (array == null) {
              array = new util.ArrayList[AnyRef]()
              resultRecord.put(path name, array)
            }
            resultRecord = arraySchema.newRecord
            array.add(resultRecord)
          } else
            resultRecord = array.get(path index).asInstanceOf[Record]
        } else {
          // Non-array step: fetch (or lazily create) the nested record field
          val tempSchema =
            resultRecord.getSchema.getField(path name).fieldSchema
          var tempRecord = resultRecord.get(path name).asInstanceOf[Record]
          if (tempRecord == null) {
            tempRecord = tempSchema.newRecord
            resultRecord.put(path name, tempRecord)
          }
          resultRecord = tempRecord
        }
      }
      resultRecord
    }

    /**
      * Stores the text `value` of the XML `node` into the matching field of
      * this record. Falls back to the wildcard map field when no declared
      * field matches; reports (or fails on) nodes missing from the schema per
      * RichAvro.ignoreMissing. STRING fields accumulate by concatenation,
      * array fields append, other primitives are converted via AvroUtils.
      */
    def add(node: XNode, value: String): Unit = {
      val schema = record.getSchema
      var fieldOp = schema field node
      var wildcard = false
      //TODO Handle wildcard data properly

      if (fieldOp isEmpty) {
        fieldOp = schema wildcard (node attribute)
        if (fieldOp isDefined) wildcard = true
        // Whitespace-only text and xsi:nil markers are not worth reporting
        else if (value.trim() != "" && node.name!="nil")
          AvroPath.missing(XMLEvents.eleStack, node)
      }
      if (fieldOp isDefined) {
        val field = fieldOp.get

        if (wildcard) {
          // Wildcard field is a map; repeated keys are collected into a list
          val wildField =
            record.get(field name).asInstanceOf[util.Map[String, AnyRef]]
          val existingVal = wildField.get(node name)
          if (Option(existingVal) isEmpty)
            wildField.put(node name, value)
          else {
            existingVal match {
              case existingList: util.ArrayList[AnyRef] =>
                existingList.add(value)
              case _ =>
                val list = new util.ArrayList[AnyRef]()
                list.add(existingVal)
                list.add(value)
                wildField.put(node name, list)
            }

          }

        } else {
          if (field isArray) {
            var array = record.get(field name).asInstanceOf[util.List[AnyRef]]
            if (array == null) {
              array = new util.ArrayList[AnyRef]()
              record.put(field name, array)
            }
            array.add(AvroUtils.createValue(field arrayItemType, value))
          } else if (field.fieldType == STRING) {
            // Text may arrive in multiple chunks; append to any existing value
            val currentValue = record.get(field name)
            if (currentValue != null) record.put(field name, s"$currentValue$value")
            else record put(field name, value)
          } else
            record.put(field name,
                       AvroUtils.createValue(field fieldType, value))
        }
      }
    }
  }

  implicit class RichSchema(schema: Schema) {
    val PRIMITIVES: List[Type] =
      List(STRING, INT, LONG, FLOAT, DOUBLE, BOOLEAN, NULL)

    // NOTE(review): the `attribute` parameter is currently unused — the same
    // wildcard field ("others") is returned for elements and attributes; confirm intent.
    def wildcard(attribute: Boolean): Option[Field] =
      Option(schema.simplify.getField(XNode.WILDCARD))

    /**
      * Finds the field whose "source" property matches `node`, honouring the
      * configured case sensitivity. Stops at the wildcard field (fields after
      * it are not considered), skips synthetic "typeN" grouping fields, and
      * finally falls back to the text_value field if nothing matched.
      */
    def field(node: XNode): Option[Field] = {
      var resultField: Option[Field] = None
      val tempSchema = schema.simplify
      breakable {
        tempSchema.getFields.forEach { field =>
          val sourceField = field.getProp(XNode.SOURCE)
          if (Option(sourceField).isEmpty && field.name == XNode.WILDCARD)
            break
          else if (Option(sourceField).isEmpty && field.name.matches("type\\d+")){
            // Do nothing
          }
          else if (node sourceMatches(sourceField, caseSensitive, ignoreCaseFor)) {
            resultField = Some(field)
            break
          }
        }
      }
      if (resultField isEmpty)
        resultField = Option(tempSchema.getField(XNode.TEXT_VALUE))
      resultField
    }

    // Unwraps a union to its payload type.
    // NOTE(review): assumes unions are ["null", T] as produced by the XSD
    // converter — index 1 is taken unconditionally; confirm for hand-written schemas.
    def simplify: Schema =
      if (schema.getType == UNION) schema.getTypes.get(1) else schema

    // Synthetic "typeN" fields generated for anonymous XSD groups.
    def customTypeFields(): mutable.Buffer[Field] =
      schema.simplify.getFields.asScala.filter(_.name.matches("type\\d+"))

    // Like simplify, but also looks through arrays to their (simplified) item type.
    def deepSchema: Schema = schema getType match {
      case UNION => schema.getTypes.get(1)
      case ARRAY =>
        val itemType = schema getElementType()
        if (itemType.getType == UNION)
          itemType.getTypes.get(1)
        else
          itemType
      case _ => schema
    }

    def isArray: Boolean = schema.getType == ARRAY

    def isRecord: Boolean = schema.getType == RECORD

    def isMap: Boolean = schema.getType == MAP

    def isPrimitive: Boolean = PRIMITIVES.contains(schemaType)

    def schemaType: Type = schema.getType

    def arraySchema: Schema = schema.getElementType

    def isPrimitiveArray: Boolean = PRIMITIVES contains arrayItemType

    def arrayItemType: Type = schema.getElementType.getType

    /**
      * Creates a GenericData.Record for this schema with array fields
      * pre-initialised to empty lists and the wildcard field to an empty map,
      * so add()/at() never see nulls for those.
      */
    def newRecord: Record = {
      val record = new GenericData.Record(schema)
      for (field <- record.getSchema.getFields.asScala) {
        if (field isArray)
          record.put(field.name, new util.ArrayList[AnyRef]())
        if (field.name == XNode.WILDCARD)
          record.put(field.name, new util.HashMap[String, AnyRef]())
      }
      record
    }

  }

  implicit class RichField(field: Field) {

    def fieldType: Type = fieldSchema.getType

    def isArray: Boolean = fieldSchema.isArray

    // Field's schema with any ["null", T] union unwrapped to T.
    def fieldSchema: Schema = field.schema().simplify

    def isRecord: Boolean = fieldSchema.isRecord

    def isMap: Boolean = fieldSchema.isMap

    def isPrimitive: Boolean = fieldSchema.isPrimitive

    // The wildcard field is the map named "others" that has no "source" property.
    def isWildcard: Boolean =
      if (field.name() == XNode.WILDCARD && field.isMap && Option(field.getProp(XNode.SOURCE)).isEmpty) true else false

    def arraySchema: Schema = fieldSchema.arraySchema

    def arrayItemType: Type = fieldSchema.arrayItemType

    def isPrimitiveArray: Boolean = fieldSchema.isPrimitiveArray

  }

}
208 |
/** Conversion-wide matching flags, assigned from the YAML config before conversion starts. */
object RichAvro extends RichAvro {
  // When true, XML nodes absent from the schema are reported/skipped instead of failing.
  var ignoreMissing = false
  // When true, XML names must match avsc "source" values exactly, case included.
  var caseSensitive = true
  // Source tags exempt from case-sensitive matching — presumably lower-cased by
  // the config loader, since XNode.sourceMatches compares lower-cased values; verify.
  var ignoreCaseFor: List[String] = _
  // When true, missing-field/conversion warnings are not logged.
  var suppressWarnings = false
}
215 |
--------------------------------------------------------------------------------
/src/test/resources/new_books2.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "type": "record",
3 | "name": "BooksForm",
4 | "fields": [
5 | {
6 | "name": "book",
7 | "type": {
8 | "type": "array",
9 | "items": {
10 | "type": "record",
11 | "name": "BookForm",
12 | "fields": [
13 | {
14 | "name": "id",
15 | "type": [
16 | "null",
17 | "string"
18 | ],
19 | "source": "attribute id"
20 | },
21 | {
22 | "name": "others",
23 | "type": {
24 | "type": "map",
25 | "values": "string"
26 | }
27 | },
28 | {
29 | "name": "author",
30 | "type": "string",
31 | "source": "element author"
32 | },
33 | {
34 | "name": "title",
35 | "type": "string",
36 | "source": "element title"
37 | },
38 | {
39 | "name": "genre",
40 | "type": "string",
41 | "source": "element genre"
42 | },
43 | {
44 | "name": "price",
45 | "type": [
46 | "null",
47 | {
48 | "type": "array",
49 | "items": {
50 | "type": "record",
51 | "name": "PriceType",
52 | "fields": [
53 | {
54 | "name": "currency",
55 | "type": [
56 | "null",
57 | "string"
58 | ],
59 | "source": "attribute currency"
60 | },
61 | {
62 | "name": "text_value",
63 | "type": [
64 | "null",
65 | "double"
66 | ],
67 | "source": "element text_value"
68 | }
69 | ]
70 | }
71 | }
72 | ],
73 | "source": "element price"
74 | },
75 | {
76 | "name": "pub_date",
77 | "type": [
78 | "null",
79 | "long"
80 | ],
81 | "source": "element pub_date",
82 | "comment": "timestamp"
83 | },
84 | {
85 | "name": "review",
86 | "type": [
87 | "null",
88 | {
89 | "type": "array",
90 | "items": {
91 | "type": "record",
92 | "name": "ReviewType",
93 | "fields": [
94 | {
95 | "name": "title",
96 | "type": "string",
97 | "source": "element title"
98 | },
99 | {
100 | "name": "content",
101 | "type": [
102 | "null",
103 | "string"
104 | ],
105 | "source": "element content"
106 | }
107 | ]
108 | }
109 | }
110 | ],
111 | "source": "element review"
112 | },
113 | {
114 | "name": "type0",
115 | "type": {
116 | "type": "array",
117 | "items": {
118 | "type": "record",
119 | "name": "type1",
120 | "fields": [
121 | {
122 | "name": "alias",
123 | "type": {
124 | "type": "record",
125 | "name": "AliasType",
126 | "fields": [
127 | {
128 | "name": "title",
129 | "type": "string",
130 | "source": "element title"
131 | },
132 | {
133 | "name": "language",
134 | "type": [
135 | "null",
136 | {
137 | "type": "array",
138 | "items": "string"
139 | }
140 | ],
141 | "source": "element language"
142 | }
143 | ]
144 | },
145 | "source": "element alias"
146 | },
147 | {
148 | "name": "website",
149 | "type": {
150 | "type": "record",
151 | "name": "WebsiteType",
152 | "fields": [
153 | {
154 | "name": "url",
155 | "type": [
156 | "null",
157 | {
158 | "type": "array",
159 | "items": "string"
160 | }
161 | ],
162 | "source": "element url"
163 | }
164 | ]
165 | },
166 | "source": "element website"
167 | }
168 | ]
169 | }
170 | }
171 | },
172 | {
173 | "name": "type2",
174 | "type": {
175 | "type": "array",
176 | "items": {
177 | "type": "record",
178 | "name": "type3",
179 | "fields": [
180 | {
181 | "name": "type4",
182 | "type": {
183 | "type": "array",
184 | "items": {
185 | "type": "record",
186 | "name": "type5",
187 | "fields": [
188 | {
189 | "name": "alias2",
190 | "type": "AliasType",
191 | "source": "element alias2"
192 | },
193 | {
194 | "name": "website2",
195 | "type": "WebsiteType",
196 | "source": "element website2"
197 | }
198 | ]
199 | }
200 | }
201 | }
202 | ]
203 | }
204 | }
205 | },
206 | {
207 | "name": "type6",
208 | "type": {
209 | "type": "record",
210 | "name": "type7",
211 | "fields": [
212 | {
213 | "name": "alias3",
214 | "type": "string",
215 | "source": "element alias3"
216 | }
217 | ]
218 | }
219 | },
220 | {
221 | "name": "sold",
222 | "type": [
223 | "null",
224 | {
225 | "type": "array",
226 | "items": "string"
227 | }
228 | ],
229 | "source": "element sold"
230 | }
231 | ]
232 | }
233 | },
234 | "source": "element book"
235 | }
236 | ]
237 | }
238 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # XSD => Avsc & XML => Avro
2 | No longer maintained actively
3 |
4 | ****
5 | This project was initially a fork of [xml-avro-elodina](https://github.com/elodina/xml-avro).
6 | Later evolved to separate project with lotsss of bug fixes, memory & performance improvements, options, re-coded in Scala
7 | ****
8 | - Converts any XSD to a proper usable Avro schema (Avsc)
9 | - Converts any XML to avro using the provided schema. What can it do? See the list below.
10 | - Handle any large size XML (even in GigaBytes), as it streams the xml
11 | - Read xml from stdin and output to stdout
12 | - Validate the XML with XSD
- Split the data at any specified element (can have any number of splits)
14 | - Handle multiple documents in single file (useful when streaming continuous data)
15 | - Write out failed documents without killing the whole process
16 | - Completely configurable
17 |
18 | ### Running Project
19 | 1. `git clone` to clone the repository to local
20 | 2. `gradle build` to generate the jar file
3. `java -jar ./build/libs/xml-avro-all-<version>.jar -c <config.yml>` to run the code (options as below)
22 |
23 | Check `./example/config.yml` for sample configuration file
24 |
25 | ### Config File
26 | Create yml config file as per the below format
27 | ```
28 | debug: false # Enable printing of debug messages
29 | baseDir: "files" # Base directory where most files are relative to
30 | namespaces: true # Enable/Disable usage of namespaces in schema/conversion - Optional (default: true)
31 |
32 | XML: # Convert XML
33 | xmlInput: stdin # Source of XML [ stdin | "somefile.xml" ]
34 | avscFile: "books.avsc" # Avsc file to use for conversion - (If not using splits)
35 | avroOutput: stdout # Traget location [ stdout | "somefile.avro" ] - Optional (Uses the xmlInput to assume the output) (If not using splits)
36 | documentRootTag: "books" # Root tag of the XML (without namespace)
37 | validationXSD: "books.xsd" # Enable validation with specified xsd
38 | ignoreMissing: true # Incase you use a smaller version of avsc (to take only required tags),
39 | # tags in the xml may not exist in the trimmed avsc..
40 | # This option enables to ignore the missing tags instead of failing
41 | suppressWarnings: true # In case of a lot of missing fields don't print them as warnings
42 | split: # Split the avro records based on specifed list
43 | -
44 | by: "bookName" # Split tag name
45 | avscFile: "name.avsc" # Avsc File for the split part
46 | avroFile: "name.avro" # Avro file name to save to
47 | -
48 | by: "bookPublisher"
49 | avscFile: "publisher.avsc"
50 | avroFile: "publisher.avro"
51 | qaDir: "some path" # Writes some count details
52 | caseSensitive: true # Tags matching xml & avsc are case sensitive - Optional (default: true)
53 | ignoreCaseFor: # Ignore case senitivity for the below list
54 | - "SomeTag"
55 | docErrorLevel: "WARNING" # Use this level to log in case of error in a document
56 | errorFile: "failures.xml" # Writes the failed documents to this file
57 | useAvroInput: true # Read xml data from inside an avro file
58 | inputAvroMappings: # Set of mappings from source field name to target, use "xmlInput" as target to mark it as the xml data, use "unique_id" as target to mark the value as unique key
59 | "headers" : "avroHeader"
60 | "body" : "xmlInput"
61 | "headers.unique_id" : "unique_id"
62 | XSD:
63 | xsdFile: "somefile.xsd" # Source of XSD
64 | avscFile: "books.avsc" # Avsc file to save as - Optional (Uses the xsdFile to assume the output)
65 | stringTimestamp: true # Represent timestamp as string instead of long. Defaults to false. Setting this value to "true" overrides XSD.logicalTypes.xsDateTime to "string".
66 | attributePrefix: "_" # Optional, will assign the specified prefix for attributes in the avsc schema
67 |
68 | ignoreHiveKeywords: true # Do not suffix field name with `_value` when matching Hive keywords. Default value is false.
69 | rootElementQName: "{ns}name" # Only generate schema for root element matching this QName
70 | logicalTypes:
71 | xsDateTime: "long" # Configures the Avro mapping of xs:dateTime XML types. [ long | string | timestamp-micros | timestamp-millis ]
72 | # "long" (the default) maps xs:dateTime types to regular Avro "long". Same as the default mapping for xs:dateTime in older xml-avro versions.
73 | # "string" maps xs:dateTime types to Avro "string"
74 | # "timestamp-micros" maps xs:dateTime types to Avro "timestamp-micros" logical type annotating a "long".
75 | # "timestamp-millis" maps xs:dateTime types to Avro "timestamp-millis" logical type annotating a "long".
76 | # Note: Setting the stringTimestamp will override this config value to "string" for backward compatibility reasons.
77 | xsDate: "string" # Configures the Avro mapping of xs:date XML types. [ string | date ].
78 | # "string" (the default) maps xs:date types to Avro "string"
79 | # "date" maps xs:date types to Avro "date" logical type annotating an "int".
80 | xsTime: "string" # Configures the Avro mapping of xs:time XML types. [ string | time-micros | time-millis ]
81 | # "string" (the default) maps xs:time types to Avro "string".
82 | # "time-micros" maps xs:time types to Avro "time-micros" logical type annotating a "long".
83 | # "time-millis" maps xs:time types to Avro "time-millis" logical type annotating a "long".
84 | #
85 | xsDecimal: # Configurations controlling the mapping of xs:decimal XML types
86 | #
87 | avroType: "decimal" # Configures the Avro type mapping of xs:Decimal derived xml types.
88 | # Possible values are: [ double | string | decimal ]
89 | # - "double" (the default) maps xs:decimal types to Avro "double".
90 | # - "string" maps xs:decimal types to Avro "string".
91 | # - "decimal" maps xs:decimal types to Avro "decimal" logical types annotating "bytes".
92 | # When using the "decimal" option, the mandatory precision and scale properties of the Avro
93 | # decimal type are picked up from any xs:totalDigits and xs:fractionDigits restriction facets, if any.
94 | # In the absense of these restriction facets, the mapping will instead fall back to using a backup strategy defined
95 | # by a combination of the fallbackType, fallbackPrecision and fallbackScale configurations.
96 | #
97 | fallbackType: "string" # Configures a fallback type mapping for xs:decimal types with unrestricted precision and scale. (i.e. types without
98 | # declared xs:totalDigits and xs:fractionDigits restriction facets). This configuration is ignored, unless the
99 | # avroType setting is configured to "decimal".
100 | # The possible values are: [ string | double | decimal ]
101 | # All options are identical to those described under the avroType configuration, the only exception being
102 | # "decimal" that uses the fallbackPrecision and fallbackScale configurations as a defaults for missing
103 | # precision and scale information.
104 | #
105 | fallbackPrecision: 5 # Configures the fallback precision for decimal types without declared xs:totalDigits
106 | # and restriction. Required when fallbackType is set to "decimal".
107 | #
108 | fallbackScale: 3 # Configures the fallback scale for decimal types without declared xs:fractionDigits restriction.
109 | # Required when fallbackType is set to "decimal".
110 | ```
111 |
112 | ## Docker
113 |
114 | ### Build docker image
115 |
116 |
117 | ```sh
118 | docker build -t xml-avro:v1.8.2 --build-arg VERSION=1.8.2 .
119 | ```
120 |
121 | ### Run with docker
122 |
123 | ```sh
124 | docker run --rm -v $(pwd)/example:/app/example -v $(pwd)/example/config.yml:/app/config.yml xml-avro:latest
125 | ```
126 |
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/Supporters.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro
2 |
3 | import java.util.{Calendar, TimeZone}
4 | import javax.xml.bind.DatatypeConverter
5 |
6 | import in.dreamlabs.xmlavro.RichAvro.{ignoreMissing, suppressWarnings}
7 | import in.dreamlabs.xmlavro.Utils._
8 | import org.apache.avro.Schema.Type
9 | import AvroPath.countsMap
10 | import org.apache.avro.Schema.Type._
11 | import org.apache.xerces.xni.XNIException
12 | import org.apache.xerces.xni.parser.{XMLErrorHandler, XMLParseException}
13 | import org.apache.xerces.xs.XSObject
14 | import org.w3c.dom.{DOMError, DOMErrorHandler}
15 | import org.xml.sax.{ErrorHandler, SAXParseException}
16 |
17 | import scala.collection.mutable
18 | import scala.collection.mutable.ListBuffer
19 |
20 | /**
21 | * Created by Royce on 20/01/2017.
22 | */
/**
  * Failure raised during XSD-to-avsc or XML-to-Avro conversion; carries either
  * a message, an underlying cause, or both.
  */
case class ConversionException(message: String = null, cause: Throwable = null)
    extends RuntimeException(message, cause) {
  // Auxiliary constructor: wrap a cause without a message.
  def this(cause: Throwable) = this(null, cause)
}
27 |
/**
  * Error handler for XSD parsing that remembers only the FIRST problem seen
  * (from either the Xerces XNI pipeline or the DOM error channel) and rethrows
  * it as a ConversionException when check() is called.
  */
class XSDErrorHandler extends XMLErrorHandler with DOMErrorHandler {
  private var exception: Option[XMLParseException] = None
  private var error: Option[DOMError] = None

  // Keep only the first reported parser problem; subsequent ones are ignored.
  private def record(reported: XMLParseException): Unit =
    if (exception.isEmpty) exception = Option(reported)

  @throws[XNIException]
  def warning(domain: String, key: String, exception: XMLParseException): Unit =
    record(exception)

  @throws[XNIException]
  def error(domain: String, key: String, exception: XMLParseException): Unit =
    record(exception)

  @throws[XNIException]
  def fatalError(domain: String, key: String, exception: XMLParseException): Unit =
    record(exception)

  def handleError(error: DOMError): Boolean = {
    if (this.error.isEmpty) this.error = Option(error)
    false // tell the DOM implementation not to continue processing
  }

  /** Throws a ConversionException if any problem was recorded; no-op otherwise. */
  def check(): Unit = {
    exception.foreach(e => throw new ConversionException(e))
    error.foreach { err =>
      err.getRelatedException match {
        case cause: Throwable => throw new ConversionException(cause)
        case _ => // no related exception attached (null): fall through to location message
      }
      val locator = err.getLocation
      val location = "at:" + locator.getUri + ", line:" + locator.getLineNumber + ", char:" + locator.getColumnNumber
      throw ConversionException(location + " " + err.getMessage)
    }
  }
}
66 |
/**
  * SAX error handler that forwards every validation problem — warning, error
  * or fatal — to the owning XMLDocument, which records the failure.
  */
class ValidationErrorHandler(var xml: XMLDocument) extends ErrorHandler {

  def warning(exception: SAXParseException): Unit = report(exception)

  def error(exception: SAXParseException): Unit = report(exception)

  def fatalError(exception: SAXParseException): Unit = report(exception)

  // All severities are treated alike: the document decides how to proceed.
  private def report(exception: SAXParseException): Unit = xml.fail(exception)
}
82 |
/**
  * A single XML node (element or attribute) with its namespace information,
  * used to match XML events against avsc "source" properties.
  */
case class XNode(name: String,
                 nsURI: String,
                 nsName: String,
                 attribute: Boolean) {
  // Namespace inherited from the parent element when this node has none of its own.
  var parentNS: String = _
  val element: Boolean = !attribute

  /**
    * Whether this node matches the given avsc "source" value (e.g. "element ns:name").
    * Case-insensitive when caseSensitive is false, or when the source tag is in
    * the ignore list.
    * NOTE(review): ignoreList is compared against the full lower-cased source
    * string ("element x"), so config entries presumably need that same form —
    * confirm against the config loader.
    */
  def sourceMatches(sourceTag: String,
                    caseSensitive: Boolean,
                    ignoreList: List[String]): Boolean = {
    val matches =
      if (caseSensitive)
        if (ignoreList contains sourceTag.toLowerCase)
          source.equalsIgnoreCase(sourceTag) || parentNSSource
            .equalsIgnoreCase(sourceTag)
        else
          source == sourceTag || parentNSSource == sourceTag
      else
        source.equalsIgnoreCase(sourceTag) || parentNSSource.equalsIgnoreCase(
          sourceTag)
    matches
  }

  // Source string using this node's own namespace, e.g. "element ns:name".
  def source: String =
    (if (attribute) "attribute" else "element") + s" ${fullName()}"

  // Source string using the namespace inherited from the parent.
  def parentNSSource: String =
    (if (attribute) "attribute" else "element") + s" ${fullName(other = true)}"

  // "ns:name" using either the inherited (other=true) or own namespace URI.
  def fullName(other: Boolean = false): String =
    if (other)
      s"${if (option(parentNS) isDefined) parentNS + ":" else ""}$name"
    else
      s"${if (option(nsURI) isDefined) nsURI + ":" else ""}$name"

  override def toString: String =
    s"${if (option(nsName) isDefined) nsName + ":" else ""}$name"
}
121 |
/** Well-known avsc property/field names and factory helpers for [[XNode]]. */
object XNode {
  val SOURCE = "source"
  val DOCUMENT = "document"
  val WILDCARD = "others"
  val TEXT_VALUE = "text_value"
  // Whether namespaces participate in matching; set from the config.
  var namespaces = true

  /** Builds a node straight from an XSD schema object. */
  def apply(ele: XSObject, attribute: Boolean = false): XNode =
    new XNode(ele.getName, ele.getNamespace, null, attribute)

  /** Builds a child node, inheriting the parent's namespace when it declares none. */
  def apply(parentNode: XNode,
            name: String,
            nsURI: String,
            nsName: String,
            attribute: Boolean): XNode = {
    val node = new XNode(name, nsURI, nsName, attribute)
    if (option(nsURI).isEmpty)
      node.parentNS =
        if (option(parentNode.nsURI).isDefined) parentNode.nsURI
        else parentNode.parentNS
    node
  }

  /** Synthetic node representing an element's text content. */
  def textNode: XNode = new XNode(TEXT_VALUE, null, null, attribute = false)

  /** Synthetic node for content captured by the wildcard ("others") field. */
  def wildNode(attribute: Boolean): XNode =
    new XNode(WILDCARD, null, null, attribute)
}
149 |
/**
  * One step in the path from the document root to the record currently being
  * filled. Creating an ARRAY-typed path bumps a shared per-path occurrence
  * counter (AvroPath.countsMap), which becomes this step's array index.
  */
class AvroPath(val name: String,
               val pathType: Type,
               currentPath: ListBuffer[AvroPath],
               val virtual: Boolean = false) {

  // Key identifying this path in countsMap: this name followed by the string
  // form of the path it was created under.
  private val innerName = {
    val builder = StringBuilder.newBuilder
    builder append s"$name"
    currentPath.foreach(path =>
      builder append path.toString)
    builder.mkString
  }

  // Zero-based occurrence index: constructed as a side effect — increments the
  // shared counter, or starts it at 0 on the first occurrence of this path.
  val index: Int =
    if (countsMap contains innerName) {
      var currentIndex = countsMap(innerName)
      currentIndex += 1
      countsMap += (innerName -> currentIndex)
      currentIndex
    } else {
      countsMap += (innerName -> 0)
      0
    }

  // Rolls the shared counter back by one (invoked when this path step closes).
  def destroy(): Unit = {
    var currentIndex = countsMap(innerName)
    currentIndex -= 1
    countsMap += (innerName -> currentIndex)
  }

  // Array steps render with their index, e.g. "book[2]"; other steps as the bare name.
  override def toString: String =
    if (pathType == ARRAY) s"$name[$index]" else name
}
183 |
object AvroPath {
  // Shared per-path occurrence counters, keyed by AvroPath.innerName.
  val countsMap: mutable.Map[String, Int] = mutable.Map[String, Int]()
  // Messages already reported, to avoid duplicate warnings across a run.
  val warnedNodes: ListBuffer[String] = ListBuffer[String]()

  def apply(name: String,
            pathType: Type,
            currentPath: ListBuffer[AvroPath],
            virtual: Boolean = false) =
    new AvroPath(name, pathType, currentPath, virtual)

  /** Clears the array-index counters (called between documents). */
  def reset(): Unit = countsMap.clear()

  /**
    * Reports an XML node that has no matching field in the Avro schema.
    * Warns once per distinct path when ignoreMissing is set (unless warnings
    * are suppressed); throws a ConversionException otherwise.
    */
  def missing(eleStack: ListBuffer[XNode], node: XNode = null): Unit = {

    val builder = StringBuilder.newBuilder
    var missingStack = eleStack
    var missingNode = node
    if (Option(node) isEmpty) {
      // No explicit node given: the head of the element stack is the culprit
      missingStack = eleStack.tail
      missingNode = eleStack.head
    }

    val ignoreList = List("noNamespaceSchemaLocation")
    if (!ignoreList.contains(missingNode.name)) {
      missingStack.reverse.foreach(ele => builder append s"$ele/")
      builder.append(s"${if (missingNode attribute) "@" else ""}${missingNode name}")
      val fullNode = builder.mkString
      if (!warnedNodes.contains(fullNode)) {
        warnedNodes += fullNode
        val message = s"$fullNode is not found in Schema (even as a wildcard)"
        if (ignoreMissing && !suppressWarnings)
          warn(message)
        else if (!ignoreMissing)
          throw ConversionException(message)
      }
    }
  }

  /**
    * Logs a conversion warning, prefixed with the current element path,
    * at most once per distinct message.
    */
  def warning(eleStack: ListBuffer[XNode], message: String): Unit = {
    val builder = StringBuilder.newBuilder
    builder.append("In path ")
    eleStack.reverse.foreach(ele => builder append s"$ele/")
    builder.append(", ")
    builder.append(message)
    val finalMessage = builder.mkString
    if (!warnedNodes.contains(finalMessage)) {
      warnedNodes += finalMessage
      // FIX: the condition was inverted — previously a ConversionException was
      // thrown precisely when suppressWarnings was TRUE (i.e. suppression
      // escalated a warning into a failure). Suppression should silence the
      // warning, matching the behaviour of missing() above.
      if (!suppressWarnings)
        warn(finalMessage)
    }
  }
}
238 |
239 |
object AvroUtils {
  // Matches an ISO-8601 timestamp WITHOUT a zone offset (optionally suffixed 'Z').
  private val TIMESTAMP_PATTERN =
    "^(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.*\\d*)Z?$"

  // Zone applied to parsed timestamps.
  // NOTE(review): "UTC-0" is not a standard zone ID; TimeZone.getTimeZone falls
  // back to GMT for unrecognised IDs — confirm that GMT is the intent.
  var timeZone: TimeZone = TimeZone.getTimeZone("UTC-0")

  /**
    * Converts the string content of an XML node into the runtime value for the
    * given Avro type. LONG content containing a 'T' is treated as an
    * xs:dateTime and parsed to epoch millis.
    * Throws ConversionException for unsupported types.
    */
  def createValue(nodeType: Type, content: String): AnyRef = {
    val result = nodeType match {
      case BOOLEAN => content.toLowerCase == "true" || content == "1"
      case INT => content.toInt
      case LONG =>
        if (content contains "T") parseDateFrom(content trim)
        else content.toLong
      case FLOAT => content.toFloat
      case DOUBLE => content.toDouble
      case STRING => content
      case other => throw ConversionException(s"Unsupported type $other")
    }
    result.asInstanceOf[AnyRef]
  }

  /**
    * Parses an xs:dateTime string into epoch millis, interpreted in the
    * configured timeZone. Text that does not match TIMESTAMP_PATTERN (i.e.
    * carries a zone offset) is truncated to its first 19 characters so the
    * offset is dropped before parsing.
    */
  private def parseDateFrom(text: String): Long = {
    // Validate the full input first so malformed text still fails fast.
    // FIX: the previous version also set a timezone and computed millis here,
    // then discarded the result and redid all of the work below.
    DatatypeConverter.parseDateTime(text)
    val tsp =
      if (!text.matches(TIMESTAMP_PATTERN)) text.substring(0, 19)
      else text
    val cal = DatatypeConverter.parseDateTime(tsp)
    cal.setTimeZone(timeZone)
    cal.getTimeInMillis
  }
}
275 |
/** Small logging and option helpers shared across the converter. */
object Utils {
  // Toggles DEBUG-level output; set from the config.
  var debugEnabled = false

  /** Wraps a string in Option, treating null and blank strings as absent. */
  def option(text: String): Option[String] =
    Option(text).filter(_.trim != "")

  def debug(text: String): Unit = if (debugEnabled) log("DEBUG", text)

  def info(text: String): Unit = log("INFO", text)

  def warn(text: String): Unit = log("WARNING", text)

  /** Writes a timestamped, upper-cased-level message to stderr. The
    * `duplicates` flag is currently unused. */
  def log(level: String, text: String, duplicates: Boolean = true): Unit = {
    val stamp = Calendar.getInstance().getTime
    System.err.println(s"$stamp ${level.toUpperCase}: $text")
  }

  /** Runs `op` and logs how many seconds it took. */
  def profile(tag: String)(op: => Unit): Unit = {
    val begin = Calendar.getInstance().getTimeInMillis
    op
    val finish = Calendar.getInstance().getTimeInMillis
    info(s"$tag took: ${(finish - begin) / 1000.0} seconds")
  }
}
302 |
--------------------------------------------------------------------------------
/src/main/python/avsc_fix.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from collections import OrderedDict
4 |
5 | import os
6 |
7 |
class AvroSchema:
    """Parses an avsc file into Node objects and can regenerate the schema,
    optionally re-rooted ("split") at a named element."""

    def __init__(self, file_path):
        """Load the avsc JSON at file_path and parse its fields into Nodes."""
        with open(file_path) as file_in:
            root = json.load(file_in)
        self._root_name = root.get('name')
        self._level = 0
        self._known_types_list = []

        # File name without extension is kept as a prefix for generated names.
        file_name = os.path.basename(file_path)
        self._root_prefix = file_name.split('.')[0]
        self._prefix = self._root_prefix
        self._base_fields = []

        for node in root.get('fields'):
            self._base_fields.append(Node(node))
        self._fields = self._base_fields

    # Recreate the schema splitting with specified element and save to new file
    def recreate_schema(self, split_by=None, new_file=None):
        """Regenerate the schema dict, optionally rooted at the split_by
        element, write it to new_file when given, and return it."""
        self._known_types_list = []
        if split_by:
            if split_by != self._root_name:
                search_res = self._search(self._base_fields, split_by)
                if search_res:
                    search_res = search_res.content
                    # NOTE(review): if the found node's content is a list this
                    # attribute assignment would fail — presumably content is a
                    # single Node here; confirm against Node's parsing.
                    search_res.name = split_by
                    self._fields = search_res
            else:
                # Splitting by the root is the same as not splitting at all.
                split_by = None

        if not self._fields:
            # FIX: call form works on both Python 2 and 3 (the original used a
            # Python-2-only print statement).
            print('Split element {} not found'.format(split_by))
            exit(1)
        schema = self._generate_schema(self._fields)

        if not split_by:
            schema['source'] = 'document'
        if new_file:
            with open(new_file, 'w') as file_out:
                json.dump(schema, file_out, indent=2)
        return schema

    def _generate_schema(self, node):
        """Recursively build an avsc fragment (OrderedDict) for a Node or a
        list of Nodes. Custom types are emitted in full once, then referenced
        by name (tracked in self._known_types_list)."""
        schema = OrderedDict()
        # Generate schema for list of nodes
        if type(node) is list:
            inner_schema = []
            schema['type'] = 'record'
            schema['name'] = self._root_name
            for inner_node in node:
                inner_schema.append(self._generate_schema(inner_node))
            schema['fields'] = inner_schema
        else:
            # Generate schema for primitive types
            if node.node_type == Node.primitive_type:
                schema['name'] = node.name
                if node.optional:
                    schema['type'] = ['null', node.source_type]
                else:
                    schema['type'] = node.source_type
                schema['source'] = node.source
            # Generate schema for complex types
            elif node.node_type == Node.complex_type:
                sql_type = node.sql_type
                inner_type = OrderedDict()
                # primitive complexes: content is a plain type string
                if type(node.content) is str:
                    if sql_type == 'ARRAY':
                        inner_type['type'] = 'array'
                        if node.original_type:
                            inner_type['items'] = node.original_type
                        else:
                            inner_type['items'] = node.content
                    elif sql_type == 'MAP':
                        inner_type['type'] = 'map'
                        # The value type is the part after the comma in "key,value"
                        if node.original_type:
                            inner_type['values'] = node.original_type.split(',')[1].strip()
                        else:
                            inner_type['values'] = node.content.split(',')[1].strip()
                    else:
                        inner_type['type'] = 'record'
                        inner_type['fields'] = node.content
                    schema['name'] = node.name
                    if node.optional:
                        schema['type'] = ['null', inner_type]
                    else:
                        schema['type'] = inner_type
                    # The wildcard map field carries no "source" property.
                    if node.name != 'others':
                        schema['source'] = node.source
                # custom complexes: content is a nested Node / list of Nodes
                else:
                    # Array
                    if sql_type == 'ARRAY':
                        inner_type['type'] = 'array'
                        inner_type['items'] = self._generate_schema(node.content)
                        schema['name'] = node.name
                        if node.optional:
                            schema['type'] = ['null', inner_type]
                        else:
                            schema['type'] = inner_type
                        schema['source'] = node.source
                    # Map
                    elif sql_type == 'MAP':
                        inner_type['type'] = 'map'
                        inner_type['values'] = self._generate_schema(node.content)
                        schema['name'] = node.name
                        if node.optional:
                            schema['type'] = ['null', inner_type]
                        else:
                            schema['type'] = inner_type
                        schema['source'] = node.source
                    # Struct
                    else:
                        schema['name'] = node.name
                        if node.optional:
                            schema['type'] = ['null', self._generate_schema(node.content)]
                        else:
                            schema['type'] = self._generate_schema(node.content)
                        schema['source'] = node.source
            # Generate schema for custom defined types
            else:
                if node.name not in self._known_types_list:
                    # First occurrence: emit the full definition under this name.
                    schema = self._generate_schema(node.content)
                    self._known_types_list.append(node.name)
                    schema['name'] = node.name
                else:
                    # Already defined: reference the type by name only.
                    schema = node.name
        return schema

    def _search(self, node, key):
        """Depth-first search for the Node named key; return it or None."""
        if type(node) is list:
            # FIX: initialise the result — an empty list previously left
            # search_res unbound, raising UnboundLocalError at the return.
            search_res = None
            for inner_node in node:
                search_res = self._search(inner_node, key)
                if search_res:
                    break
        else:
            if node.node_type == Node.primitive_type:
                search_res = node if node.name == key else None
            elif node.node_type == Node.complex_type:
                # primitive complexes cannot contain the key, only be it
                if type(node.content) is str:
                    search_res = node if node.name == key else None
                # custom complexes may be the key or contain it
                else:
                    if node.name == key:
                        search_res = node
                    else:
                        search_res = self._search(node.content, key)
            else:
                search_res = self._search(node.content, key)
        return search_res
165 |
166 |
167 | class Node:
168 | primitive_type = 'PRIMITIVE'
169 | complex_type = 'COMPLEX'
170 | custom_type = 'CUSTOM'
171 |
172 | primitives_map = {'int': 'int', 'long': 'bigint', 'float': 'float',
173 | 'double': 'double', 'bytes': 'string',
174 | 'string': 'string', 'boolean': 'boolean'}
175 | type_dict = {}
176 |
177 | def __init__(self, node):
178 | self.sql_type = None
179 | self.content = None
180 | self.optional = False
181 | self.source = None
182 | self.comment = None
183 | self.original_type = None
184 | self.name = str(node.get('name'))
185 | node_type = node.get('type')
186 |
187 | # Parsing union - complex type and
188 | # take valid complex/primitive type in the union
189 | if type(node_type) is list:
190 | node_type = node_type[1]
191 | self.optional = True
192 |
193 | # Detect Primitives
194 | if node_type in Node.primitives_map.keys():
195 | self.node_type = Node.primitive_type
196 | self.sql_type = Node.primitives_map[node_type]
197 | self.source = str(node.get('source'))
198 | if 'comment' in node:
199 | self.comment = node['comment']
200 | self.source_type = node_type
201 |
202 | # Parse the inner record
203 | elif node_type == 'record':
204 | self.node_type = self.custom_type
205 | self.content = self._parse_list(node.get('fields'))
206 | self.source = str(node.get('source'))
207 | Node.type_dict[self.name] = self
208 |
209 | # Parse complex types
210 | else:
211 | self.node_type = self.complex_type
212 | self.sql_type, self.content = self._parse_complex_type(node_type)
213 | self.source = str(node.get('source'))
214 |
215 | # Parse a list of nodes
216 | @staticmethod
217 | def _parse_list(element_list):
218 | node_list = []
219 | for node in element_list:
220 | node_list.append(Node(node))
221 | return node_list
222 |
223 | # Parse a complex datatype
224 | def _parse_complex_type(self, node_type):
225 | if type(node_type) is dict:
226 | temp_type = node_type.get('type')
227 | # Parse array complex type
228 | if temp_type == 'array':
229 | items = node_type.get('items')
230 |
231 | # Array of primitives
232 | if items in Node.primitives_map.keys():
233 | self.original_type = items
234 | return 'ARRAY', Node.primitives_map[items]
235 |
236 | # Array of known custom types
237 | elif items in Node.type_dict.keys():
238 | return 'ARRAY', Node.type_dict[items]
239 |
240 | # Array of new custom types
241 | else:
242 | return 'ARRAY', Node(items)
243 |
244 | # Parse map complex type
245 | elif temp_type == 'map':
246 | value_type = node_type.get('values')
247 |
248 | # Map of primitives
249 | if value_type in Node.primitives_map.keys():
250 | self.original_type = '{}, {}'.format('STRING', value_type)
251 | return 'MAP', '{}, {}'.format('STRING', Node.primitives_map[
252 | value_type])
253 |
254 | # Map of custom types
255 | else:
256 | if value_type in Node.type_dict.keys():
257 | value_type = self.type_dict[node_type]
258 | else:
259 | print '1 - {} type not found in the schema'.format(
260 | node_type)
261 | exit(1)
262 | return 'MAP', '{}, {}'.format('STRING', value_type)
263 |
264 | # Parse other struct types
265 | else:
266 | return 'STRUCT', Node(node_type)
267 |
268 | elif node_type in Node.type_dict.keys():
269 | return 'STRUCT', self.type_dict[node_type]
270 | else:
271 | print '2 - {} type not found in the schema'.format(node_type)
272 | exit(1)
273 |
274 | def __repr__(self):
275 | optional = 'Optional' if self.optional else 'Mandatory'
276 | return '{}, {}, {}, {}, {}'.format(self.name, self.node_type,
277 | self.sql_type, optional, self.source)
278 |
279 |
# Command-line entry point:
#   avsc_fix.py <avsc-file> <split-by>
# Parses the schema file, then rewrites it in place (new_file is the
# same path), splitting by the tag given as the second argument.
file_path = sys.argv[1]
split_by = sys.argv[2]
temp = AvroSchema(file_path)
temp.recreate_schema(split_by=split_by, new_file=file_path)
284 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/config/Config.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro.config
2 |
3 | import java.util
4 |
5 | import in.dreamlabs.xmlavro.ConversionException
6 | import in.dreamlabs.xmlavro.Utils.option
7 | import javax.xml.namespace.QName
8 |
9 | import scala.beans.BeanProperty
10 | import scala.collection.JavaConverters._
11 | import scala.reflect.io.Path
12 |
13 | /**
14 | * Created by Royce on 01/02/2017.
15 | */
class Config() {
  @BeanProperty var dynamic: Boolean = false
  @BeanProperty var dynamicSource: String = ""
  @BeanProperty var debug: Boolean = false
  var baseDir: Option[Path] = None
  @BeanProperty var namespaces: Boolean = true
  var XSD: Option[XSDConfig] = None
  var XML: Option[XMLConfig] = None

  // Bean-style accessors so the YAML/bean loader can populate Option fields
  def getBaseDir: String = baseDir.map(_.path).orNull

  def setBaseDir(value: String): Unit =
    baseDir = Option(Path(value).toAbsolute)

  def getXSD: XSDConfig = XSD.orNull

  def setXSD(value: XSDConfig): Unit = XSD = Option(value)

  def getXML: XMLConfig = XML.orNull

  def setXML(value: XMLConfig): Unit = XML = Option(value)

  /** Pushes the shared flags down into the XSD/XML sections and validates each. */
  def validate(): Unit = {
    XSD.foreach { xsd =>
      xsd.namespaces = namespaces
      xsd.debug = debug
      xsd.baseDir = baseDir
      xsd.validate()
    }
    XML.foreach { xml =>
      xml.namespaces = namespaces
      xml.debug = debug
      xml.baseDir = baseDir
      xml.validate(XSD)
    }
  }
}
53 |
class XSDConfig {
  // Propagated from the parent Config by Config.validate()
  var namespaces: Boolean = _
  var debug: Boolean = _
  var baseDir: Option[Path] = _
  var xsdFile: Path = _
  var avscFile: Path = _

  @BeanProperty var logicalTypes: LogicalTypesConfig = _
  @BeanProperty var rebuildChoice: Boolean = true
  @BeanProperty var stringTimestamp: Boolean = false
  @BeanProperty var ignoreHiveKeywords: Boolean = false
  @BeanProperty var rootElementQName: Option[QName] = None
  @BeanProperty var attributePrefix: String = ""

  def getXsdFile: String = xsdFile.path

  def setXsdFile(value: String): Unit = xsdFile = Path(value)

  def getAvscFile: String = avscFile.path

  def setAvscFile(value: String): Unit = avscFile = Path(value)

  /**
    * Resolves paths against baseDir and fills in defaults.
    *
    * Fix: the avsc output now defaults to the xsd file with an "avsc"
    * extension whenever it is not configured explicitly. Previously the
    * default was only applied when baseDir was set, leaving avscFile
    * null (and getAvscFile throwing NPE) otherwise.
    */
  def validate(): Unit = {
    if (baseDir.isDefined)
      xsdFile = xsdFile toAbsoluteWithRoot baseDir.get
    if (Option(avscFile).isDefined) {
      if (baseDir.isDefined)
        avscFile = avscFile toAbsoluteWithRoot baseDir.get
    } else
      avscFile = xsdFile changeExtension "avsc"
    logicalTypes = Option(logicalTypes) getOrElse new LogicalTypesConfig
    logicalTypes.validate()
    // stringTimestamp is a shortcut that forces xs:dateTime to string
    if (stringTimestamp) {
      logicalTypes.xsDateTime = LogicalType.STRING
    }
  }
}
91 |
object LogicalType {
  /**
    * Logical type "timestamp-millis" annotating a long type.
    */
  val TIMESTAMP_MILLIS = "timestamp-millis"
  /**
    * Logical type "timestamp-micros" annotating a long type.
    */
  val TIMESTAMP_MICROS = "timestamp-micros"

  /**
    * Logical type "time-millis" annotating a long type.
    */
  val TIME_MILLIS = "time-millis"

  /**
    * Logical type "time-micros" annotating a long type.
    */
  val TIME_MICROS = "time-micros"

  /**
    * Logical type "date" annotating an int type.
    */
  val DATE = "date"

  /**
    * Dummy logical type for handling values as string without indicating a logicalType.
    */
  val STRING = "string"

  /**
    * Dummy logical type for handling values as long without indicating a logicalType.
    */
  val LONG = "long"

}
128 |
class LogicalTypesConfig {

  // Target Avro representation chosen for each XSD temporal/decimal type
  @BeanProperty
  var xsDateTime: String = LogicalType.LONG
  @BeanProperty
  var xsTime: String = LogicalType.STRING
  @BeanProperty
  var xsDate: String = LogicalType.STRING
  @BeanProperty
  var xsDecimal: XSDecimalConfig = new XSDecimalConfig

  /** Normalises null entries and rejects unsupported logical-type choices. */
  def validate(): Unit = {
    val dateTimeChoices = Set(
      LogicalType.LONG,
      LogicalType.STRING,
      LogicalType.TIMESTAMP_MILLIS,
      LogicalType.TIMESTAMP_MICROS)
    val timeChoices = Set(
      LogicalType.STRING,
      LogicalType.TIME_MILLIS,
      LogicalType.TIME_MICROS)
    val dateChoices = Set(LogicalType.STRING, LogicalType.DATE)

    xsDateTime = Option(xsDateTime) getOrElse ""
    if (!dateTimeChoices.contains(xsDateTime))
      throw new IllegalArgumentException("Invalid configuration for xs:dateTime logical type.")

    xsTime = Option(xsTime) getOrElse ""
    if (!timeChoices.contains(xsTime))
      throw new IllegalArgumentException("Invalid configuration for xs:time logical type.")

    xsDate = Option(xsDate) getOrElse ""
    if (!dateChoices.contains(xsDate))
      throw new IllegalArgumentException("Invalid configuration for xs:date logical type.")

    // The decimal section has its own nested validation
    xsDecimal = Option(xsDecimal) getOrElse new XSDecimalConfig
    xsDecimal.validate()
  }

}
172 |
/** Accepted Avro target types for converting xs:decimal values. */
object XSDecimalConfigLogicalType {
  /** Plain Avro double. */
  val DOUBLE = "double"

  /** Plain Avro string. */
  val STRING = "string"

  /** Avro "decimal" logical type (used with fallbackPrecision/fallbackScale). */
  val DECIMAL = "decimal"
}
180 |
181 |
class XSDecimalConfig {
  // Primary Avro type used for xs:decimal values
  @BeanProperty
  var avroType = XSDecimalConfigLogicalType.DOUBLE
  // Fallback type when the primary representation is not applicable
  @BeanProperty
  var fallbackType = XSDecimalConfigLogicalType.STRING
  // Precision/scale used when the fallback type is "decimal"
  @BeanProperty
  var fallbackPrecision : Integer = null
  @BeanProperty
  var fallbackScale : Integer = 0

  /**
    * Validates the avroType/fallbackType choices and, for a "decimal"
    * fallback, the precision/scale pair.
    *
    * Fix: a scale of 0 is now accepted. The Avro spec allows
    * 0 <= scale <= precision, and 0 is also this field's own default,
    * so the previous `fallbackScale <= 0` check rejected a valid (and
    * default) configuration.
    *
    * @throws IllegalArgumentException on any invalid setting
    */
  def validate(): Unit = {

    val acceptedAvroTypes = List(
      XSDecimalConfigLogicalType.DECIMAL,
      XSDecimalConfigLogicalType.DOUBLE,
      XSDecimalConfigLogicalType.STRING
    )

    if (!acceptedAvroTypes.contains(avroType)) {
      throw new IllegalArgumentException(s"Invalid configuration value '$avroType' for xsDecimal avroType.")
    }

    if (!acceptedAvroTypes.contains(fallbackType)) {
      throw new IllegalArgumentException(s"Invalid configuration value '$fallbackType' for xsDecimal fallbackType.")
    }

    if (fallbackType == XSDecimalConfigLogicalType.DECIMAL) {
      if (Option(fallbackPrecision) isEmpty) {
        throw new IllegalArgumentException(s"Missing xsDecimal fallbackPrecision " +
          s"configuration for '$fallbackType' fallback type.")
      }
      if (Option(fallbackScale) isEmpty) {
        throw new IllegalArgumentException(s"Missing xsDecimal fallbackScale " +
          s"configuration for '$fallbackType' fallback type.")
      }
      // Precision must be a positive integer per the Avro spec
      if (fallbackPrecision <= 0) {
        throw new IllegalArgumentException(s"Invalid configuration value $fallbackPrecision for xsDecimal fallbackPrecision.")
      }
      // Scale must satisfy 0 <= scale <= precision (zero is valid)
      if (fallbackScale < 0 || fallbackScale > fallbackPrecision) {
        throw new IllegalArgumentException(s"Invalid configuration value $fallbackScale for xsDecimal fallbackScale.")
      }
    }

  }
}
227 |
class XMLConfig {
  // Propagated from the parent Config by Config.validate()
  var namespaces: Boolean = _
  var debug: Boolean = _
  var baseDir: Option[Path] = None
  var qaDir: Option[Path] = None
  var xmlFile: Path = _
  // "stdin" input / "stdout" output switch these streaming flags on
  var streamingInput, streamingOutput: Boolean = false
  var validationXSD: Option[Path] = None
  var splitBy: String = ""
  var avscFile: Path = _
  var avroFile: Path = _
  var errorFile: Option[Path] = None

  @BeanProperty var documentRootTag: String = _
  @BeanProperty var ignoreMissing: Boolean = false
  @BeanProperty var suppressWarnings: Boolean = false
  @BeanProperty var xmlInput: String = _
  @BeanProperty var avroOutput: String = _
  @BeanProperty var docErrorLevel: String = "WARNING"
  @BeanProperty var split: util.List[AvroSplit] =
    new util.ArrayList[AvroSplit]()
  @BeanProperty var caseSensitive: Boolean = true
  @BeanProperty var ignoreCaseFor: util.List[String] =
    new util.ArrayList[String]

  // Avro-container input mode: mappings declare which record field holds
  // the XML payload ("xmlInput") and, optionally, a unique id ("unique_id")
  @BeanProperty var useAvroInput: Boolean = false
  var inputAvroMappings: Map[String, String] = _
  var inputAvroKey: String = _
  var inputAvroUniqueKey: Option[String] = None


  // Bean-style accessors so the bean loader can populate Option/Path fields
  def getQaDir: String = if (qaDir isDefined) qaDir.get.path else null

  def setQaDir(value: String): Unit = qaDir = Option(Path(value))

  def getValidationXSD: String =
    if (validationXSD isDefined) validationXSD.get.path else null

  def setValidationXSD(value: String): Unit =
    validationXSD = Option(Path(value))

  def getErrorFile: String =
    if (errorFile isDefined) errorFile.get.path else null

  def setErrorFile(value: String): Unit =
    errorFile = Option(Path(value))

  def getAvscFile: String = avscFile.path

  def setAvscFile(value: String): Unit = avscFile = Path(value)

  def getAvroFile: String = avroFile.path

  def setAvroFile(value: String): Unit = avroFile = Path(value)

  def getInputAvroMappings: util.Map[String, String] =
    if (Option(inputAvroMappings) isDefined) inputAvroMappings.asJava else null

  def setInputAvroMappings(value: util.Map[String, String]): Unit =
    inputAvroMappings = value.asScala.toMap

  /**
    * Resolves input/output locations, applies defaults and checks
    * mandatory settings. Order matters: streaming flags are decided
    * first, then paths are anchored to baseDir, then splits are built
    * and validated.
    *
    * @param xsdConfig used to derive a default avsc file when none is set
    */
  def validate(xsdConfig: Option[XSDConfig]): Unit = {
    // Decide streaming vs file mode from the configured input/output
    if (Option(xmlInput) isDefined)
      if (xmlInput == "stdin") {
        streamingInput = true
        if (Option(avroOutput).isEmpty || avroOutput == "stdout")
          streamingOutput = true
        else avroFile = Path(avroOutput)
      } else {
        xmlFile = Path(xmlInput)
        if (Option(avroOutput) isDefined) avroFile = Path(avroOutput)
        else avroFile = xmlFile changeExtension "avro"
      } else
      throw ConversionException("XML Input is not specified in the config")

    // Anchor every file path to baseDir (skipped for streaming endpoints)
    if (baseDir.isDefined && !streamingInput)
      xmlFile = xmlFile toAbsoluteWithRoot baseDir.get

    if (baseDir.isDefined && !streamingOutput)
      avroFile = avroFile toAbsoluteWithRoot baseDir.get

    if (Option(avscFile).isDefined) {
      if (baseDir.isDefined)
        avscFile = avscFile toAbsoluteWithRoot baseDir.get
    } else if (xsdConfig.isDefined)
      avscFile = xsdConfig.get.xsdFile changeExtension "avsc"

    if (baseDir.isDefined && validationXSD.isDefined)
      validationXSD = Option(validationXSD.get.toAbsoluteWithRoot(baseDir.get))

    if (baseDir.isDefined && qaDir.isDefined)
      qaDir = Option(qaDir.get.toAbsoluteWithRoot(baseDir.get))

    if (baseDir.isDefined && errorFile.isDefined)
      errorFile = Option(errorFile.get.toAbsoluteWithRoot(baseDir.get))

    if (Option(documentRootTag) isEmpty)
      throw ConversionException("Document Root Tag is not specified in the config")

    // Default split point is the document root itself
    if (option(splitBy) isEmpty)
      splitBy = documentRootTag

    // No explicit splits configured: create a single split for the whole doc
    if (split isEmpty) {
      val tempSplit = new AvroSplit
      tempSplit.avscFile = avscFile
      tempSplit.avroFile = avroFile
      tempSplit.stream = streamingOutput
      tempSplit.by = splitBy
      split.add(tempSplit)
    }

    split.forEach(item => item.validate(baseDir))

    // Avro-container input: find the payload key and optional unique-id key
    if (useAvroInput) {
      inputAvroMappings.foreach {
        case (key, value) =>
          if (value == "xmlInput") inputAvroKey = key
          else if (value == "unique_id") inputAvroUniqueKey = Option(key)
      }

      if (Option(inputAvroKey) isEmpty)
        throw ConversionException("No xmlInput specified in inputAvroMappings")
    }
  }
}
353 |
class AvroSplit {
  @BeanProperty var by: String = ""
  var avscFile: Path = _
  var avroFile: Path = _
  var stream: Boolean = false

  def getAvscFile: String = avscFile.path

  def setAvscFile(value: String): Unit = avscFile = Path(value)

  def getAvroFile: String = avroFile.path

  def setAvroFile(value: String): Unit = avroFile = Path(value)

  /**
    * Checks mandatory settings and anchors file paths to baseDir.
    *
    * Fix: the ConversionException instances were previously constructed
    * but never thrown, so invalid splits passed validation silently and
    * failed later with an NPE; they are now thrown (matching how
    * XMLConfig.validate reports errors).
    */
  def validate(baseDir: Option[Path]): Unit = {
    if (option(by) isEmpty)
      throw ConversionException("Split by is not specified in the config")

    if (Option(avroFile) isEmpty)
      throw ConversionException(
        s"Avro Output is not specified in the config for tag $by")
    else if (baseDir isDefined)
      avroFile = avroFile toAbsoluteWithRoot baseDir.get

    if (Option(avscFile) isEmpty)
      throw ConversionException(
        s"Avsc Schema is not specified in the config for tag $by")
    else if (baseDir isDefined)
      avscFile = avscFile toAbsoluteWithRoot baseDir.get
  }
}
385 |
--------------------------------------------------------------------------------
/src/main/scala/in/dreamlabs/xmlavro/AvroBuilder.scala:
--------------------------------------------------------------------------------
1 | package in.dreamlabs.xmlavro
2 |
3 | import java.io._
4 | import java.nio.ByteBuffer
5 | import java.util
6 | import javax.xml.stream.XMLInputFactory
7 | import javax.xml.stream.XMLStreamConstants._
8 | import javax.xml.stream.events.{Attribute, EndElement, StartElement, XMLEvent}
9 |
10 | import in.dreamlabs.xmlavro.AvroBuilder.unknown
11 | import in.dreamlabs.xmlavro.RichAvro._
12 | import in.dreamlabs.xmlavro.XMLEvents.{addElement, eleStack, removeElement}
13 | import in.dreamlabs.xmlavro.config.XMLConfig
14 | import org.apache.avro.Schema
15 | import org.apache.avro.file.{CodecFactory, DataFileStream, DataFileWriter}
16 | import org.apache.avro.generic.GenericData.Record
17 | import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
18 | import org.apache.avro.specific.SpecificDatumWriter
19 | import in.dreamlabs.xmlavro.Utils.info
20 | import scala.collection.JavaConverters._
21 | import scala.collection.mutable
22 | import org.apache.commons.io.input.CountingInputStream
23 |
24 | /**
25 | * Created by Royce on 25/01/2017.
26 | */
27 | class AvroBuilder(config: XMLConfig) {
  // Push per-run configuration into the shared helper objects
  Utils.debugEnabled = config.debug
  RichAvro.caseSensitive = config.caseSensitive
  RichAvro.ignoreCaseFor =
    config.ignoreCaseFor.asScala.toList.map(element => element.toLowerCase)
  RichAvro.ignoreMissing = config.ignoreMissing
  RichAvro.suppressWarnings = config.suppressWarnings
  XNode.namespaces = config.namespaces
  XMLDocument.config = config

  // One Avro writer and schema per split tag, plus the raw output streams
  // (kept so they can all be closed at the end of createDatums)
  private val writers = mutable.Map[String, DataFileWriter[Record]]()
  private val schemas = mutable.Map[String, Schema]()
  private val streams = mutable.ListBuffer[OutputStream]()
40 |
  /**
    * Reads the XML (or Avro-wrapped XML) input and writes Avro records,
    * one output file per configured split, then flushes and closes all
    * writers and their underlying streams.
    */
  def createDatums(): Unit = {
    // Open one snappy-compressed Avro container writer per split target
    config.split.forEach { split =>
      val schema = new Schema.Parser().parse(split.avscFile.jfile)
      val datumWriter = new SpecificDatumWriter[Record](schema)
      val fileWriter = new DataFileWriter[Record](datumWriter)
      fileWriter setCodec (CodecFactory snappyCodec)
      val avroOut =
        if (split stream) new BufferedOutputStream(System.out)
        else split.avroFile.toFile.bufferedOutput()
      fileWriter create(schema, avroOut)
      streams += avroOut
      writers += split.by -> fileWriter
      schemas += split.by -> schema
    }

    val sourceInput =
      if (config.streamingInput) new BufferedInputStream(System.in)
      else config.xmlFile.toFile.bufferedInput()

    if (config.useAvroInput) {
      // Input is an Avro container; each record carries one XML payload
      // under config.inputAvroKey (as a ByteBuffer)
      val avroReader = new DataFileStream[GenericRecord](
        sourceInput,
        new GenericDatumReader[GenericRecord]())
      var avroCount = 0
      avroReader.forEach { record =>
        val xmlIn = new BufferedInputStream(
          new ByteArrayInputStream(
            record.get(config.inputAvroKey).asInstanceOf[ByteBuffer].array()))
        // The optional unique id is addressed as "mapField.key": the first
        // part names a map field on the record, the second the map entry
        var uniqueKey = if (config.inputAvroUniqueKey isDefined) {
          val keys = config.inputAvroUniqueKey.get.split('.')
          val valueMap =
            record.get(keys(0)).asInstanceOf[util.HashMap[AnyRef, AnyRef]]
          var found: Option[String] = None
          valueMap.forEach {
            case (key, value) =>
              if (key.toString.equals(keys(1))) {
                found = Some(value.toString)
              }
          }
          found
        } else None
        avroCount += 1
        info(s"Loading avro record #$avroCount for Unique ID: ${uniqueKey}")
        createFromXML(xmlIn, Some(record), uniqueKey)
        info(s"Finished avro record #$avroCount for Unique ID: ${uniqueKey}")
      }
      avroReader.close()
      sourceInput.close()
    } else {
      // Plain XML input: convert the whole stream directly
      createFromXML(sourceInput)
    }

    XMLDocument.closeAll()

    // Flush and close every writer, then the underlying output streams
    writers.values.foreach { writer =>
      writer.flush()
      writer.close()
    }

    streams.foreach(_.close())
  }
102 |
103 | def createFromXML(xmlIn: InputStream,
104 | sourceAvro: Option[GenericRecord] = None,
105 | uniqueKey: Option[String] = None): Unit = {
106 | val countingStream = new CountingInputStream(xmlIn)
107 | val reader = XMLInputFactory.newInstance.createXMLEventReader(countingStream)
108 | var splitRecord: Record = null
109 | var splitFound, documentFound: Boolean = false
110 | var proceed: Boolean = false
111 | var parentEle: String = ""
112 | var currentDoc: Option[XMLDocument] = None
113 | var prevEvent: XMLEvent = null
114 | var lastPrintMB: Long = 0
115 |
116 | while (reader.hasNext) {
117 | var event: XMLEvent = null
118 | try {
119 | event = reader.nextEvent
120 | if (Utils.debugEnabled){
121 | val currentMB = countingStream.getByteCount/1024/1024
122 | if (currentMB > lastPrintMB){
123 | Utils.debug(s"Processed ${currentMB} Mb")
124 | lastPrintMB = currentMB
125 | }
126 | }
127 | } catch {
128 | case e: Exception =>
129 | currentDoc match {
130 | case None =>
131 | Utils.log(config.docErrorLevel,
132 | s"No XML data received, ${e.getMessage} ")
133 | return
134 | case Some(doc) =>
135 | doc.fail(
136 | ConversionException(s"Invalid XML received, ${e.getMessage} ",
137 | e),
138 | wait = true)
139 | documentFound = false
140 | currentDoc.get close()
141 | currentDoc = None
142 | return
143 | }
144 | }
145 | if (Option(event) isDefined) {
146 | try {
147 | if (currentDoc isDefined)
148 | currentDoc.get add event
149 | event getEventType match {
150 | case START_DOCUMENT | END_DOCUMENT => //Ignore
151 | case START_ELEMENT =>
152 | if (writers contains "") {
153 | writers += event.name -> writers("")
154 | schemas += event.name -> schemas("")
155 | writers remove ""
156 | schemas remove ""
157 | }
158 | if (config.documentRootTag == event.name) {
159 | documentFound = true
160 | proceed = true
161 | splitFound = false
162 | currentDoc = Some(XMLDocument(uniqueKey))
163 | currentDoc.get add event
164 | }
165 |
166 | if (currentDoc.isDefined && !currentDoc.get.error) {
167 | if (writers.contains(event.name)) {
168 | if (splitFound)
169 | ConversionException(
170 | "Splits cannot be inside each other, they should be completely separated tags")
171 | splitFound = true
172 | splitRecord = schemas(event name).newRecord
173 | XMLEvents.setSchema(schemas(event name), splitRecord)
174 | AvroPath.reset()
175 | proceed = true
176 | }
177 |
178 | if (splitFound && proceed) {
179 | proceed = event push()
180 | parentEle = event.fullName
181 |
182 | if (event.hasAttributes && proceed) {
183 | val record = splitRecord.at(event path)
184 | event.attributes foreach {
185 | case (xEle, value) =>
186 | record.add(xEle, value)
187 | }
188 | }
189 | }
190 | }
191 | case CHARACTERS =>
192 | if (splitFound && proceed && currentDoc.isDefined && !currentDoc.get.error && event.hasText) {
193 | val record = splitRecord.at(event path)
194 | record.add(event element, event text)
195 | }
196 | case END_ELEMENT =>
197 | if (splitFound && proceed && currentDoc.isDefined && !currentDoc.get.error && prevEvent.isStartElement) {
198 | if (event.path.nonEmpty) {
199 | val path = event.path.last.name
200 | if (path != event.name) {
201 | val record = splitRecord.at(event path)
202 | record.add(event element, "")
203 | }
204 | }
205 | }
206 | if (currentDoc.isDefined && !currentDoc.get.error) {
207 | if (splitFound && (proceed || event.fullName == parentEle)) {
208 | proceed = true
209 | event pop()
210 | if (writers.contains(event.name)) {
211 | if (sourceAvro isDefined) {
212 | config.inputAvroMappings.foreach {
213 | case (source, target) =>
214 | if ((source != config.inputAvroKey) && !config.inputAvroUniqueKey
215 | .contains(source)) {
216 | splitRecord.put(target, sourceAvro.get.get(source))
217 | }
218 | }
219 | }
220 | val writer = writers(event name)
221 | writer append splitRecord
222 | Utils.info(
223 | s"Writing avro record for ${currentDoc.get.docText} split at ${event.name}")
224 | splitFound = false
225 | }
226 | }
227 | }
228 | case COMMENT => // Do nothing
229 | case other => unknown(other.toString, event)
230 | }
231 | } catch {
232 | case e: Exception =>
233 | currentDoc match {
234 | case None => throw new ConversionException(e)
235 | case Some(doc) =>
236 | var innerMessage =
237 | s"'${event.toString}' after ${prevEvent.toString} at Line: ${event.getLocation.getLineNumber}, Column: ${event.getLocation.getColumnNumber}"
238 | val message =
239 | s"${e.toString}${if (config.debug) "\n" + e.getStackTrace.mkString("\n")} occurred while processing $innerMessage"
240 | doc.fail(ConversionException(message), wait = true)
241 | }
242 | proceed = false
243 | } finally {
244 | if (event.isEndElement && config.documentRootTag == event.name) {
245 | documentFound = false
246 | currentDoc.get close()
247 | currentDoc = None
248 | }
249 | prevEvent = event
250 | }
251 | }
252 | }
253 | xmlIn.close()
254 | }
255 |
256 | implicit class RichXMLEvent(event: XMLEvent) {
257 |
258 | private val startEle: Option[StartElement] =
259 | if (event isStartElement)
260 | Some(event.asStartElement())
261 | else
262 | None
263 |
264 | private val endEle: Option[EndElement] =
265 | if (event isEndElement)
266 | Some(event.asEndElement())
267 | else
268 | None
269 |
270 | val attributes: mutable.LinkedHashMap[XNode, String] = {
271 | val attrMap = mutable.LinkedHashMap.empty[XNode, String]
272 | if (startEle isDefined) {
273 | val attrs = startEle.get.getAttributes
274 | while (attrs.hasNext) {
275 | val attr = attrs.next().asInstanceOf[Attribute]
276 | val name = attr.getName
277 | if (name.getLocalPart.toLowerCase() != "schemalocation")
278 | attrMap += XNode(name.getLocalPart,
279 | name.getNamespaceURI,
280 | name.getPrefix,
281 | attribute = true) -> attr.getValue
282 | }
283 | }
284 | attrMap
285 | }
286 |
287 | def path: List[AvroPath] = XMLEvents.schemaPath.toList
288 |
289 | def hasAttributes: Boolean = attributes nonEmpty
290 |
291 | def push(): Boolean = {
292 | if (eleStack.isEmpty)
293 | addElement(XNode(name, nsURI, nsName, attribute = false))
294 | else addElement(XNode(element, name, nsURI, nsName, attribute = false))
295 | }
296 |
297 | private def nsURI: String =
298 | if (startEle isDefined) startEle.get.getName.getNamespaceURI
299 | else if (endEle isDefined) endEle.get.getName.getNamespaceURI
300 | else element.nsURI
301 |
302 | private def nsName: String =
303 | if (startEle isDefined) startEle.get.getName.getPrefix
304 | else if (endEle isDefined) endEle.get.getName.getPrefix
305 | else element.nsName
306 |
307 | def element: XNode = eleStack.head
308 |
309 | def name: String =
310 | if (startEle isDefined) startEle.get.getName.getLocalPart
311 | else if (endEle isDefined) endEle.get.getName.getLocalPart
312 | else element.name
313 |
314 | def fullName: String = {
315 | XNode(name, nsURI, nsName, attribute = false).fullName()
316 | }
317 |
318 | def pop(): Unit =
319 | removeElement(XNode(name, nsURI, nsName, attribute = false))
320 |
321 | def text: String = event.asCharacters().getData
322 |
323 | def hasText: Boolean = text.trim() != "" || text.matches(" +")
324 | }
325 |
326 | }
327 |
/** Companion object: shared logging helpers for [[AvroBuilder]]. */
object AvroBuilder {
  /** Emits a warning for an XML event type the builder has no handler for. */
  private def unknown(message: String, event: XMLEvent) =
    Utils.warn("WARNING: Unknown " + message + ": " + event)
}
332 |
--------------------------------------------------------------------------------
/src/test/resources/xml/iam/SAML_response.asvc:
--------------------------------------------------------------------------------
1 | {
2 | "type" : "record",
3 | "name" : "ArtifactResponseType",
4 | "fields" : [ {
5 | "name" : "ID",
6 | "type" : "string",
7 | "source" : "attribute ID"
8 | }, {
9 | "name" : "InResponseTo",
10 | "type" : [ "string", "null" ],
11 | "source" : "attribute InResponseTo"
12 | }, {
13 | "name" : "Version",
14 | "type" : "string",
15 | "source" : "attribute Version"
16 | }, {
17 | "name" : "IssueInstant",
18 | "type" : "string",
19 | "source" : "attribute IssueInstant"
20 | }, {
21 | "name" : "Destination",
22 | "type" : [ "string", "null" ],
23 | "source" : "attribute Destination"
24 | }, {
25 | "name" : "Consent",
26 | "type" : [ "string", "null" ],
27 | "source" : "attribute Consent"
28 | }, {
29 | "name" : "Issuer",
30 | "type" : [ {
31 | "type" : "record",
32 | "name" : "NameIDType",
33 | "fields" : [ {
34 | "name" : "NameQualifier",
35 | "type" : [ "string", "null" ],
36 | "source" : "attribute NameQualifier"
37 | }, {
38 | "name" : "SPNameQualifier",
39 | "type" : [ "string", "null" ],
40 | "source" : "attribute SPNameQualifier"
41 | }, {
42 | "name" : "Format",
43 | "type" : [ "string", "null" ],
44 | "source" : "attribute Format"
45 | }, {
46 | "name" : "SPProvidedID",
47 | "type" : [ "string", "null" ],
48 | "source" : "attribute SPProvidedID"
49 | } ]
50 | }, "null" ],
51 | "source" : "element Issuer"
52 | }, {
53 | "name" : "Signature",
54 | "type" : [ {
55 | "type" : "record",
56 | "name" : "SignatureType",
57 | "fields" : [ {
58 | "name" : "Id",
59 | "type" : [ "string", "null" ],
60 | "source" : "attribute Id"
61 | }, {
62 | "name" : "SignedInfo",
63 | "type" : {
64 | "type" : "record",
65 | "name" : "SignedInfoType",
66 | "fields" : [ {
67 | "name" : "Id",
68 | "type" : [ "string", "null" ],
69 | "source" : "attribute Id"
70 | }, {
71 | "name" : "CanonicalizationMethod",
72 | "type" : {
73 | "type" : "record",
74 | "name" : "CanonicalizationMethodType",
75 | "fields" : [ {
76 | "name" : "Algorithm",
77 | "type" : "string",
78 | "source" : "attribute Algorithm"
79 | }, {
80 | "name" : "others",
81 | "type" : {
82 | "type" : "map",
83 | "values" : "string"
84 | }
85 | } ]
86 | },
87 | "source" : "element CanonicalizationMethod"
88 | }, {
89 | "name" : "SignatureMethod",
90 | "type" : {
91 | "type" : "record",
92 | "name" : "SignatureMethodType",
93 | "fields" : [ {
94 | "name" : "Algorithm",
95 | "type" : "string",
96 | "source" : "attribute Algorithm"
97 | }, {
98 | "name" : "HMACOutputLength",
99 | "type" : [ "string", "null" ],
100 | "source" : "element HMACOutputLength"
101 | }, {
102 | "name" : "others",
103 | "type" : {
104 | "type" : "map",
105 | "values" : "string"
106 | }
107 | } ]
108 | },
109 | "source" : "element SignatureMethod"
110 | }, {
111 | "name" : "Reference",
112 | "type" : {
113 | "type" : "array",
114 | "items" : {
115 | "type" : "record",
116 | "name" : "ReferenceType",
117 | "fields" : [ {
118 | "name" : "Id",
119 | "type" : [ "string", "null" ],
120 | "source" : "attribute Id"
121 | }, {
122 | "name" : "URI",
123 | "type" : [ "string", "null" ],
124 | "source" : "attribute URI"
125 | }, {
126 | "name" : "Type",
127 | "type" : [ "string", "null" ],
128 | "source" : "attribute Type"
129 | }, {
130 | "name" : "Transforms",
131 | "type" : [ {
132 | "type" : "record",
133 | "name" : "TransformsType",
134 | "fields" : [ {
135 | "name" : "Transform",
136 | "type" : {
137 | "type" : "array",
138 | "items" : {
139 | "type" : "record",
140 | "name" : "TransformType",
141 | "fields" : [ {
142 | "name" : "Algorithm",
143 | "type" : "string",
144 | "source" : "attribute Algorithm"
145 | }, {
146 | "name" : "others",
147 | "type" : {
148 | "type" : "map",
149 | "values" : "string"
150 | }
151 | }, {
152 | "name" : "XPath",
153 | "type" : [ "string", "null" ],
154 | "source" : "element XPath"
155 | } ]
156 | }
157 | },
158 | "source" : "element Transform"
159 | } ]
160 | }, "null" ],
161 | "source" : "element Transforms"
162 | }, {
163 | "name" : "DigestMethod",
164 | "type" : {
165 | "type" : "record",
166 | "name" : "DigestMethodType",
167 | "fields" : [ {
168 | "name" : "Algorithm",
169 | "type" : "string",
170 | "source" : "attribute Algorithm"
171 | }, {
172 | "name" : "others",
173 | "type" : {
174 | "type" : "map",
175 | "values" : "string"
176 | }
177 | } ]
178 | },
179 | "source" : "element DigestMethod"
180 | }, {
181 | "name" : "DigestValue",
182 | "type" : "string",
183 | "source" : "element DigestValue"
184 | } ]
185 | }
186 | },
187 | "source" : "element Reference"
188 | } ]
189 | },
190 | "source" : "element SignedInfo"
191 | }, {
192 | "name" : "SignatureValue",
193 | "type" : {
194 | "type" : "record",
195 | "name" : "SignatureValueType",
196 | "fields" : [ {
197 | "name" : "Id",
198 | "type" : [ "string", "null" ],
199 | "source" : "attribute Id"
200 | } ]
201 | },
202 | "source" : "element SignatureValue"
203 | }, {
204 | "name" : "KeyInfo",
205 | "type" : [ {
206 | "type" : "record",
207 | "name" : "KeyInfoType",
208 | "fields" : [ {
209 | "name" : "Id",
210 | "type" : [ "string", "null" ],
211 | "source" : "attribute Id"
212 | }, {
213 | "name" : "KeyName",
214 | "type" : [ "string", "null" ],
215 | "source" : "element KeyName"
216 | }, {
217 | "name" : "KeyValue",
218 | "type" : [ {
219 | "type" : "record",
220 | "name" : "KeyValueType",
221 | "fields" : [ {
222 | "name" : "DSAKeyValue",
223 | "type" : [ {
224 | "type" : "record",
225 | "name" : "DSAKeyValueType",
226 | "fields" : [ {
227 | "name" : "P",
228 | "type" : "string",
229 | "source" : "element P"
230 | }, {
231 | "name" : "Q",
232 | "type" : "string",
233 | "source" : "element Q"
234 | }, {
235 | "name" : "G",
236 | "type" : [ "string", "null" ],
237 | "source" : "element G"
238 | }, {
239 | "name" : "Y",
240 | "type" : "string",
241 | "source" : "element Y"
242 | }, {
243 | "name" : "J",
244 | "type" : [ "string", "null" ],
245 | "source" : "element J"
246 | }, {
247 | "name" : "Seed",
248 | "type" : "string",
249 | "source" : "element Seed"
250 | }, {
251 | "name" : "PgenCounter",
252 | "type" : "string",
253 | "source" : "element PgenCounter"
254 | } ]
255 | }, "null" ],
256 | "source" : "element DSAKeyValue"
257 | }, {
258 | "name" : "RSAKeyValue",
259 | "type" : [ {
260 | "type" : "record",
261 | "name" : "RSAKeyValueType",
262 | "fields" : [ {
263 | "name" : "Modulus",
264 | "type" : "string",
265 | "source" : "element Modulus"
266 | }, {
267 | "name" : "Exponent",
268 | "type" : "string",
269 | "source" : "element Exponent"
270 | } ]
271 | }, "null" ],
272 | "source" : "element RSAKeyValue"
273 | }, {
274 | "name" : "others",
275 | "type" : {
276 | "type" : "map",
277 | "values" : "string"
278 | }
279 | } ]
280 | }, "null" ],
281 | "source" : "element KeyValue"
282 | }, {
283 | "name" : "RetrievalMethod",
284 | "type" : [ {
285 | "type" : "record",
286 | "name" : "RetrievalMethodType",
287 | "fields" : [ {
288 | "name" : "URI",
289 | "type" : [ "string", "null" ],
290 | "source" : "attribute URI"
291 | }, {
292 | "name" : "Type",
293 | "type" : [ "string", "null" ],
294 | "source" : "attribute Type"
295 | }, {
296 | "name" : "Transforms",
297 | "type" : [ "TransformsType", "null" ],
298 | "source" : "element Transforms"
299 | } ]
300 | }, "null" ],
301 | "source" : "element RetrievalMethod"
302 | }, {
303 | "name" : "X509Data",
304 | "type" : [ {
305 | "type" : "record",
306 | "name" : "X509DataType",
307 | "fields" : [ {
308 | "name" : "X509IssuerSerial",
309 | "type" : [ {
310 | "type" : "record",
311 | "name" : "X509IssuerSerialType",
312 | "fields" : [ {
313 | "name" : "X509IssuerName",
314 | "type" : "string",
315 | "source" : "element X509IssuerName"
316 | }, {
317 | "name" : "X509SerialNumber",
318 | "type" : "string",
319 | "source" : "element X509SerialNumber"
320 | } ]
321 | }, "null" ],
322 | "source" : "element X509IssuerSerial"
323 | }, {
324 | "name" : "X509SKI",
325 | "type" : [ "string", "null" ],
326 | "source" : "element X509SKI"
327 | }, {
328 | "name" : "X509SubjectName",
329 | "type" : [ "string", "null" ],
330 | "source" : "element X509SubjectName"
331 | }, {
332 | "name" : "X509Certificate",
333 | "type" : [ "string", "null" ],
334 | "source" : "element X509Certificate"
335 | }, {
336 | "name" : "X509CRL",
337 | "type" : [ "string", "null" ],
338 | "source" : "element X509CRL"
339 | }, {
340 | "name" : "others",
341 | "type" : {
342 | "type" : "map",
343 | "values" : "string"
344 | }
345 | } ]
346 | }, "null" ],
347 | "source" : "element X509Data"
348 | }, {
349 | "name" : "PGPData",
350 | "type" : [ {
351 | "type" : "record",
352 | "name" : "PGPDataType",
353 | "fields" : [ {
354 | "name" : "PGPKeyID",
355 | "type" : [ "string", "null" ],
356 | "source" : "element PGPKeyID"
357 | }, {
358 | "name" : "PGPKeyPacket0",
359 | "type" : [ "string", "null" ],
360 | "source" : "element PGPKeyPacket"
361 | }, {
362 | "name" : "others",
363 | "type" : {
364 | "type" : "map",
365 | "values" : "string"
366 | }
367 | } ]
368 | }, "null" ],
369 | "source" : "element PGPData"
370 | }, {
371 | "name" : "SPKIData",
372 | "type" : [ {
373 | "type" : "record",
374 | "name" : "SPKIDataType",
375 | "fields" : [ {
376 | "name" : "SPKISexp",
377 | "type" : "string",
378 | "source" : "element SPKISexp"
379 | }, {
380 | "name" : "others",
381 | "type" : {
382 | "type" : "map",
383 | "values" : "string"
384 | }
385 | } ]
386 | }, "null" ],
387 | "source" : "element SPKIData"
388 | }, {
389 | "name" : "MgmtData",
390 | "type" : [ "string", "null" ],
391 | "source" : "element MgmtData"
392 | }, {
393 | "name" : "others",
394 | "type" : {
395 | "type" : "map",
396 | "values" : "string"
397 | }
398 | } ]
399 | }, "null" ],
400 | "source" : "element KeyInfo"
401 | }, {
402 | "name" : "Object",
403 | "type" : {
404 | "type" : "array",
405 | "items" : {
406 | "type" : "record",
407 | "name" : "ObjectType",
408 | "fields" : [ {
409 | "name" : "Id",
410 | "type" : [ "string", "null" ],
411 | "source" : "attribute Id"
412 | }, {
413 | "name" : "MimeType",
414 | "type" : [ "string", "null" ],
415 | "source" : "attribute MimeType"
416 | }, {
417 | "name" : "Encoding",
418 | "type" : [ "string", "null" ],
419 | "source" : "attribute Encoding"
420 | }, {
421 | "name" : "others",
422 | "type" : {
423 | "type" : "map",
424 | "values" : "string"
425 | }
426 | } ]
427 | }
428 | },
429 | "source" : "element Object"
430 | } ]
431 | }, "null" ],
432 | "source" : "element Signature"
433 | }, {
434 | "name" : "Extensions",
435 | "type" : [ {
436 | "type" : "record",
437 | "name" : "ExtensionsType",
438 | "fields" : [ {
439 | "name" : "others",
440 | "type" : {
441 | "type" : "map",
442 | "values" : "string"
443 | }
444 | } ]
445 | }, "null" ],
446 | "source" : "element Extensions"
447 | }, {
448 | "name" : "Status",
449 | "type" : {
450 | "type" : "record",
451 | "name" : "StatusType",
452 | "fields" : [ {
453 | "name" : "StatusCode",
454 | "type" : {
455 | "type" : "record",
456 | "name" : "StatusCodeType",
457 | "fields" : [ {
458 | "name" : "Value",
459 | "type" : "string",
460 | "source" : "attribute Value"
461 | }, {
462 | "name" : "StatusCode",
463 | "type" : [ "StatusCodeType", "null" ],
464 | "source" : "element StatusCode"
465 | } ]
466 | },
467 | "source" : "element StatusCode"
468 | }, {
469 | "name" : "StatusMessage",
470 | "type" : [ "string", "null" ],
471 | "source" : "element StatusMessage"
472 | }, {
473 | "name" : "StatusDetail",
474 | "type" : [ {
475 | "type" : "record",
476 | "name" : "StatusDetailType",
477 | "fields" : [ {
478 | "name" : "others",
479 | "type" : {
480 | "type" : "map",
481 | "values" : "string"
482 | }
483 | } ]
484 | }, "null" ],
485 | "source" : "element StatusDetail"
486 | } ]
487 | },
488 | "source" : "element Status"
489 | }, {
490 | "name" : "others",
491 | "type" : {
492 | "type" : "map",
493 | "values" : "string"
494 | }
495 | } ]
496 | }
--------------------------------------------------------------------------------
/src/test/resources/temp:
--------------------------------------------------------------------------------
1 |
2 | CY_NAV_00211
3 | CY_20150901145144
4 | 2015-09-01T14:51:44+02:00
5 | CY_NAV
6 | I2029
7 | International Basket Level Sales
8 | POSLog
9 | 1
10 | 1
11 | 54.00
12 |
13 | 9482
14 | AKROPOLIS
15 |
16 |
17 |
18 |
19 |
20 | 19
21 | 34
22 | 2015-08-27T20:07:22+02:00
23 | 2015-08-27T20:07:06+02:00
24 | 2015-08-27
25 | 101
26 | EUR
27 | 000000012
28 | 0
29 | 2015-08-27T20:07:06+02:00
30 | 20318178
31 | T17/NU
32 | 20318178
33 |
34 | 30.00
35 | 27.00
36 | 54.00
37 | 2.00000
38 | 0
39 | 6.00
40 | 1
41 | 10.00000
42 |
43 | 13:5
44 | 1
45 | 2
46 | 60.00
47 | 2.00000
48 |
49 |
50 | 0
51 | 54.00
52 | 8.62
53 | 19.00000
54 | S
55 | CYS
56 | CYVAT
57 |
58 |
59 | 54.00
60 | 54.00
61 | False
62 | 1
63 | 0
64 | 11:3
65 | 2607
66 | 2801
67 |
68 |
69 |
70 | 1
71 | 2015-08-27T20:07:06+02:00
72 | 1
73 | 54.00
74 | 8.62
75 | 19.00000
76 | S
77 | CYS
78 | CYVAT
79 |
80 |
81 |
82 | 2
83 | 2015-08-27T20:07:06+02:00
84 | 138
85 | 54.00
86 | 15
87 |
88 |
89 |
90 | 54.00
91 | 45.38
92 | 8.62
93 |
94 |
95 | 2015-08-27T20:07:06+02:00
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 | CY_NAV_00211
104 | CY_20150901145144
105 | 2015-09-01T14:51:44+02:00
106 | CY_NAV
107 | I2029
108 | International Basket Level Sales
109 | POSLog
110 | 1
111 | 1
112 | 54.00
113 |
114 | 9482
115 | AKROPOLIS
116 |
117 |
118 |
119 |
120 |
121 | 19
122 | 34
123 | 2015-08-27T20:07:22+02:00
124 | 2015-08-27T20:07:06+02:00
125 | 2015-08-27
126 | 101
127 | EUR
128 | 000000012
129 | 0
130 | 2015-08-27T20:07:06+02:00
131 | 20318178
132 | T17/NU
133 | 20318178
134 |
135 | 30.00
136 | 27.00
137 | 54.00
138 | 2.00000
139 | 0
140 | 6.00
141 | 1
142 | 10.00000
143 |
144 | 13:5
145 | 1
146 | 2
147 | 60.00
148 | 2.00000
149 |
150 |
151 | 0
152 | 54.00
153 | 8.62
154 | 19.00000
155 | S
156 | CYS
157 | CYVAT
158 |
159 |
160 | 54.00
161 | 54.00
162 | False
163 | 1
164 | 0
165 | 11:3
166 | 2607
167 | 2801
168 |
169 |
170 |
171 | 1
172 | 2015-08-27T20:07:06+02:00
173 | 1
174 | 54.00
175 | 8.62
176 | 19.00000
177 | S
178 | CYS
179 | CYVAT
180 |
181 |
182 |
183 | 2
184 | 2015-08-27T20:07:06+02:00
185 | 138
186 | 54.00
187 | 15
188 |
189 |
190 |
191 | 54.00
192 | 45.38
193 | 8.62
194 |
195 |
196 | 2015-08-27T20:07:06+02:00
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 | CY_NAV_00211
205 | CY_20150901145144
206 | 2015-09-01T14:51:44+02:00
207 | CY_NAV
208 | I2029
209 | International Basket Level Sales
210 | POSLog
211 | 1
212 | 1
213 | 54.00
214 |
215 | 9482
216 | AKROPOLIS
217 |
218 |
219 |
220 |
221 |
222 | 19
223 | 34
224 | 2015-08-27T20:07:22+02:00
225 | 2015-08-27T20:07:06+02:00
226 | 2015-08-27
227 | 101
228 | EUR
229 | 000000012
230 | 0
231 | 2015-08-27T20:07:06+02:00
232 | 20318178
233 | T17/NU
234 | 20318178
235 |
236 | 30.00
237 | 27.00
238 | 54.00
239 | 2.00000
240 | 0
241 | 6.00
242 | 1
243 | 10.00000
244 |
245 | 13:5
246 | 1
247 | 2
248 | 60.00
249 | 2.00000
250 |
251 |
252 | 0
253 | 54.00
254 | 8.62
255 | 19.00000
256 | S
257 | CYS
258 | CYVAT
259 |
260 |
261 | 54.00
262 | 54.00
263 | False
264 | 1
265 | 0
266 | 11:3
267 | 2607
268 | 2801
269 |
270 |
271 |
272 | 1
273 | 2015-08-27T20:07:06+02:00
274 | 1
275 | 54.00
276 | 8.62
277 | 19.00000
278 | S
279 | CYS
280 | CYVAT
281 |
282 |
283 |
284 | 2
285 | 2015-08-27T20:07:06+02:00
286 | 138
287 | 54.00
288 | 15
289 |
290 |
291 |
292 | 54.00
293 | 45.38
294 | 8.62
295 |
296 |
297 | 2015-08-27T20:07:06+02:00
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 | CY_NAV_00211
306 | CY_20150901145144
307 | 2015-09-01T14:51:44+02:00
308 | CY_NAV
309 | I2029
310 | International Basket Level Sales
311 | POSLog
312 | 1
313 | 1
314 | 54.00
315 |
316 | 9482
317 | AKROPOLIS
318 |
319 |
320 |
321 |
322 |
323 | 19
324 | 34
325 | 2015-08-27T20:07:22+02:00
326 | 2015-08-27T20:07:06+02:00
327 | 2015-08-27
328 | 101
329 | EUR
330 | 000000012
331 | 0
332 | 2015-08-27T20:07:06+02:00
333 | 20318178
334 | T17/NU
335 | 20318178
336 |
337 | 30.00
338 | 27.00
339 | 54.00
340 | 2.00000
341 | 0
342 | 6.00
343 | 1
344 | 10.00000
345 |
346 | 13:5
347 | 1
348 | 2
349 | 60.00
350 | 2.00000
351 |
352 |
353 | 0
354 | 54.00
355 | 8.62
356 | 19.00000
357 | S
358 | CYS
359 | CYVAT
360 |
361 |
362 | 54.00
363 | 54.00
364 | False
365 | 1
366 | 0
367 | 11:3
368 | 2607
369 | 2801
370 |
371 |
372 |
373 | 1
374 | 2015-08-27T20:07:06+02:00
375 | 1
376 | 54.00
377 | 8.62
378 | 19.00000
379 | S
380 | CYS
381 | CYVAT
382 |
383 |
384 |
385 | 2
386 | 2015-08-27T20:07:06+02:00
387 | 138
388 | 54.00
389 | 15
390 |
391 |
392 |
393 | 54.00
394 | 45.38
395 | 8.62
396 |
397 |
398 | 2015-08-27T20:07:06+02:00
399 |
400 |
401 |
402 |
403 |
404 |
405 |
--------------------------------------------------------------------------------