├── .gitignore
├── .scalafmt.conf
├── .travis.yml
├── LICENSE
├── README.md
├── build.sbt
├── docker-compose.yml
├── project
│   ├── Dependencies.scala
│   ├── build.properties
│   ├── plugins.sbt
│   └── project
│       └── plugins.sbt
├── publish.sh
├── pubring.gpg.enc
├── resources
│   └── images
│       ├── 001.png
│       ├── 002.png
│       ├── 003.png
│       ├── 004.png
│       ├── 005.png
│       ├── 006.png
│       ├── 007.png
│       ├── 008.png
│       ├── schemer-logo-text-wide.png
│       ├── schemer-logo-text-wide.svg
│       ├── schemer-logo-text.png
│       ├── schemer-logo-text.svg
│       ├── schemer-logo.png
│       └── schemer-logo.svg
├── schemer-core
│   └── src
│       ├── main
│       │   └── scala
│       │       └── schemer
│       │           ├── AvroSchema.scala
│       │           ├── CSVSchema.scala
│       │           ├── JSONSchema.scala
│       │           ├── ParquetSchema.scala
│       │           ├── SchemaLike.scala
│       │           ├── Schemer.scala
│       │           └── utils
│       │               ├── JSONUtil.scala
│       │               └── JsonSchemaValidationUtil.scala
│       └── test
│           ├── resources
│           │   ├── test.csv
│           │   ├── test.json
│           │   └── test.tsv
│           └── scala
│               └── schemer
│                   ├── AvroSchemaSpec.scala
│                   ├── CSVSchemaSpec.scala
│                   ├── Helpers.scala
│                   ├── JSONSchemaSpec.scala
│                   └── ParquetSchemaSpec.scala
├── schemer-registry
│   └── src
│       └── main
│           ├── resources
│           │   ├── application.conf
│           │   ├── aws-core-site.xml
│           │   ├── db
│           │   │   └── migration
│           │   │       └── V1__creates_schemas.sql
│           │   └── graphql
│           │       └── graphiql.html
│           └── scala
│               └── schemer
│                   └── registry
│                       ├── actors
│                       │   └── InferActor.scala
│                       ├── dao
│                       │   └── SchemaDao.scala
│                       ├── exceptions
│                       │   ├── SchemerException.scala
│                       │   ├── SchemerInferenceException.scala
│                       │   ├── SchemerSchemaCreationException.scala
│                       │   └── SchemerSchemaVersionCreationException.scala
│                       ├── graphql
│                       │   ├── CustomGraphQLResolver.scala
│                       │   ├── GraphQLService.scala
│                       │   └── schema
│                       │       ├── GraphQLCustomTypes.scala
│                       │       ├── InferType.scala
│                       │       ├── MetadataType.scala
│                       │       ├── MutationType.scala
│                       │       ├── SchemaDefinition.scala
│                       │       └── SchemaType.scala
│                       ├── models
│                       │   └── Schema.scala
│                       ├── package.scala
│                       ├── routes
│                       │   ├── GraphQLRoutes.scala
│                       │   ├── HealthRoutes.scala
│                       │   ├── Routes.scala
│                       │   └── SwaggerRoutes.scala
│                       ├── server
│                       │   ├── ConfigWithDefault.scala
│                       │   ├── InferenceConfig.scala
│                       │   ├── Main.scala
│                       │   ├── Modules.scala
│                       │   └── ServerConfig.scala
│                       ├── sql
│                       │   ├── DatabaseConfig.scala
│                       │   ├── SqlDatabase.scala
│                       │   └── package.scala
│                       └── utils
│                           ├── Clock.scala
│                           └── DateTimeUtils.scala
├── schemer-ui.md
├── secring.gpg.enc
└── sonatype.sbt
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | target/
3 | *.log
4 | schemer_db/
5 |
--------------------------------------------------------------------------------
/.scalafmt.conf:
--------------------------------------------------------------------------------
1 | style = defaultWithAlign
2 | maxColumn = 120
3 | align.openParenCallSite = false
4 | align.openParenDefnSite = false
5 | danglingParentheses = true
6 |
7 | rewrite.rules = [RedundantBraces, RedundantParens, SortImports, PreferCurlyFors]
8 | rewrite.redundantBraces.includeUnitMethods = true
9 | rewrite.redundantBraces.stringInterpolation = true
10 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.11.11
4 | jdk:
5 | - oraclejdk8
6 | sudo: required
7 | services:
8 | - docker
9 | before_cache:
10 | - find $HOME/.sbt -name "*.lock" | xargs rm
11 | - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm
12 | cache:
13 | directories:
14 | - "$HOME/.ivy2/cache"
15 | - "$HOME/.sbt/boot/"
16 | before_deploy:
17 | - openssl aes-256-cbc -pass pass:$ENCRYPTION_PASSWORD -in secring.gpg.enc -out local.secring.gpg
18 | -d
19 | - openssl aes-256-cbc -pass pass:$ENCRYPTION_PASSWORD -in pubring.gpg.enc -out local.pubring.gpg
20 | -d
21 | deploy:
22 | - provider: script
23 | script: "./publish.sh"
24 | skip_cleanup: true
25 | on:
26 | tags: true
27 | jdk: oraclejdk8
28 | scala: 2.11.11
29 | env:
30 | global:
31 | - BUILD_LABEL=1.0.${TRAVIS_BUILD_NUMBER}
32 | - secure: KUG4LdZBUn11TWMysOvF7jbwkHZzWRUIJqz6HlcXEOIdN4rzt2An0L2KB9L5L+DG6M2e0aHyl6ajeopSF8SYV3OYmPe2RKO2pJifhfn9Z301gpV3nKQKp2maZHTeoOSeWNOHV1APJAd5WyHsmPp1zKGHc7ePLcse9f45KuK0YxOnM+i1APUzWsRdS3PSaDfByP++LYNh4V8Bz4Xl75TgLs/kUtY2xxmFakWZe0PE87qjdadAI42Q7/b6sWHb17WNIzMg/8Vfn//C7XsQjmhXS5dwa2JilyFTZKLJ0N89h2KoMGgF4rJQWS7DLo5mFccqnAvx1iN4gZvzmvyQgZ2JCK4QxAxxC1umcJvJ4Yf7pzZaHqMrthCLz8eNixFSYtoCskJzyrHyow99MhOTnv9NcsOZxdT25j20wI2E2JxoYRDLOZcBd7xSiXCmyzQE3/V+FEaqCUcY2dFHdA8V6GnOmetycgxyWd23S7dCM4IQEB8UAnCM/UgIWGIhTAMbxUtJKEIMfA+VVQwJUPvKNkVB2HjUUK5q+//+B9H9WJrvR16urcyC8MpgWJWex5J7mwlvs1999t1S/EUPUmHlJQX7XQuN5kSOyQ2B+L+r0FW8LyHx4ffxy1qW9x9kFzGsWNKTKJd3PWJONKlcdWLYU/T56EFBzWZwPt74sVCRhtQ0G30=
33 | - secure: 2hLgeG61N9TZ8xNRSMCMlMiVatsoxsTT6Gj1ohbZC2cMTsEmMVPvYxOkBrjqu2VsJBcmyyiKqw8LX1FQBvU3eDIOss+mIQndjTvJIZIw1jx2KK6TNliT4nK0sORyHFDAmXR43AZPjpJVsg28uR80/Nwr6+7OoZ6Cy8kA2w+iO33vW2V/jXhD3H1XQRd+T7NE9RQigcY7OVjq1SqHE+6sH8CVQL94cqzV7fhHVeC5oBoze4bn+KcmK/iRl6zW9cXlyS+kjhPkVrS0NOADcXnCjUkkw+5N1TWSABXSBBO3VxkuE6W311T/NeNYFDP7CrjzrH1XEyTXjf9A3RVyJVRIEaDX4SHadb6eLRWSAgTO1d6tRxl8IcsnDoYghiMw61ggx4tkV6OQa5nF0U+0GJOYC/HLnqSIkv7/YM6LMp9AprWlWD+wB9wQ4l/09Ssed7hz/VoWnL7ymDIj/fJnsFnoOIJebciAo2pLWyftjjBIxEnQbGVyXSRdXVgxKpTyIeQY57xZPBugi4Q2KmBrLl+l2ean49eb827mMhM8FBdyLqbM8FKFln8E7Hokzw835/HKn23pqzLGO8/H/6rHOBkqwmjoL06FbiqHZyV16XCr0Jy/Bh6xoQaCYoE8ZhcTYzeBkTDzJIcfXqWIvNRUQzwHu+EogEXju/3c5G/2nITdz+g=
34 | - secure: yyiWyS++Q/ZUvRcRRi8Md0u7VqwQvA74cn57eSLJuUFEHsr4fDjVqkA9HhwUCWdCJC5UIm72c9gqkuyVjq9Vxb3M+YPF6h+sCsOfH3bDowq8ozWBIC1W8giORNAoANVcaFnDgd+9RkXqkIAarVgyff1r5Cdks5EfQ0lsCnaphdyrO0h1jQx8A5UMCZU6za52NDviyn92TUH5Z+XEBJzWg7/WV18zfr8/5VPK5ErYzcNk6s9Gq8WfLtf3b9aCdOkjBB1HzqB/pqEMOfxtERjSBtw66lB5Fzn6xk0O0cn4jfzyI/DpFfzi9Ecwuwm/kMX+ZSPUOe/riEt5/D5wJKEMio0G2Zt77Ulkq72ed2h801gLzjuNLvN3b+zk38Va0bPgWGARFVGVbAytve1xcpCJxQ8ZG+hgFq95WoTYguWUT2ny3xo+0aZm+r/jqT8o05p/UjbzPsTAeTsL0m8Qxr/+JPwvuLZIekzIg23JBe5PPHaIp5PdgYfCv+s02TdZNGEIYa9K3jo6yUsnZXClJaZov9tmmkvyw95/2bhgZfWn5dDqyNu/YrpsQ9k+sDfC01UmXXHl10jAWBy+wjAYw2F7UTR9/MVXC041j6vZmOrfVLdYvXvJslmVjR2qUq9zBrF6uZIOCaARMoy8YCcJFTruHFemdUJ0ElCOtrfpuZkDnkY=
35 | - secure: vvDoGnUlOIDqvN1+zKjxXUeOCEQ9AFURhXV1y8lUDBqZXHshoEV6MZKe7cL2vkBrCGLgEAOlX7vplW44spIxkL78zLtyxmUCsla4VcCT41iYkrBCM4LOKlzsKM3U0xoo76EP9zYXul6O/vIL9W9RvL78RyiC3C4vC8tUahuoxaWpyYvOFcKEI7sVf1O7/HeAKqaRRitsAuk8nfZDkTMU8FRjvxp2aY7I3hGaQVVMKe3lpns3R7T74FuQvnkfyr7T/O0/GnGKJELl8lBtdQlsWoZOXYQL7ZzaEWVjFsmWTNqtKU7DMv9j5eC/4JWqnQK2A3Eda0bPx0sgbymM79IAqBw5fwauyfUFMOsvmA/bEDoDF8cD/moPJmZBkxa2HC1DKgpsWE46fFJqqTZKaaaDZulkviGfqBUbwmZqGDlm1Xo3vrYBCOm1tXgzkuPRoksoGN3K3u2gUL9IeE0kiR0xx0qp62R/Kh51cycY1/w8yaNNOfB+1YdcMb7wv3SWGWohE0R/ke7x5luOvV5TCyqOJjmQb1OkzT4AeqACYjTBK6JhgcE5XqsveaF4NuKSA6CtfV19jgyFDEKjU8202SWeLzQew/RFFTSUea3xD/PfBQcKkGizfmCHF1tJmEqFgP09u7uqt0X8q3xls3882CGU46MB6GNmkB+nwXu2DCD1WyQ=
36 | - secure: loYtA4a1MMNZDyezwFf/2PNkJeli0E9mpmzeHeWQ1oB5+CVHE/vruOhXPHKDBU/RUBqM2Y09d00fA1Jz0N+0KFhj2nAJDZz2UR6LysOo56Xps4E+NSmMd72qntQtwP3qmWa6kqUYjr31y4i4O2zwlG/gRmq1BNBoouco3BIat8CMOram3WKgjyQSnvLIq9jtrq45uRc0po+nCzqnyGwmwG2sZKkajPTQpO4VjTPu+fmW8bbGhSC62UI2JpWnWev/8CNKaERvzs7s4iwf2ksMoS55iQSyixMp5k1L/qKtee6dYkbZli8gSuhdyNYaIIzTfV42YEUT7Jmwdxcw24sdgHcrCcQNtnVAGvW/EyMq+f1lwoJTabekfAcij30W+lwShnbS/69itl7gozm5dcClmihrtMUgQbb50b8Err+HHayy8xGDipCvtpoAku9NrSG+aX5BZA6BXmZw0VfUiPZXMQFMWA/n40cqzZclPb3cKQSbDd0WfukkmL/nWh2RKxyOLZgH0FLgzoDt3U8pUdR2YmDqZidUoVG9Iz4Lt5c7CTzNSoEaUhWiSFC+tytwn8OPzYq1uJtwv8lPaWvqQA/5+c9Tni5G2LsfRrrBlxSkHkCWqPuhVze5ByxLRank7CBOwBjLOJmv9tZt4NJX7YJhdYAs/jV7ya4jJBVmw1zUdhk=
37 | - secure: G0Wv6Odzqh6GrkyxbV6v8Xq1seQD97rX9C+NfuzwTAh2MVUopnPZGh/Oakiwdb/kTAIpb70dKrZaYqv+mTz/1oTQMg4T2e1OAgItZwR/d6SB/cOKjkxtFgI/q0N5/ukw1N5LE7LrTwoBWtoYIdTO4EqoxTcTkbx+VJczzMAlD3VRICzMsvpxXoOw0BkvESTxEU/yrX3AxRA5VziVvFeSb8cpmgvNYAWmlczaujChPedWOWoO0Z2lFhFU8LYBBzsIwBSIOd6f93vlOCOGQtYOCx4FybRkHs7B1Yjihfz68prR047vDvuT4y+evI9xIpvk3AjN2zkV92EXCo26/fqbepooiQ2cVSpQs+RtTT5P2jPasWFwm1K9G6a/5Ir1dFB7CXKl9EmEUxBnRR0+ZWYWssVc6sTduxvSpMx80CBlWWsHkSlbD/by5fWQptgQp4DYG63Pifc+MuPEHjhl3z1PvyhtccWtGbrYsCNoWlGRT497E8QaBQ8BValqg9sgZ/He9RzzHB6UBve8co92W8rgfOdE+JlgJEr+oIkbB5jnnJIGqC6cDW10aWfjCzIotfmk/MACvaaLTkXT4U786iktf0AR06kPOKssFGrvrqSFqui/ugbHsggBYbfgRC0D76VLQdBPFSuJNplgJamGvFGeoQHk6v+v08dNrQ70S/qp894=
38 | - secure: q5a1a4lDIQLaU32xilGkP91zGiKqW+m5KwuzwAhAIirmHZ9sD6cky5zoOPc6oYCeE4UGIowv9gkGJyMPMD+Jl7pom47/FfmvsS/pcMCzasgWueEsjF2Y9TRHdRPyeGFejJzLgOWu78ss8HOobmlHZPrXzl3Zxx+HkLfwFJ23bH7IIyBYmhC54XGCBuIxU/sbkUctCICOUNXxpzLey63dV/O4cGLzYkpXMtS/6CbwwQj/Hj+TKzaaonqeOAMY4hPj9WE0Vc/QnhstGGHHDIAhBYkNdvs1NLNvxRa1R0i+uGGjnQVhWXOfrKnFhn5qWHOYXvk58f+iW2Ey69Tg21iqOA9SBgrjtDUQLRhP9iAQJbu6EevKLFZcXqfeFJtDfDiUTxpycLWj8GD4k+sz40O2DcIGiPRvzI5v1ZN87baeojUGCjP7I3c2Mv4XigylZzhPJ7m9rPV/G5WG8aKaK/qUpJ7ynl4XpFAMcC7QGb+eQXOw6BqFMhDw4sGR2qu3LQ/flxmpzXZPkHyLK6m2oSGXVbk0w2UvO9pnF+XillQ4YOzB1O+OqbqjZtDu+ZmrB4NK/QzvyGyUS6C/7ancDdlgHZhcoLw50jp27msD+JBv5NcR0y1IAvPwwWqiyg/teaOxl/hZ80BT0AwpC0nNc83UbORGYGcLr6o46OoZNP5xlQ4=
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # schemer
2 | [](https://travis-ci.org/indix/schemer) [](http://repo1.maven.org/maven2/com/indix/schemer-core_2.11/) [](https://hub.docker.com/r/indix/schemer-registry/)
3 |
4 |
5 |
6 |
7 |
8 | Schema registry with support for CSV, TSV, Avro, JSON and Parquet. It can also infer schemas from a given data source.
9 |
10 | ## Schemer UI [WIP]
11 |
12 |
13 |
14 |
15 |
16 | Schemer UI is the wizard-based frontend for Schemer. It provides a guided schema creation and versioning workflow, along with browsing and search capabilities. It is a work in progress. [More screens](schemer-ui.md)
17 |
18 | ## Schemer Core
19 |
20 | `schemer-core` is the core library that implements most of the logic needed to work with the supported schema types, along with schema inference. To use `schemer-core` directly, add it to your dependencies:
21 |
22 | ```
23 | libraryDependencies += "com.indix" %% "schemer-core" % "0.2.3"
24 | ```
25 |
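Once the dependency is on the classpath, inference and loading are driven from a `SparkSession`. Below is a minimal sketch using the CSV schema type and the API shown in `schemer-core`; the file path and Spark setup are placeholders:

```scala
import org.apache.spark.sql.SparkSession
import schemer.{CSVOptions, CSVSchema}

object InferExample extends App {
  // Inference reads the data with Spark, so an implicit SparkSession is required.
  implicit val spark: SparkSession = SparkSession.builder
    .master("local[*]")
    .getOrCreate()

  // Infer a CSV schema from a sample file (placeholder path).
  val schema = CSVSchema(CSVOptions(header = true)).infer("data/sample.csv")

  // The resulting schema can be validated, converted to a Spark StructType,
  // or used to load the data back as a DataFrame.
  println(schema.validate)
  println(schema.sparkSchema())
  schema.toDf("data/sample.csv").show()
}
```
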
26 | ## Schemer Registry
27 |
28 | `schemer-registry` is a schema registry for storing metadata about schemas and schema versions. It provides a GraphQL API for adding, viewing and inferring schemas.
29 |
30 | Schemer Registry is available as a [Docker image on Docker Hub](https://hub.docker.com/r/indix/schemer-registry/)
31 |
32 | ### Running Locally
33 |
34 | A local Docker-based PostgreSQL instance can be started as follows:
35 |
36 | ```
37 | docker run -e POSTGRES_USER=schemer -e POSTGRES_PASSWORD=schemer -e PGDATA=/var/lib/postgresql/data/pgdata -e POSTGRES_DB=schemer -v $(pwd)/schemer_db:/var/lib/postgresql/data/pgdata -p 5432:5432 postgres:9.5.0
38 | ```
39 |
40 | Remove the `schemer_db` folder to clear all data and start from scratch.
41 |
42 | The registry service can be run using `sbt`:
43 |
44 | ```bash
45 | sbt "project registry" ~reStart
46 | ```
47 |
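Alternatively, the bundled `docker-compose.yml` brings up both PostgreSQL and the `indix/schemer-registry` image together with a single `docker-compose up`.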
48 |
49 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | import Dependencies._
2 | import com.typesafe.sbt.packager.Keys.{daemonUser, dockerBaseImage, dockerExposedPorts, dockerRepository, packageName}
3 | import spray.revolver.RevolverPlugin
4 | import spray.revolver.RevolverPlugin.autoImport.Revolver
5 |
6 | val libVersion = sys.env.get("TRAVIS_TAG") orElse sys.env.get("BUILD_LABEL") getOrElse s"1.0.0-${System.currentTimeMillis / 1000}-SNAPSHOT"
7 |
8 | lazy val publishSettings = Seq(
9 | publishMavenStyle := true,
10 | pgpSecretRing := file("local.secring.gpg"),
11 | pgpPublicRing := file("local.pubring.gpg"),
12 | pgpPassphrase := Some(sys.env.getOrElse("GPG_PASSPHRASE", "").toCharArray),
13 | credentials += Credentials(
14 | "Sonatype Nexus Repository Manager",
15 | "oss.sonatype.org",
16 | System.getenv("SONATYPE_USERNAME"),
17 | System.getenv("SONATYPE_PASSWORD")
18 | ),
19 | publishTo := {
20 | val nexus = "https://oss.sonatype.org/"
21 | if (isSnapshot.value)
22 | Some("snapshots" at nexus + "content/repositories/snapshots")
23 | else
24 | Some("releases" at nexus + "service/local/staging/deploy/maven2")
25 | },
26 | publishArtifact in Test := false,
27 | pomIncludeRepository := { _ =>
28 | false
29 | },
30 | pomExtra :=
31 |   <url>https://github.com/indix/schemer</url>
32 |     <licenses>
33 |       <license>
34 |         <name>Apache License</name>
35 |         <url>https://raw.githubusercontent.com/indix/schemer/master/LICENSE</url>
36 |         <distribution>repo</distribution>
37 |       </license>
38 |     </licenses>
39 |     <scm>
40 |       <url>git@github.com:indix/schemer.git</url>
41 |       <connection>scm:git:git@github.com:indix/schemer.git</connection>
42 |     </scm>
43 |     <developers>
44 |       <developer>
45 |         <id>indix</id>
46 |         <name>Indix</name>
47 |         <url>http://www.indix.com</url>
48 |       </developer>
49 |     </developers>
50 | )
51 |
52 | lazy val schemer = Project(
53 | id = "schemer",
54 | base = file(".")
55 | ) aggregate (core, registry)
56 |
57 | lazy val core = (project in file("schemer-core"))
58 | .settings(
59 | inThisBuild(
60 | List(
61 | organization := "com.indix",
62 | scalaVersion := "2.11.11",
63 | crossScalaVersions := Seq("2.11.11"),
64 | version := libVersion,
65 | scalafmtOnCompile := true
66 | )
67 | ),
68 | name := "schemer-core",
69 | libraryDependencies ++= sparkStackProvided ++ Seq(jsonSchemaValidator, scalaTest)
70 | )
71 | .settings(publishSettings: _*)
72 |
73 | lazy val registry = (project in file("schemer-registry"))
74 | .enablePlugins(BuildInfoPlugin)
75 | .enablePlugins(AshScriptPlugin)
76 | .enablePlugins(JavaAppPackaging)
77 | .enablePlugins(DockerPlugin)
78 | .settings(
79 | dockerBaseImage := "anapsix/alpine-java:8u131b11_server-jre_unlimited",
80 | packageName in Docker := "schemer-registry",
81 | dockerExposedPorts := Seq(9000),
82 | version in Docker := libVersion,
83 | daemonUser in Docker := "root",
84 | dockerRepository := Some("indix"),
85 | Revolver.enableDebugging(port = 5005, suspend = false)
86 | )
87 | .settings(
88 | inThisBuild(
89 | List(
90 | organization := "com.indix",
91 | scalaVersion := "2.11.11",
92 | version := libVersion,
93 | scalafmtOnCompile := true
94 | )
95 | ),
96 | name := "schemer-registry",
97 | libraryDependencies ++= sparkStack ++ akkaStack ++ loggingStack ++ Seq(
98 | hadoopAws,
99 | sangria,
100 | sangriaSpray,
101 | postgres,
102 | quill,
103 | quillAsyncPostgres,
104 | flyway,
105 | prometheusClient,
106 | prometheusClientCommon,
107 | prometheusHotspot,
108 | scalaTest
109 | ),
110 | excludeDependencies ++= Seq(
111 | ExclusionRule("com.typesafe.scala-logging", "scala-logging-slf4j_2.11")
112 | )
113 | ) dependsOn core
114 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | services:
4 | postgres:
5 | image: postgres:9.5.0
6 | ports:
7 | - 5432:5432
8 | environment:
9 | - POSTGRES_USER=schemer
10 | - POSTGRES_PASSWORD=schemer
11 | - PGDATA=/var/lib/postgresql/data/pgdata
12 | - POSTGRES_DB=schemer
13 | volumes:
14 | - ./schemer_db:/var/lib/postgresql/data/pgdata
15 | schemer:
16 | image: indix/schemer-registry:latest
17 | restart: always
18 | ports:
19 | - 9000:9000
20 | depends_on:
21 | - postgres
22 | environment:
23 | - POSTGRES_URL=postgresql://postgres:5432/schemer?user=schemer&password=schemer
24 |
25 |
--------------------------------------------------------------------------------
/project/Dependencies.scala:
--------------------------------------------------------------------------------
1 | import sbt.{ExclusionRule, _}
2 |
3 | object Versions {
4 | val sparkVersion = "2.3.1"
5 | val akkaHttpVersion = "10.0.10"
6 | }
7 |
8 | object Dependencies {
9 | lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.0.3" % Test
10 | lazy val sparkCore = "org.apache.spark" %% "spark-core" % Versions.sparkVersion
11 | lazy val sparkSql = "org.apache.spark" %% "spark-sql" % Versions.sparkVersion
12 | lazy val sparkAvro = "com.databricks" %% "spark-avro" % "4.0.0"
13 |
14 | lazy val sparkStack = Seq(sparkCore, sparkSql, sparkAvro)
15 | lazy val sparkStackProvided = sparkStack.map(_ % Provided)
16 |
17 | lazy val hadoopAws = "org.apache.hadoop" % "hadoop-aws" % "2.6.0"
18 |
19 | lazy val jsonSchemaValidator = "com.github.fge" % "json-schema-validator" % "2.2.6" excludeAll {
20 | ExclusionRule("javax.mail")
21 | }
22 |
23 | lazy val prometheusClient = "io.prometheus" % "simpleclient" % "0.2.0"
24 | lazy val prometheusClientCommon = "io.prometheus" % "simpleclient_common" % "0.2.0"
25 | lazy val prometheusHotspot = "io.prometheus" % "simpleclient_hotspot" % "0.2.0"
26 |
27 | lazy val akkaHttpCore = "com.typesafe.akka" %% "akka-http-core" % Versions.akkaHttpVersion
28 | lazy val akkaHttp = "com.typesafe.akka" %% "akka-http" % Versions.akkaHttpVersion
29 | lazy val sprayJsonAkka = "com.typesafe.akka" %% "akka-http-spray-json" % Versions.akkaHttpVersion
30 | lazy val akkaHttpTestkit = "com.typesafe.akka" %% "akka-http-testkit" % Versions.akkaHttpVersion % Test
31 | lazy val akkaStack =
32 | Seq(akkaHttpCore, akkaHttp, sprayJsonAkka, akkaHttpTestkit)
33 |
34 | lazy val sangria = "org.sangria-graphql" %% "sangria" % "1.2.0"
35 | lazy val sangriaSpray = "org.sangria-graphql" %% "sangria-spray-json" % "1.0.0"
36 |
37 | val logbackClassic = "ch.qos.logback" % "logback-classic" % "1.2.3"
38 | val scalaLogging = "com.typesafe.scala-logging" %% "scala-logging" % "3.7.2"
39 |
40 | val loggingStack = Seq(logbackClassic, scalaLogging)
41 |
42 | val postgres = "org.postgresql" % "postgresql" % "9.4.1208"
43 | val quill = "io.getquill" %% "quill-jdbc" % "2.3.1"
44 | val quillAsyncPostgres = "io.getquill" %% "quill-async-postgres" % "2.3.1"
45 | val flyway = "org.flywaydb" % "flyway-core" % "4.1.1"
46 | }
47 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.0.4
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.lucidchart" % "sbt-scalafmt" % "1.12")
2 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.7.0")
3 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.2")
4 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0")
5 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.0")
6 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0")
7 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.9.1")
8 | scalafmtOnCompile in ThisBuild := true
--------------------------------------------------------------------------------
/project/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.lucidchart" % "sbt-scalafmt" % "1.12")
--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -ex
4 |
5 | sbt "project core" +publishSigned
6 | sbt sonatypeReleaseAll
7 |
8 | docker login -u "$DOCKER_USERNAME" -p "$DOCKER_PASSWORD"
9 | sbt docker:publishLocal
10 | docker push indix/schemer-registry:${TRAVIS_TAG}
11 | docker tag indix/schemer-registry:${TRAVIS_TAG} indix/schemer-registry:latest
12 | docker push indix/schemer-registry:latest
13 |
--------------------------------------------------------------------------------
/pubring.gpg.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/pubring.gpg.enc
--------------------------------------------------------------------------------
/resources/images/001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/001.png
--------------------------------------------------------------------------------
/resources/images/002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/002.png
--------------------------------------------------------------------------------
/resources/images/003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/003.png
--------------------------------------------------------------------------------
/resources/images/004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/004.png
--------------------------------------------------------------------------------
/resources/images/005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/005.png
--------------------------------------------------------------------------------
/resources/images/006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/006.png
--------------------------------------------------------------------------------
/resources/images/007.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/007.png
--------------------------------------------------------------------------------
/resources/images/008.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/008.png
--------------------------------------------------------------------------------
/resources/images/schemer-logo-text-wide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo-text-wide.png
--------------------------------------------------------------------------------
/resources/images/schemer-logo-text-wide.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo-text-wide.svg
--------------------------------------------------------------------------------
/resources/images/schemer-logo-text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo-text.png
--------------------------------------------------------------------------------
/resources/images/schemer-logo-text.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo-text.svg
--------------------------------------------------------------------------------
/resources/images/schemer-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo.png
--------------------------------------------------------------------------------
/resources/images/schemer-logo.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo.svg
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/AvroSchema.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 | import java.io.IOException
3 |
4 | import com.databricks.spark.avro.SchemaConverters
5 | import org.apache.avro.Schema.Parser
6 | import org.apache.avro.SchemaBuilder
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.types.StructType
9 |
10 | import scala.util.Random
11 |
12 | case class AvroSchemaBase() extends SchemaLikeBase[AvroSchema] {
13 | override def infer(paths: String*)(implicit spark: SparkSession) = {
14 | val schema = spark.read.format("com.databricks.spark.avro").load(paths: _*).schema
15 |
16 | AvroSchema(schema)
17 | }
18 | }
19 |
20 | case class AvroSchema(schema: String) extends SchemaLike {
21 |
22 | private def avroSchema() = new Parser().parse(schema)
23 |
24 | override def validate =
25 | try {
26 | sparkSchema()
27 | List.empty
28 | } catch {
29 | case e: IOException => List(s"Error while consuming Avro schema: ${e.getMessage}")
30 | }
31 |
32 | override def sparkSchema() = SchemaConverters.toSqlType(avroSchema()).dataType.asInstanceOf[StructType]
33 |
34 | override def toDf(paths: String*)(implicit spark: SparkSession) =
35 | spark.read.format("com.databricks.spark.avro").load(paths: _*)
36 | }
37 |
38 | object AvroSchema {
39 | def apply(): AvroSchemaBase = AvroSchemaBase()
40 |
41 | def apply(schema: StructType): AvroSchema =
42 | apply(schema, s"SchemerInferred_${Random.alphanumeric take 12 mkString ""}", "schemer")
43 |
44 | def apply(schema: StructType, record: String, namespace: String): AvroSchema = {
45 | val builder = SchemaBuilder.record(record).namespace(namespace)
46 | val avroSchema = SchemaConverters.convertStructToAvro(schema, builder, namespace).toString(true)
47 | new AvroSchema(avroSchema)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/CSVSchema.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import com.fasterxml.jackson.annotation.JsonProperty
4 | import org.apache.spark.sql.types._
5 | import org.apache.spark.sql.{DataFrame, SparkSession}
6 | import schemer.utils.JSONUtil
7 |
8 | case class CSVOptions(
9 | header: Boolean = true,
10 | headerBasedParser: Boolean = false,
11 | separator: String = ",",
12 | quoteChar: String = "\"",
13 | escapeChar: String = "\\"
14 | )
15 |
16 | case class CSVSchemaBase(csvOptions: CSVOptions) extends SchemaLikeBase[CSVSchema] {
17 | override def infer(paths: String*)(implicit @transient spark: SparkSession) = {
18 | val schema = spark.read
19 | .option("header", csvOptions.header.toString)
20 | .option("delimiter", csvOptions.separator)
21 | .option("quote", csvOptions.quoteChar)
22 | .option("escape", csvOptions.escapeChar)
23 | .option("nullValue", null)
24 | .option("inferSchema", "true")
25 | .csv(paths: _*)
26 | .schema
27 |
28 | CSVSchema(schema, csvOptions)
29 | }
30 | }
31 |
32 | case class CSVSchema(
33 | @JsonProperty(required = true) fields: List[CSVField],
34 | options: CSVOptions = CSVOptions()
35 | ) extends SchemaLike {
36 |
37 | override def validate: List[String] =
38 | validateFields ++ validateMetaFields
39 |
40 | override def sparkSchema() = {
41 | val structFields = this.fields.map(field => StructField(field.name, getDataType(field.`type`), field.nullable))
42 | StructType(structFields)
43 | }
44 |
45 | def toDf(paths: String*)(implicit @transient spark: SparkSession) = {
46 | val csvDF = spark.read
47 | .option("delimiter", options.separator)
48 | .option("quote", options.quoteChar)
49 | .option("escape", options.escapeChar)
50 | .option("nullValue", null)
51 | .csv(paths: _*)
52 | val orderedSchema = reconcileSchemaFieldOrder(sparkSchema(), csvDF)
53 |
54 | spark.read
55 | .option("header", options.header.toString)
56 | .option("delimiter", options.separator)
57 | .option("quote", options.quoteChar)
58 | .option("escape", options.escapeChar)
59 | .option("nullValue", null)
60 | .schema(orderedSchema)
61 | .csv(paths: _*)
62 | }
63 |
64 | private def reconcileSchemaFieldOrder(sparkSchema: StructType, csvDF: DataFrame) =
65 | if (options.headerBasedParser && options.header) {
66 | val actualHeaders = csvDF
67 | .first()
68 | .toSeq
69 | .map(_.toString)
70 | StructType(actualHeaders.map(field => sparkSchema(sparkSchema.fieldIndex(field))))
71 | } else {
72 | sparkSchema
73 | }
74 |
75 | private def getDataType(csvFieldType: String) =
76 | csvFieldType.toLowerCase match {
77 | case "int" | "integer" => IntegerType
78 | case "long" => LongType
79 | case "double" => DoubleType
80 | case "float" => FloatType
81 | case "string" => StringType
82 | case "datetime" => DateType
83 | case "boolean" => BooleanType
84 | case _ => StringType
85 | }
86 |
87 | private def validateFields =
88 | if (fields.nonEmpty) {
89 | List.empty
90 | } else {
91 | List("fields can't be empty in a CSVSchema")
92 | }
93 |
94 | private def validateMetaFields =
95 | if (options.header && fields.exists(_.position.isEmpty)) {
96 | List("CSVSchema with hasHeader=false should have valid position numbers on all fields")
97 | } else {
98 | List.empty
99 | }
100 |
101 | override def schema() =
102 | JSONUtil.toJson(this)
103 | }
104 |
105 | object CSVSchema {
106 | def apply(schema: String): CSVSchema =
107 | JSONUtil.fromJson[CSVSchema](schema)
108 |
109 | def apply(options: CSVOptions): CSVSchemaBase =
110 | CSVSchemaBase(options)
111 |
112 | def apply(): CSVSchemaBase =
113 | CSVSchemaBase(CSVOptions())
114 | def apply(
115 | schema: StructType,
116 | options: CSVOptions
117 | ): CSVSchema = {
118 | val fields = schema.fields.zipWithIndex.map {
119 | case (f: StructField, i: Int) => CSVField(f.name, f.nullable, getCsvType(f.dataType), Some(i))
120 | }.toList
121 |
122 | new CSVSchema(fields, options)
123 | }
124 |
125 | def apply(
126 | schema: StructType,
127 | options: Map[String, String]
128 | ): CSVSchema = {
129 | val fields = schema.fields.zipWithIndex.map {
130 | case (f: StructField, i: Int) => CSVField(f.name, f.nullable, getCsvType(f.dataType), Some(i))
131 | }.toList
132 |
133 | val csvOptions = CSVOptions(
134 | options.getOrElse("header", "true").toBoolean,
135 | options.getOrElse("headerBasedParser", "true").toBoolean,
136 | options.getOrElse("separator", ","),
137 | options.getOrElse("quoteChar", "\""),
138 | options.getOrElse("escapeChar", "\\")
139 | )
140 |
141 | new CSVSchema(fields, csvOptions)
142 | }
143 |
144 | private def getCsvType(sparkType: DataType) = sparkType match {
145 | case IntegerType => "int"
146 | case LongType => "long"
147 | case DoubleType => "double"
148 | case FloatType => "float"
149 | case StringType => "string"
150 | case DateType => "datetime"
151 | case BooleanType => "boolean"
152 | case _ => "string"
153 | }
154 | }
155 |
156 | case class CSVField(
157 | name: String,
158 | nullable: Boolean,
159 | `type`: String,
160 | position: Option[Int]
161 | )
162 |
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/JSONSchema.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import com.fasterxml.jackson.databind.JsonNode
4 | import com.github.fge.jackson.JsonLoader
5 | import com.github.fge.jsonschema.main.JsonSchemaFactory
6 | import org.apache.spark.sql.SparkSession
7 | import org.apache.spark.sql.types._
8 | import schemer.utils.{JSONUtil, JsonSchemaValidationUtil}
9 |
10 | import scala.annotation.tailrec
11 | import scala.collection.JavaConverters._
12 |
13 | trait JSONSchemaNode {
14 | def toJSON: String = JSONUtil.toJson(this)
15 | }
16 |
17 | case class ObjectSchema(
18 | `type`: String = "object",
19 | properties: Map[String, JSONSchemaNode],
20 | additionalProperties: Boolean = false,
21 | $schema: Option[String] = None
22 | ) extends JSONSchemaNode
23 |
24 | case class StringSchema(
25 | `type`: String = "string",
26 | format: Option[String] = None,
27 | pattern: Option[String] = None,
28 | minLength: Option[Int] = None,
29 | maxLength: Option[Int] = None
30 | ) extends JSONSchemaNode
31 |
32 | case class IntegerSchema(`type`: String = "integer", minimum: Option[BigInt] = None, maximum: Option[BigInt] = None)
33 | extends JSONSchemaNode
34 |
35 | case class NumberSchema(`type`: String = "number", minimum: Option[Double] = None, maximum: Option[Double] = None)
36 | extends JSONSchemaNode
37 |
38 | case class BooleanSchema(`type`: String = "boolean") extends JSONSchemaNode
39 |
40 | case class ArraySchema(`type`: String = "array", items: JSONSchemaNode) extends JSONSchemaNode
41 |
42 | case class JSONSchemaBase() extends SchemaLikeBase[JSONSchema] {
43 |
44 | @tailrec
45 | private def processStructFields(
46 | fields: List[StructField],
47 | accum: List[(String, JSONSchemaNode)] = Nil
48 | ): List[(String, JSONSchemaNode)] =
49 | fields match {
50 | case x :: xs =>
51 | processStructFields(xs, accum ++ List(processField(x)))
52 | case Nil => accum
53 | }
54 |
55 | private def processField(x: StructField) =
56 | (x.name, processDataType(x.dataType))
57 |
58 | private def processDataType(dataType: DataType): JSONSchemaNode = dataType match {
59 | case StringType => StringSchema()
60 | case LongType | IntegerType => IntegerSchema()
61 | case DoubleType => NumberSchema()
62 | case BooleanType => BooleanSchema()
63 | case f if f.isInstanceOf[StructType] => convertSparkToJsonSchema(dataType.asInstanceOf[StructType])
64 | case f if f.isInstanceOf[ArrayType] =>
65 | ArraySchema(items = processDataType(dataType.asInstanceOf[ArrayType].elementType))
66 | }
67 |
68 | def convertSparkToJsonSchema(schema: StructType, draft: Option[String] = None) =
69 | ObjectSchema(properties = processStructFields(schema.fields.toList).toMap, $schema = draft)
70 |
71 | override def infer(paths: String*)(implicit spark: SparkSession) = {
72 | val sampleJsonData = spark.read.textFile(paths: _*).limit(1000)
73 | val schema = spark.read.json(sampleJsonData.rdd).schema
74 | val jsonSchema = convertSparkToJsonSchema(schema, Some("http://json-schema.org/draft-06/schema#")).toJSON
75 | JSONSchema(jsonSchema)
76 | }
77 | }
78 |
79 | case class JSONSchema(schema: String) extends SchemaLike {
80 |
81 | private val jsonSchema = JsonLoader.fromString(schema)
82 |
83 | override def validate: List[String] = {
84 | val validator = JsonSchemaFactory.byDefault().getSyntaxValidator
85 | val report = validator.validateSchema(jsonSchema)
86 | val syntaxErrors = JsonSchemaValidationUtil.process(report)
87 | if (syntaxErrors.isEmpty) {
88 | try {
89 | sparkSchema()
90 | List.empty
91 | } catch {
92 | case e: UnsupportedOperationException => List(e.getMessage)
93 | }
94 | } else {
95 | syntaxErrors
96 | }
97 | }
98 |
99 | override def sparkSchema(): StructType = jsonToStructType(jsonSchema).asInstanceOf[StructType]
100 |
101 | def toDf(paths: String*)(implicit spark: SparkSession) =
102 | spark.read
103 | .schema(sparkSchema())
104 | .json(paths: _*)
105 |
106 | private def getRequiredProps(jsonSchema: JsonNode) =
107 | if (jsonSchema.has("required") && jsonSchema.get("required").isArray) {
108 | Some(jsonSchema.get("required").elements().asScala.map(_.asText()))
109 | } else {
110 | None
111 | }
112 |
113 | private def toArrayType(field: JsonNode) = {
114 | val itemsNode = field.get("items")
115 | if (itemsNode != null && itemsNode.isArray) {
116 | ArrayType(jsonToStructType(itemsNode.get(0)))
117 | } else if (itemsNode != null && itemsNode.isObject) {
118 | ArrayType(jsonToStructType(itemsNode))
119 | } else {
120 | ArrayType(StringType)
121 | }
122 | }
123 |
124 | private def toObjectType(jsonSchema: JsonNode) = {
125 | val requiredFields = getRequiredProps(jsonSchema).getOrElse(List.empty)
126 | if (jsonSchema.has("patternProperties")) {
127 | MapType(
128 | StringType,
129 | jsonToStructType(jsonSchema.get("patternProperties").fields().asScala.toList.head.getValue)
130 | )
131 | } else {
132 | StructType(
133 | jsonSchema
134 | .get("properties")
135 | .fields()
136 | .asScala
137 | .toList
138 | .map(field => {
139 | val fieldType = jsonToStructType(field.getValue)
140 | StructField(field.getKey, fieldType, nullable = !requiredFields.toList.contains(field.getKey))
141 | })
142 | )
143 | }
144 | }
145 |
146 | private def jsonToStructType(jsonSchema: JsonNode): DataType =
147 | jsonSchema.get("type").asText() match {
148 | case "array" => toArrayType(jsonSchema)
149 | case "object" => toObjectType(jsonSchema)
150 | case "boolean" => BooleanType
151 | case "string" => StringType
152 | case "integer" => LongType
153 | case "number" => DoubleType
154 | case _ =>
155 | throw new UnsupportedOperationException(
156 | s"Trying to convert a unsupported type ${jsonSchema.get("type").asText()}. Types other than (boolean, string, integer, number, object, array) aren't supported"
157 | )
158 |
159 | }
160 | }
161 |
162 | object JSONSchema {
163 | def apply(): JSONSchemaBase = JSONSchemaBase()
164 | def apply(schema: StructType): JSONSchema = JSONSchema(JSONSchemaBase().convertSparkToJsonSchema(schema).toJSON)
165 | }
166 |
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/ParquetSchema.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.StructType
5 |
6 | import scala.reflect.runtime.universe._
7 |
8 | sealed trait ParquetSchemaType {
9 | val `type`: String
10 | }
11 |
12 | object ParquetSchemaType {
13 | case object Avro extends ParquetSchemaType {
14 | override val `type`: String = "avro"
15 | }
16 | case object Csv extends ParquetSchemaType {
17 | override val `type`: String = "csv"
18 | }
19 | case object Json extends ParquetSchemaType {
20 | override val `type`: String = "json"
21 | }
22 |
23 | val supportedTypes = List(Avro, Csv, Json).map(_.`type`)
24 | }
25 |
26 | case class ParquetSchemaBase[T <: SchemaLike: TypeTag](override val options: Map[String, String] = Map())
27 | extends SchemaLikeBase[ParquetSchema] {
28 | override def infer(paths: String*)(implicit spark: SparkSession) = {
29 | val schema = spark.read.parquet(paths: _*).schema
30 | val underlyingSchema = typeOf[T] match {
31 | case t if t =:= typeOf[AvroSchema] => (ParquetSchemaType.Avro, AvroSchema(schema))
32 | case t if t =:= typeOf[JSONSchema] => (ParquetSchemaType.Json, JSONSchema(schema))
33 | case t if t =:= typeOf[CSVSchema] => (ParquetSchemaType.Csv, CSVSchema(schema, options))
34 | }
35 |
36 | ParquetSchema(underlyingSchema._2.schema(), underlyingSchema._1)
37 | }
38 | }
39 |
40 | case class ParquetSchema(schema: String, `type`: ParquetSchemaType) extends SchemaLike {
41 |
42 | val schemaType = `type` match {
43 | case ParquetSchemaType.Avro => AvroSchema(schema)
44 | case ParquetSchemaType.Csv => CSVSchema(schema)
45 | case ParquetSchemaType.Json => JSONSchema(schema)
46 | }
47 |
48 | override def validate = schemaType.validate
49 |
50 | def toDf(paths: String*)(implicit spark: SparkSession) =
51 | spark.read
52 | .schema(sparkSchema())
53 | .parquet(paths: _*)
54 |
55 | override def sparkSchema(): StructType = schemaType.sparkSchema()
56 | }
57 |
58 | object ParquetSchema {
59 | def apply[T <: SchemaLike: TypeTag]() = ParquetSchemaBase[T]()
60 | def apply(`type`: String) = `type` match {
61 | case ParquetSchemaType.Avro.`type` => apply[AvroSchema]()
62 | case ParquetSchemaType.Csv.`type` => apply[CSVSchema]()
63 | case ParquetSchemaType.Json.`type` => apply[JSONSchema]()
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/SchemaLike.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import org.apache.spark.sql.types.StructType
4 | import org.apache.spark.sql.{DataFrame, SparkSession}
5 |
6 | private[schemer] trait SchemaLikeBase[T <: SchemaLike] {
7 | val options: Map[String, String] = Map()
8 | def infer(paths: String*)(implicit @transient spark: SparkSession): T
9 | }
10 |
11 | private[schemer] trait SchemaLike {
12 | def validate: List[String]
13 |
14 | def sparkSchema(): StructType
15 |
16 | def schema(): String
17 |
18 | def toDf(paths: String*)(implicit @transient spark: SparkSession): DataFrame
19 | }
20 |
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/Schemer.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | sealed trait SchemaType {
4 | val `type`: String
5 | }
6 |
7 | object SchemaType {
8 | case object Avro extends SchemaType {
9 | override val `type`: String = "avro"
10 | }
11 | case object Csv extends SchemaType {
12 | override val `type`: String = "csv"
13 | }
14 | case object Json extends SchemaType {
15 | override val `type`: String = "json"
16 | }
17 | case object ParquetAvro extends SchemaType {
18 | override val `type`: String = "parquet_avro"
19 | }
20 | case object ParquetCsv extends SchemaType {
21 | override val `type`: String = "parquet_csv"
22 | }
23 | case object ParquetJson extends SchemaType {
24 | override val `type`: String = "parquet_json"
25 | }
26 | val supportedTypes = List(Avro, Csv, Json, ParquetAvro, ParquetCsv, ParquetJson)
27 | }
28 |
29 | object Schemer {
30 | def from(`type`: String, config: String): SchemaLike = `type` match {
31 | case "avro" => AvroSchema(config)
32 | case "csv" => CSVSchema(config)
33 | case "json" => JSONSchema(config)
34 | case "parquet_avro" => ParquetSchema(config, ParquetSchemaType.Avro)
35 | case "parquet_csv" => ParquetSchema(config, ParquetSchemaType.Csv)
36 | case "parquet_json" => ParquetSchema(config, ParquetSchemaType.Json)
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/utils/JSONUtil.scala:
--------------------------------------------------------------------------------
1 | package schemer.utils
2 |
3 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
4 | import com.fasterxml.jackson.module.scala.DefaultScalaModule
5 | import com.fasterxml.jackson.annotation.JsonInclude.Include
6 |
7 | import scala.reflect.ClassTag
8 |
9 | private[schemer] object JSONUtil {
10 | private val mapper = new ObjectMapper()
11 |
12 | mapper.registerModule(DefaultScalaModule)
13 |
14 | mapper.setSerializationInclusion(Include.NON_NULL)
15 |
16 | def toJson(value: Any) = mapper.writeValueAsString(value)
17 |
18 | def prettyJson(value: Any) = mapper.enable(SerializationFeature.INDENT_OUTPUT).writeValueAsString(value)
19 |
20 | def fromJson[T: ClassTag](json: String) = {
21 | val classType = implicitly[ClassTag[T]].runtimeClass.asInstanceOf[Class[T]]
22 | mapper.readValue[T](json, classType)
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/schemer-core/src/main/scala/schemer/utils/JsonSchemaValidationUtil.scala:
--------------------------------------------------------------------------------
1 | package schemer.utils
2 |
3 | import com.github.fge.jsonschema.core.report.ProcessingReport
4 | import scala.collection.JavaConverters._
5 |
6 | object JsonSchemaValidationUtil {
7 | def process(report: ProcessingReport): List[String] =
8 | if (!report.isSuccess) {
9 | getErrorsFromReport(report)
10 | } else {
11 | List.empty
12 | }
13 |
14 | private def getErrorsFromReport(report: ProcessingReport) = {
15 | val errorList = report.iterator.asScala.toList
16 | .map { message =>
17 | message.asJson()
18 | }
19 | .filter { json =>
20 | json.get("level").asText == "error"
21 | }
22 | .map { json =>
23 | json.get("message").asText
24 | }
25 | errorList
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/schemer-core/src/test/resources/test.csv:
--------------------------------------------------------------------------------
1 | title,url,storeId
2 | iphone,http://indix.com/iphone,42
3 | galaxy,http://indix.com/galaxy,43
4 | lumia,http://indix.com/lumia,44
--------------------------------------------------------------------------------
/schemer-core/src/test/resources/test.json:
--------------------------------------------------------------------------------
1 | {"title": "iphone", "url": "http://indix.com/iphone", "imageUrls": ["http://indix.com/iphone.jpg"], "storeId": 42, "price": {"min": 10.0, "max": 100.0 }, "isAvailable": false}
--------------------------------------------------------------------------------
/schemer-core/src/test/resources/test.tsv:
--------------------------------------------------------------------------------
1 | title url storeId
2 | iphone http://indix.com/iphone 42
3 | galaxy http://indix.com/galaxy 43
4 | lumia http://indix.com/lumia 44
--------------------------------------------------------------------------------
/schemer-core/src/test/scala/schemer/AvroSchemaSpec.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import com.databricks.spark.avro._
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.types.{IntegerType, StringType}
6 | import org.apache.spark.sql.{SaveMode, SparkSession}
7 | import org.scalatest.{FlatSpec, Matchers}
8 |
9 | class AvroSchemaSpec extends FlatSpec with Matchers {
10 | implicit val spark: SparkSession = SparkSession.builder
11 | .config(new SparkConf())
12 | .master("local[*]")
13 | .getOrCreate()
14 |
15 | "AvroSchema" should "infer avro schema from given path" in {
16 | import spark.implicits._
17 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF
18 |
19 | try {
20 | df.write.mode(SaveMode.Overwrite).avro("test")
21 | val schema = AvroSchema().infer("test")
22 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be(
23 | "{\n \"type\" : \"record\",\n \"name\" : \"SchemerInferred\",\n \"namespace\" : \"schemer\",\n \"fields\" : [ {\n \"name\" : \"title\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"url\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"storeId\",\n \"type\" : [ \"int\", \"null\" ]\n } ]\n}"
24 | )
25 | } finally {
26 | Helpers.cleanOutputPath("test")
27 | }
28 | }
29 |
30 | it should "get spark schema" in {
31 | val schema = AvroSchema(
32 | "{\n \"type\" : \"record\",\n \"name\" : \"SchemerInferred\",\n \"namespace\" : \"schemer\",\n \"fields\" : [ {\n \"name\" : \"title\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"url\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"storeId\",\n \"type\" : [ \"int\", \"null\" ]\n } ]\n}"
33 | )
34 | val schemaFields = schema.sparkSchema().fields
35 | schemaFields.length should be(3)
36 |
37 | schemaFields(0).name should be("title")
38 | schemaFields(0).dataType should be(StringType)
39 |
40 | schemaFields(1).name should be("url")
41 | schemaFields(1).dataType should be(StringType)
42 |
43 | schemaFields(2).name should be("storeId")
44 | schemaFields(2).dataType should be(IntegerType)
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/schemer-core/src/test/scala/schemer/CSVSchemaSpec.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
6 | import org.scalatest._
7 |
8 | import scala.util.Try
9 |
10 | class CSVSchemaSpec extends FlatSpec with Matchers {
11 | implicit val spark: SparkSession = SparkSession.builder
12 | .config(new SparkConf())
13 | .master("local[*]")
14 | .getOrCreate()
15 |
16 | "CSVSchema" should "infer schema from given path" in {
17 | val path = getClass.getClassLoader.getResource("test.csv").getPath
18 |
19 | val inferredSchema = CSVSchema().infer(path)
20 | val fields = inferredSchema.fields
21 |
22 | fields.length should be(3)
23 | fields(0).name should be("title")
24 | fields(0).`type` should be("string")
25 |
26 | fields(1).name should be("url")
27 | fields(1).`type` should be("string")
28 |
29 | fields(2).name should be("storeId")
30 | fields(2).`type` should be("int")
31 | }
32 |
33 | it should "infer schema without header from file" in {
34 | val path = getClass.getClassLoader.getResource("test.csv").getPath
35 |
36 | val inferredSchema = CSVSchema(CSVOptions(false)).infer(path)
37 | val fields = inferredSchema.fields
38 |
39 | fields.length should be(3)
40 | fields(0).name should be("_c0")
41 | fields(0).`type` should be("string")
42 |
43 | fields(1).name should be("_c1")
44 | fields(1).`type` should be("string")
45 |
46 | fields(2).name should be("_c2")
47 | fields(2).`type` should be("string")
48 | }
49 |
50 | it should "infer schema and read" in {
51 | val path = getClass.getClassLoader.getResource("test.csv").getPath
52 |
53 | val inferredSchema = CSVSchema().infer(path)
54 | import spark.implicits._
55 | val output = inferredSchema.toDf(path).as[TestRecord].collect()
56 |
57 | output.length should be(3)
58 | output(0).title should be("iphone")
59 | output(0).url should be("http://indix.com/iphone")
60 | output(0).storeId should be(42)
61 | }
62 |
63 | it should "infer schema and read from TSV" in {
64 | val path = getClass.getClassLoader.getResource("test.tsv").getPath
65 |
66 | val inferredSchema = CSVSchema(CSVOptions(headerBasedParser = true, separator = "\t")).infer(path)
67 | import spark.implicits._
68 | val output = inferredSchema.toDf(path).as[TestRecord].collect()
69 |
70 | output.length should be(3)
71 | output(0).title should be("iphone")
72 | output(0).url should be("http://indix.com/iphone")
73 | output(0).storeId should be(42)
74 | }
75 |
76 | it should "infer schema and get schema json" in {
77 | val path = getClass.getClassLoader.getResource("test.csv").getPath
78 |
79 | val inferredSchema = CSVSchema().infer(path)
80 |
81 | inferredSchema.schema() should be(
82 | "{\"fields\":[{\"name\":\"title\",\"nullable\":true,\"type\":\"string\",\"position\":0},{\"name\":\"url\",\"nullable\":true,\"type\":\"string\",\"position\":1},{\"name\":\"storeId\",\"nullable\":true,\"type\":\"int\",\"position\":2}],\"options\":{\"header\":true,\"headerBasedParser\":false,\"separator\":\",\",\"quoteChar\":\"\\\"\",\"escapeChar\":\"\\\\\"}}"
83 | )
84 | }
85 |
86 | it should "get schema from json" in {
87 | val schema = CSVSchema(
88 | "{\"fields\":[{\"name\":\"title\",\"nullable\":true,\"type\":\"string\",\"position\":0},{\"name\":\"url\",\"nullable\":true,\"type\":\"string\",\"position\":1},{\"name\":\"storeId\",\"nullable\":true,\"type\":\"int\",\"position\":2}],\"options\":{\"header\":true,\"headerBasedParser\":false,\"separator\":\",\",\"quoteChar\":\"\\\"\",\"escapeChar\":\"\\\\\"}}"
89 | )
90 |
91 | schema.sparkSchema() should be(
92 | StructType(
93 | Seq(
94 | StructField("title", StringType, true),
95 | StructField("url", StringType, true),
96 | StructField("storeId", IntegerType, true)
97 | )
98 | )
99 | )
100 | }
101 |
102 | it should "handle empty fields" in {
103 | val schema = CSVSchema(
104 | "{\"fields\":[], \"options\": {}}"
105 | )
106 |
107 | schema.sparkSchema() should be(
108 | StructType(List())
109 | )
110 | }
111 |
112 | it should "handle error parsing json" in {
113 | Try(CSVSchema("{}")).failed.get.getMessage should startWith("Missing required creator property 'fields'")
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/schemer-core/src/test/scala/schemer/Helpers.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import java.net.URI
4 |
5 | import org.apache.hadoop.conf.Configuration
6 | import org.apache.hadoop.fs.{FileSystem, Path}
7 |
8 | case class TestRecord(title: String, url: String, storeId: Int)
9 |
10 | object Helpers {
11 |
12 |   def cleanOutputPath(output: String): Unit = {
13 | val outputPath = new Path(output)
14 | if (fileExists(output))
15 | outputPath.getFileSystem(new Configuration()).delete(outputPath, true)
16 | }
17 |
18 | def fileExists(fileLocation: String) = {
19 | val fs = FileSystem.get(new URI(fileLocation), new Configuration())
20 | fs.exists(new Path(fileLocation))
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/schemer-core/src/test/scala/schemer/JSONSchemaSpec.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.types._
6 | import org.scalatest.{FlatSpec, Matchers}
7 |
8 | class JSONSchemaSpec extends FlatSpec with Matchers {
9 | implicit val spark: SparkSession = SparkSession.builder
10 | .config(new SparkConf())
11 | .master("local[*]")
12 | .getOrCreate()
13 |
14 | "JSONSchema" should "infer json schema" in {
15 | val path = getClass.getClassLoader.getResource("test.json").getPath
16 |
17 | val inferredSchema = JSONSchema().infer(path)
18 | inferredSchema.schema should be(
19 | "{\"type\":\"object\",\"properties\":{\"imageUrls\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}},\"url\":{\"type\":\"string\"},\"price\":{\"type\":\"object\",\"properties\":{\"max\":{\"type\":\"number\"},\"min\":{\"type\":\"number\"}},\"additionalProperties\":false},\"storeId\":{\"type\":\"integer\"},\"isAvailable\":{\"type\":\"boolean\"},\"title\":{\"type\":\"string\"}},\"additionalProperties\":false}"
20 | )
21 |
22 | val fields = inferredSchema.sparkSchema().fields
23 | fields.length should be(6)
24 | fields.map(f => (f.name, f.dataType)) should contain allElementsOf List(
25 | ("title", StringType),
26 | ("url", StringType),
27 | ("storeId", LongType),
28 | ("price", StructType(Seq(StructField("max", DoubleType), StructField("min", DoubleType)))),
29 | ("isAvailable", BooleanType),
30 | ("imageUrls", ArrayType(StringType))
31 | )
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/schemer-core/src/test/scala/schemer/ParquetSchemaSpec.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.sql.{SaveMode, SparkSession}
5 | import org.scalatest.{FlatSpec, Matchers}
6 |
7 | class ParquetSchemaSpec extends FlatSpec with Matchers {
8 | implicit val spark: SparkSession = SparkSession.builder
9 | .config(new SparkConf())
10 | .master("local[*]")
11 | .getOrCreate()
12 |
13 | "ParquetSchema" should "infer avro schema" in {
14 | import spark.implicits._
15 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF
16 |
17 | val dataDir = "test_parquet_avro"
18 |
19 | try {
20 | df.write.mode(SaveMode.Overwrite).parquet(dataDir)
21 | val schema = ParquetSchema[AvroSchema]().infer(dataDir)
22 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be(
23 | "{\n \"type\" : \"record\",\n \"name\" : \"SchemerInferred\",\n \"namespace\" : \"schemer\",\n \"fields\" : [ {\n \"name\" : \"title\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"url\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"storeId\",\n \"type\" : [ \"int\", \"null\" ]\n } ]\n}"
24 | )
25 | } finally {
26 | Helpers.cleanOutputPath(dataDir)
27 | }
28 | }
29 |
30 | it should "infer json schema" in {
31 | import spark.implicits._
32 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF
33 |
34 | val dataDir = "test_parquet_json"
35 |
36 | try {
37 | df.write.mode(SaveMode.Overwrite).parquet(dataDir)
38 | val schema = ParquetSchema[JSONSchema]().infer(dataDir)
39 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be(
40 | "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"},\"url\":{\"type\":\"string\"},\"storeId\":{\"type\":\"integer\"}},\"additionalProperties\":false}"
41 | )
42 | } finally {
43 | Helpers.cleanOutputPath(dataDir)
44 | }
45 | }
46 |
47 | it should "infer csv schema" in {
48 | import spark.implicits._
49 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF
50 |
51 | val dataDir = "test_parquet_csv"
52 |
53 | try {
54 | df.write.mode(SaveMode.Overwrite).parquet(dataDir)
55 | val schema = ParquetSchema[CSVSchema]().infer(dataDir)
56 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be(
57 | "{\"fields\":[{\"name\":\"title\",\"nullable\":true,\"type\":\"string\",\"position\":0},{\"name\":\"url\",\"nullable\":true,\"type\":\"string\",\"position\":1},{\"name\":\"storeId\",\"nullable\":true,\"type\":\"int\",\"position\":2}],\"options\":{\"header\":true,\"headerBasedParser\":true,\"separator\":\",\",\"quoteChar\":\"\\\"\",\"escapeChar\":\"\\\\\"}}"
58 | )
59 | } finally {
60 | Helpers.cleanOutputPath(dataDir)
61 | }
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | http {
3 | server {
4 | request-timeout = 90s
5 | idle-timeout = 120s
6 | }
7 | }
8 | }
9 |
10 | registry {
11 | server {
12 | host = "0.0.0.0"
13 | port = 9000
14 | port = ${?SCHEMER_REGISTRY_PORT}
15 | }
16 | inference {
17 | timeout = 60s
18 | }
19 | h2 {
20 | dataSourceClassName = "org.h2.jdbcx.JdbcDataSource"
21 | dataSource {
22 | url = "jdbc:h2:mem:registry"
23 | }
24 | }
25 | postgres {
26 | url = "postgresql://localhost:5432/schemer?user=schemer&password=schemer"
27 | url = ${?POSTGRES_URL}
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/resources/aws-core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 |
8 | <configuration>
9 |   <property>
10 |     <name>fs.s3.impl</name>
11 |     <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
12 |   </property>
13 |   <property>
14 |     <name>fs.s3n.impl</name>
15 |     <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
16 |   </property>
17 |
18 |   <property>
19 |     <name>fs.AbstractFileSystem.s3.impl</name>
20 |     <value>org.apache.hadoop.fs.s3a.S3A</value>
21 |   </property>
22 |   <property>
23 |     <name>fs.AbstractFileSystem.s3n.impl</name>
24 |     <value>org.apache.hadoop.fs.s3a.S3A</value>
25 |   </property>
26 | </configuration>
--------------------------------------------------------------------------------
/schemer-registry/src/main/resources/db/migration/V1__creates_schemas.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
2 |
3 | CREATE TABLE "namespaces"(
4 | "id" UUID NOT NULL DEFAULT uuid_generate_v4(),
5 | "name" VARCHAR NOT NULL
6 | );
7 |
8 | ALTER TABLE "namespaces" ADD CONSTRAINT "namespaces_id" PRIMARY KEY("id");
9 | CREATE UNIQUE INDEX "namespaces_name" ON "namespaces"("name");
10 |
11 | INSERT INTO "namespaces"("name") VALUES('default');
12 |
13 | CREATE TABLE "schemas"(
14 | "id" UUID NOT NULL DEFAULT uuid_generate_v4(),
15 | "name" VARCHAR NOT NULL,
16 | "namespace" VARCHAR NOT NULL,
17 | "type" VARCHAR NOT NULL,
18 | "created_on" TIMESTAMP WITH TIME ZONE NOT NULL,
19 | "created_by" VARCHAR NOT NULL
20 |
21 | );
22 |
23 | ALTER TABLE "schemas" ADD CONSTRAINT "schemas_id" PRIMARY KEY("id");
24 | CREATE UNIQUE INDEX "schemas_name_namespace" ON "schemas"("name","namespace");
25 | ALTER TABLE "schemas" ADD CONSTRAINT "schemas_namespace_fk" FOREIGN KEY("namespace") REFERENCES "namespaces"("name");
26 |
27 | CREATE TABLE "schema_versions" (
28 | "id" UUID NOT NULL DEFAULT uuid_generate_v4(),
29 | "schema_id" UUID NOT NULL,
30 | "version" VARCHAR NOT NULL,
31 | "schema" VARCHAR NOT NULL,
32 | "created_on" TIMESTAMP WITH TIME ZONE NOT NULL,
33 | "created_by" VARCHAR NOT NULL
34 | );
35 | ALTER TABLE "schema_versions" ADD CONSTRAINT "schema_versions_id" PRIMARY KEY("id");
36 | CREATE UNIQUE INDEX "schema_versions_version" ON "schema_versions"("schema_id", "version");
37 | ALTER TABLE "schema_versions" ADD CONSTRAINT "schema_versions_schema_fk" FOREIGN KEY("schema_id") REFERENCES "schemas"("id");
--------------------------------------------------------------------------------
/schemer-registry/src/main/resources/graphql/graphiql.html:
--------------------------------------------------------------------------------
[Content elided: the static GraphiQL HTML page (~151 lines). Its HTML/JS markup was lost in extraction; only the "Loading..." placeholder text survived. GraphQLRoutes.scala serves this resource for GET requests to /graphql.]
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/actors/InferActor.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.actors
2 |
3 | import akka.actor.{Actor, ActorRef, ActorSystem, Status}
4 | import akka.event.Logging
5 | import akka.util.Timeout
6 | import org.apache.spark.sql.SparkSession
7 | import schemer._
8 | import schemer.registry.exceptions.SchemerInferenceException
9 |
10 | import scala.concurrent.Future
11 | import scala.util.{Failure, Random, Success}
12 |
13 | case class JSONSchemaInferenceRequest(paths: Seq[String])
14 | case class AvroSchemaInferenceRequest(paths: Seq[String])
15 | case class ParquetSchemaInferenceRequest(`type`: String, paths: Seq[String])
16 | case class CSVSchemaInferenceRequest(options: CSVOptions, paths: Seq[String])
17 |
18 | class InferActor(
19 | implicit val spark: SparkSession,
20 | implicit val system: ActorSystem,
21 | implicit val inferTimeout: Timeout
22 | ) extends Actor {
23 | import context.dispatcher
24 | val logger = Logging(context.system, this)
25 |
26 | def receive = {
27 | case JSONSchemaInferenceRequest(paths) =>
28 | inferSchema(sender()) {
29 | JSONSchema().infer(paths: _*)
30 | }
31 | case AvroSchemaInferenceRequest(paths) =>
32 | inferSchema(sender()) {
33 | AvroSchema().infer(paths: _*)
34 | }
35 | case CSVSchemaInferenceRequest(options, paths) =>
36 | inferSchema(sender()) {
37 | CSVSchema(options).infer(paths: _*)
38 | }
39 | case ParquetSchemaInferenceRequest(t, paths) =>
40 | inferSchema(sender()) {
41 | ParquetSchema(t).infer(paths: _*)
42 | }
43 | case _ => logger.info("Unsupported infer request")
44 | }
45 |
46 | def inferSchema(sender: ActorRef)(block: => Any) = {
47 | val jobGroup = Random.alphanumeric take 12 mkString ""
48 | logger.info(s"Starting inference for jobGroup $jobGroup")
49 |
50 | val inferFuture = Future {
51 | spark.sparkContext.setJobGroup(jobGroup, jobGroup, true)
52 | block
53 | } recoverWith {
54 | case ex =>
55 | logger.info(s"Inference for jobGroup $jobGroup failed - ${ex.getMessage}")
56 | Future.failed(SchemerInferenceException(ex.getMessage))
57 | }
58 |
59 | inferFuture onComplete {
60 | case Success(r) =>
61 | logger.info(s"Completing inference for jobGroup $jobGroup")
62 | sender ! r
63 | case Failure(f) =>
64 | sender ! Status.Failure(f)
65 | }
66 |
67 | system.scheduler.scheduleOnce(inferTimeout.duration) {
68 | logger.info(s"Cancelling jobGroup $jobGroup")
69 | spark.sparkContext.cancelJobGroup(jobGroup)
70 | }
71 |
72 | }
73 |
74 | override def preStart(): Unit =
75 | logger.info(s"Starting infer actor")
76 | }
77 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/dao/SchemaDao.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.dao
2 |
3 | import java.util.UUID
4 |
5 | import org.joda.time.DateTime
6 | import schemer.registry.models.{Schema, SchemaVersion}
7 | import schemer.registry.sql.SqlDatabase
8 |
9 | import scala.concurrent.{ExecutionContext, Future}
10 |
11 | case class PaginatedFilter(
12 | id: Option[UUID],
13 | first: Option[Int],
14 | after: Option[DateTime],
15 | last: Option[Int],
16 | before: Option[DateTime]
17 | ) {
18 | def take = (last orElse first).filter(_ <= 10).getOrElse(10) + 1
19 | }
20 |
21 | class SchemaDao(val db: SqlDatabase)(implicit val ec: ExecutionContext) {
22 | import db.ctx._
23 |
24 | val schemas = quote(querySchema[Schema]("schemas"))
25 | def find(id: UUID) = run(schemas.filter(c => c.id == lift(id))).map(_.headOption)
26 | def create(schema: Schema): Future[UUID] = run(schemas.insert(lift(schema)).returning(_.id))
27 | def all() = run(schemas)
28 |
29 | val schemaVersions = quote(querySchema[SchemaVersion]("schema_versions"))
30 |
31 | def createVersion(schemaVersion: SchemaVersion): Future[UUID] =
32 | run(schemaVersions.insert(lift(schemaVersion)).returning(_.id))
33 |
34 | def findFirstVersions(filter: PaginatedFilter) = {
35 | val query = quote {
36 | applyCursors(lift(filter)).sortBy(_.createdOn)(Ord.descNullsLast).take(lift(filter.take))
37 | }
38 |
39 | run(query)
40 | }
41 |
42 | def findLastVersions(filter: PaginatedFilter) = {
43 | val query = quote {
44 | applyCursors(lift(filter)).sortBy(_.createdOn)(Ord.ascNullsLast).take(lift(filter.take))
45 | }
46 |
47 | run(query)
48 | }
49 |
50 | private def applyCursors =
51 | quote { (filter: PaginatedFilter) =>
52 | schemaVersions
53 | .filter(
54 | (version: SchemaVersion) =>
55 | filter.id.forall(_ == version.schemaId)
56 | && filter.after > version.createdOn
57 | && filter.before < version.createdOn
58 | )
59 | }
60 |
61 | def findLatestVersion(id: UUID) = {
62 | val query = quote {
63 | schemaVersions
64 | .filter(_.schemaId == lift(id))
65 | .filter { v1 =>
66 | schemaVersions
67 | .filter(_.schemaId == lift(id))
68 | .filter { v2 =>
69 | v1.id != v2.id && v1.createdOn < v2.createdOn
70 | }
71 | .isEmpty
72 | }
73 | }
74 |
75 | run(query).map(_.headOption)
76 | }
77 |
78 | def findVersion(id: UUID, version: String) = {
79 | val query = quote {
80 | schemaVersions.filter(_.version == lift(version)).filter(_.schemaId == lift(id))
81 | }
82 |
83 | run(query).map(_.headOption)
84 | }
85 |
86 | def findVersion(id: UUID) = run(schemaVersions.filter(c => c.id == lift(id))).map(_.headOption)
87 | }
88 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerException.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.exceptions
2 |
3 | class SchemerException(message: String) extends Exception(message)
4 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerInferenceException.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.exceptions
2 |
3 | case class SchemerInferenceException(message: String)
4 | extends SchemerException(s"Error while trying to infer schema - $message")
5 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerSchemaCreationException.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.exceptions
2 |
3 | case class SchemerSchemaCreationException(message: String)
4 | extends SchemerException(s"Error while trying to create new schema - $message")
5 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerSchemaVersionCreationException.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.exceptions
2 |
3 | case class SchemerSchemaVersionCreationException(message: String)
4 | extends SchemerException(s"Error while trying to create new schema version - $message")
5 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/CustomGraphQLResolver.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql
2 |
3 | import java.util.UUID
4 |
5 | import sangria.execution.deferred.{Deferred, DeferredResolver}
6 | import schemer._
7 | import schemer.registry.models.{SchemaSchemaVersionConnection, SchemaVersion}
8 |
9 | import scala.concurrent.ExecutionContext
10 |
11 | case class InferCSVSchemaDeferred(options: CSVOptions, paths: Seq[String]) extends Deferred[CSVSchema]
12 | case class InferJSONSchemaDeferred(paths: Seq[String]) extends Deferred[JSONSchema]
13 | case class InferParquetSchemaDeferred(`type`: String, paths: Seq[String]) extends Deferred[ParquetSchema]
14 | case class InferAvroSchemaDeferred(paths: Seq[String]) extends Deferred[AvroSchema]
15 |
16 | case class SchemaVersionsDeferred(
17 | id: UUID,
18 | first: Option[Int],
19 | after: Option[String],
20 | last: Option[Int],
21 | before: Option[String]
22 | ) extends Deferred[Seq[SchemaSchemaVersionConnection]]
23 | case class SchemaVersionLatestDeferred(id: UUID) extends Deferred[Option[SchemaVersion]]
24 |
25 | class CustomGraphQLResolver extends DeferredResolver[GraphQLService] {
26 | override def resolve(deferred: Vector[Deferred[Any]], ctx: GraphQLService, queryState: Any)(
27 | implicit ec: ExecutionContext
28 | ) = {
29 | val defMap = deferred.collect {
30 | case InferCSVSchemaDeferred(options, paths) => "csvSchemaInference" -> ctx.inferCSVSchema(options, paths)
31 | case InferJSONSchemaDeferred(paths) => "jsonSchemaInference" -> ctx.inferJSONSchema(paths)
32 | case InferParquetSchemaDeferred(t, paths) => "parquetSchemaInference" -> ctx.inferParquetSchema(t, paths)
33 | case InferAvroSchemaDeferred(paths) => "avroSchemaInference" -> ctx.inferAvroSchema(paths)
34 | case SchemaVersionsDeferred(id, first, after, last, before) =>
35 | "schemaVersions" -> ctx.schemaVersions(id, first, after, last, before)
36 | case SchemaVersionLatestDeferred(id) => "schemaVersionLatest" -> ctx.latestSchemaVersion(id)
37 | }
38 |
39 | deferred flatMap {
40 | case InferCSVSchemaDeferred(_, _) => defMap.filter(_._1 == "csvSchemaInference").map(_._2)
41 | case InferJSONSchemaDeferred(_) => defMap.filter(_._1 == "jsonSchemaInference").map(_._2)
42 | case InferParquetSchemaDeferred(_, _) => defMap.filter(_._1 == "parquetSchemaInference").map(_._2)
43 | case InferAvroSchemaDeferred(_) => defMap.filter(_._1 == "avroSchemaInference").map(_._2)
44 | case SchemaVersionsDeferred(_, _, _, _, _) => defMap.filter(_._1 == "schemaVersions").map(_._2)
45 | case SchemaVersionLatestDeferred(_) => defMap.filter(_._1 == "schemaVersionLatest").map(_._2)
46 | }
47 | }
48 | }
49 |
50 | object CustomGraphQLResolver {
51 | val deferredResolver: DeferredResolver[GraphQLService] =
52 | DeferredResolver.fetchersWithFallback(
53 | new CustomGraphQLResolver
54 | )
55 | }
56 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/GraphQLService.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql
2 |
3 | import java.util.UUID
4 |
5 | import akka.actor.{ActorRef, ActorSystem}
6 | import akka.pattern.{ask, AskTimeoutException}
7 | import akka.util.Timeout
8 | import com.github.mauricio.async.db.postgresql.exceptions.GenericDatabaseException
9 | import org.apache.spark.sql.SparkSession
10 | import sangria.macros.derive.GraphQLField
11 | import schemer._
12 | import schemer.registry.Cursor
13 | import schemer.registry.actors._
14 | import schemer.registry.dao.{PaginatedFilter, SchemaDao}
15 | import schemer.registry.exceptions.{
16 | SchemerException,
17 | SchemerInferenceException,
18 | SchemerSchemaCreationException,
19 | SchemerSchemaVersionCreationException
20 | }
21 | import schemer.registry.models._
22 | import schemer.registry.utils.Clock
23 |
24 | import scala.concurrent.{ExecutionContext, Future}
25 | import scala.language.postfixOps
26 |
27 | class GraphQLService(
28 | schemaDao: SchemaDao,
29 | inferActor: ActorRef
30 | )(
31 | implicit val spark: SparkSession,
32 | implicit val clock: Clock,
33 | implicit val ec: ExecutionContext,
34 | implicit val system: ActorSystem,
35 | implicit val inferActorTimeout: Timeout
36 | ) {
37 |
38 | def inferCSVSchema(options: CSVOptions, paths: Seq[String]) =
39 | inferWithActor(CSVSchemaInferenceRequest(options, paths))
40 |
41 | def inferJSONSchema(paths: Seq[String]) =
42 | inferWithActor(JSONSchemaInferenceRequest(paths))
43 |
44 | def inferParquetSchema(`type`: String, paths: Seq[String]) =
45 | inferWithActor(ParquetSchemaInferenceRequest(`type`, paths))
46 |
47 | def inferAvroSchema(paths: Seq[String]) =
48 | inferWithActor(AvroSchemaInferenceRequest(paths))
49 |
50 | @GraphQLField
51 | def addSchema(name: String, namespace: String, `type`: SchemaType, user: String) =
52 | schemaDao.create(Schema(name, namespace, `type`.`type`, clock.nowUtc, user)).recoverWith {
53 | case ex: GenericDatabaseException =>
54 |         Future.failed(SchemerSchemaCreationException(ex.errorMessage.message))
55 | case ex =>
56 | Future.failed(SchemerSchemaCreationException(ex.getMessage))
57 | }
58 |
59 | @GraphQLField
60 | def addSchemaVersion(schemaId: UUID, version: String, schemaConfig: String, user: String) =
61 | schemaDao
62 | .find(schemaId)
63 | .flatMap {
64 | case Some(schema) =>
65 | val errors = Schemer.from(schema.`type`, schemaConfig).validate
66 | if (errors.isEmpty) {
67 | schemaDao.createVersion(SchemaVersion(null, schema.id, version, schemaConfig, clock.nowUtc, user))
68 | } else {
69 | Future.failed(
70 | SchemerSchemaVersionCreationException(
71 | s"Error(s) validating schema config - ${errors.mkString("[", ", ", "]")}"
72 | )
73 | )
74 | }
75 | case None => Future.failed(SchemerSchemaVersionCreationException(s"Schema with id $schemaId not found"))
76 | }
77 | .recoverWith {
78 | case ex: GenericDatabaseException =>
79 | Future.failed(
80 |           SchemerSchemaVersionCreationException(ex.errorMessage.message)
81 | )
82 | case ex =>
83 | Future.failed(SchemerSchemaVersionCreationException(ex.getMessage))
84 | }
85 |
86 | def allSchemas = schemaDao.all()
87 |
88 | def schema(id: UUID) = schemaDao.find(id)
89 |
90 | def schemaVersion(id: UUID) = schemaDao.findVersion(id)
91 |
92 | def schemaVersions(id: UUID, first: Option[Int], after: Option[Cursor], last: Option[Int], before: Option[Cursor]) =
93 | if (first.nonEmpty && last.nonEmpty) {
94 | Future.failed(new SchemerException("Both first and last cannot be specified"))
95 | } else {
96 | import schemer.registry.utils.DateTimeUtils._
97 | val filter =
98 | PaginatedFilter(
99 | Some(id),
100 | first,
101 | after.map(_.toDateTime),
102 | last,
103 | before.map(_.toDateTime)
104 | )
105 |
106 | last
107 | .fold(schemaDao.findFirstVersions(filter))(_ => schemaDao.findLastVersions(filter))
108 | .map { versions =>
109 | val pageInfo: PageInfo = buildPageInfo(first, last, versions.length)
110 |           val finalVersions = if (pageInfo.hasMore) versions.dropRight(1) else versions
111 | SchemaSchemaVersionConnection(
112 | pageInfo,
113 | finalVersions.map { version =>
114 | SchemaSchemaVersionEdge(version.createdOn.toCursor, version)
115 | }
116 | )
117 | }
118 | }
119 |
120 | private def buildPageInfo(first: Option[Int], last: Option[Int], count: Int) =
121 | PageInfo(first.exists(count > _), last.exists(count > _))
122 |
123 | def latestSchemaVersion(id: UUID) = schemaDao.findLatestVersion(id)
124 |
125 | def inferWithActor(message: Any) =
126 | (inferActor ? message).recoverWith {
127 | case ex: SchemerInferenceException =>
128 | Future.failed(ex)
129 | case _: AskTimeoutException =>
130 | Future.failed(SchemerInferenceException("Timeout while trying to infer schema"))
131 | case ex =>
132 | Future.failed(SchemerInferenceException(ex.getMessage))
133 | }
134 | }
135 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/schema/GraphQLCustomTypes.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql.schema
2 |
3 | import java.util.UUID
4 |
5 | import org.joda.time.format.ISODateTimeFormat
6 | import org.joda.time.{DateTime, DateTimeZone}
7 | import sangria.ast
8 | import sangria.schema.ScalarType
9 | import sangria.validation.ValueCoercionViolation
10 |
11 | import scala.util.{Failure, Success, Try}
12 |
13 | trait GraphQLCustomTypes {
14 | case object DateCoercionViolation extends ValueCoercionViolation("Date value expected")
15 |
16 | def parseDate(s: String) = Try(new DateTime(s, DateTimeZone.UTC)) match {
17 | case Success(date) => Right(date)
18 | case Failure(_) => Left(DateCoercionViolation)
19 | }
20 |
21 | def parseUUID(s: String) = Try(UUID.fromString(s)) match {
22 | case Success(uuid) => Right(uuid)
23 | case Failure(_) => Left(DateCoercionViolation)
24 | }
25 |
26 | implicit val DateTimeType = ScalarType[DateTime](
27 | "DateTime",
28 | coerceOutput = (date: DateTime, _) => ast.StringValue(ISODateTimeFormat.dateTime().print(date)),
29 | coerceUserInput = {
30 | case s: String => parseDate(s)
31 | case _ => Left(DateCoercionViolation)
32 | },
33 | coerceInput = {
34 | case ast.StringValue(s, _, _) => parseDate(s)
35 | case _ => Left(DateCoercionViolation)
36 | }
37 | )
38 |
39 | implicit val UUIDType = ScalarType[UUID](
40 | "UUID",
41 | coerceOutput = (uuid: UUID, _) => ast.StringValue(uuid.toString),
42 | coerceUserInput = {
43 | case s: String => parseUUID(s)
44 | case _ => Left(DateCoercionViolation)
45 | },
46 | coerceInput = {
47 | case ast.StringValue(s, _, _) => parseUUID(s)
48 | case _ => Left(DateCoercionViolation)
49 | }
50 | )
51 | }
52 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/schema/InferType.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql.schema
2 |
3 | import sangria.schema._
4 | import schemer.registry.graphql.schema.SchemaDefinition.constantComplexity
5 | import sangria.macros.derive.{deriveInputObjectType, deriveObjectType, InputObjectTypeName}
6 | import schemer.registry.graphql._
7 | import spray.json.DefaultJsonProtocol
8 | import sangria.marshalling.sprayJson._
9 | import schemer._
10 |
11 | trait JSONSchemaType {
12 | implicit val JSONSchemaType = ObjectType(
13 | "JSONSchema",
14 | "JSON Schema",
15 | fields[Unit, JSONSchema](
16 | Field(
17 | "schema",
18 | StringType,
19 | description = Some("CSV Schema as JSON string"),
20 | complexity = constantComplexity(10),
21 | resolve = ctx => ctx.value.schema
22 | ),
23 | Field(
24 | "sparkSchema",
25 | StringType,
26 | description = Some("Spark Schema as JSON string"),
27 | complexity = constantComplexity(100),
28 | resolve = ctx => ctx.value.sparkSchema().prettyJson
29 | )
30 | )
31 | )
32 | }
33 |
34 | trait InferType extends JSONSchemaType with DefaultJsonProtocol {
35 | lazy implicit val TypeArg = Argument("type", ParquetSchemaUnderlyingType)
36 | lazy implicit val PathsArg = Argument("paths", ListInputType(StringType))
37 | implicit val CSVOptionsFormat = jsonFormat5(CSVOptions.apply)
38 | lazy implicit val CSVOptionsInputType = deriveInputObjectType[CSVOptions](InputObjectTypeName("CSVOptionsInput"))
39 | lazy implicit val CSVOptionsArg = Argument("csvOptions", OptionInputType(CSVOptionsInputType), CSVOptions())
40 |
41 | lazy implicit val CSVFieldType = deriveObjectType[Unit, CSVField]()
42 | lazy implicit val CSVOptionsType = deriveObjectType[Unit, CSVOptions]()
43 | lazy val CSVSchemaType = ObjectType(
44 | "CSVSchema",
45 | "CSV Schema",
46 | fields[Unit, CSVSchema](
47 | Field(
48 | "fields",
49 | ListType(CSVFieldType),
50 | description = Some("Fields of the CSV Schema"),
51 | complexity = constantComplexity(1),
52 | resolve = ctx => ctx.value.fields
53 | ),
54 | Field(
55 | "options",
56 | CSVOptionsType,
57 | description = Some("Options of the CSV Schema"),
58 | complexity = constantComplexity(1),
59 | resolve = ctx => ctx.value.options
60 | ),
61 | Field(
62 | "schema",
63 | StringType,
64 | description = Some("CSV Schema as JSON string"),
65 | complexity = constantComplexity(100),
66 | resolve = ctx => ctx.value.schema()
67 | ),
68 | Field(
69 | "sparkSchema",
70 | StringType,
71 | description = Some("Spark Schema as JSON string"),
72 | complexity = constantComplexity(100),
73 | resolve = ctx => ctx.value.sparkSchema().prettyJson
74 | )
75 | )
76 | )
77 |
78 | lazy val ParquetSchemaUnderlyingType = EnumType(
79 | "ParquetSchemaType",
80 | Some("Supported schema types for Parquet"),
81 | List(
82 | EnumValue("Avro", value = schemer.ParquetSchemaType.Avro.`type`),
83 | EnumValue("Csv", value = schemer.ParquetSchemaType.Csv.`type`),
84 | EnumValue("Json", value = schemer.ParquetSchemaType.Json.`type`)
85 | )
86 | )
87 |
88 | lazy val ParquetSchemaType = ObjectType(
89 | "ParquetSchema",
90 | "Parquet Schema",
91 | fields[Unit, ParquetSchema](
92 | Field(
93 | "type",
94 | ParquetSchemaUnderlyingType,
95 | description = Some("Parquet Schema type"),
96 | complexity = constantComplexity(10),
97 | resolve = ctx => ctx.value.`type`.`type`
98 | ),
99 | Field(
100 | "schema",
101 | StringType,
102 | description = Some("Parquet Schema as JSON string"),
103 | complexity = constantComplexity(10),
104 | resolve = ctx => ctx.value.schema
105 | ),
106 | Field(
107 | "sparkSchema",
108 | StringType,
109 | description = Some("Spark Schema as JSON string"),
110 | complexity = constantComplexity(100),
111 | resolve = ctx => ctx.value.sparkSchema().prettyJson
112 | )
113 | )
114 | )
115 |
116 | lazy val AvroSchemaType = ObjectType(
117 | "AvroSchema",
118 | "Avro Schema",
119 | fields[Unit, AvroSchema](
120 | Field(
121 | "schema",
122 | StringType,
123 | description = Some("Avro Schema as string"),
124 | complexity = constantComplexity(10),
125 | resolve = ctx => ctx.value.schema
126 | ),
127 | Field(
128 | "sparkSchema",
129 | StringType,
130 | description = Some("Spark Schema as JSON string"),
131 | complexity = constantComplexity(100),
132 | resolve = ctx => ctx.value.sparkSchema().prettyJson
133 | )
134 | )
135 | )
136 |
137 | lazy val InferType = ObjectType(
138 | "Inference",
139 | "Schema Inference",
140 | fields[GraphQLService, Unit](
141 | Field(
142 | "csv",
143 | CSVSchemaType,
144 | description = Some("CSV Schema inference"),
145 | complexity = constantComplexity(500),
146 | resolve = ctx => InferCSVSchemaDeferred(ctx arg CSVOptionsArg, ctx arg PathsArg),
147 | arguments = List(CSVOptionsArg, PathsArg)
148 | ),
149 | Field(
150 | "json",
151 | JSONSchemaType,
152 | description = Some("JSON Schema inference"),
153 | complexity = constantComplexity(500),
154 | resolve = ctx => InferJSONSchemaDeferred(ctx arg PathsArg),
155 | arguments = List(PathsArg)
156 | ),
157 | Field(
158 | "parquet",
159 | ParquetSchemaType,
160 | description = Some("Parquet Schema inference"),
161 | complexity = constantComplexity(500),
162 | resolve = ctx => InferParquetSchemaDeferred(ctx arg TypeArg, ctx arg PathsArg),
163 | arguments = List(TypeArg, PathsArg)
164 | ),
165 | Field(
166 | "avro",
167 | AvroSchemaType,
168 | description = Some("Avro Schema inference"),
169 | complexity = constantComplexity(500),
170 | resolve = ctx => InferAvroSchemaDeferred(ctx arg PathsArg),
171 | arguments = List(PathsArg)
172 | )
173 | )
174 | )
175 | }
176 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/schema/MetadataType.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql.schema
2 |
3 | import buildinfo.BuildInfo
4 | import sangria.macros.derive.deriveObjectType
5 | import sangria.schema.ObjectType
6 |
7 | case class Metadata(version: String = BuildInfo.version)
8 |
9 | trait MetadataType {
10 | lazy val MetadataType: ObjectType[Unit, Metadata] = deriveObjectType()
11 | }
12 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/schema/MutationType.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql.schema
2 |
3 | import sangria.macros.derive.deriveContextObjectType
4 | import schemer.registry.graphql.GraphQLService
5 |
6 | trait MutationType extends JSONSchemaType with SchemaType with GraphQLCustomTypes {
7 | val MutationType = deriveContextObjectType[GraphQLService, GraphQLService, Unit](identity)
8 | }
9 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/schema/SchemaDefinition.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql.schema
2 |
3 | import sangria.schema.{fields, Args, Field, ListType, ObjectType, OptionType, Schema}
4 | import schemer.registry.graphql.GraphQLService
5 | import schemer.registry.models.{Schema => SSchema}
6 |
7 | object SchemaDefinition extends InferType with MetadataType with MutationType with SchemaType with GraphQLCustomTypes {
8 |
9 | def constantComplexity[Ctx](complexity: Double) =
10 | Some((_: Ctx, _: Args, child: Double) => child + complexity)
11 |
12 | val QueryType = ObjectType(
13 | "Query",
14 | "Root",
15 | fields[GraphQLService, Unit](
16 | Field(
17 | "schema",
18 | OptionType(SchemaType),
19 | description = Some("Schema"),
20 | resolve = ctx => ctx.ctx.schema(ctx arg IdArg),
21 | arguments = List(IdArg)
22 | ),
23 | Field(
24 | "schemas",
25 | ListType(SchemaType),
26 | description = Some("All Schemas"),
27 | resolve = ctx => ctx.ctx.allSchemas
28 | ),
29 | Field(
30 | "schemaVersion",
31 | OptionType(SchemaVersionType),
32 | description = Some("Schema Version"),
33 | resolve = ctx => ctx.ctx.schemaVersion(ctx arg IdArg),
34 | arguments = List(IdArg)
35 | ),
36 | Field(
37 | "infer",
38 | InferType,
39 | description = Some("Schema Inference"),
40 | resolve = _ => ()
41 | ),
42 | Field(
43 | "metadata",
44 | MetadataType,
45 | description = Some("Metadata"),
46 | complexity = constantComplexity(100),
47 | resolve = _ => Metadata()
48 | )
49 | )
50 | )
51 | val schema = Schema(QueryType, Some(MutationType))
52 | }
53 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/graphql/schema/SchemaType.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.graphql.schema
2 |
3 | import sangria.macros.derive.deriveObjectType
4 | import sangria.schema.{Field, ObjectType, _}
5 | import schemer.{SchemaType => SSchemaType}
6 | import schemer.registry.graphql.{SchemaVersionLatestDeferred, SchemaVersionsDeferred}
7 | import schemer.registry.graphql.schema.SchemaDefinition.constantComplexity
8 | import schemer.registry.models.{
9 | PageInfo,
10 | SchemaSchemaVersionConnection,
11 | SchemaSchemaVersionEdge,
12 | SchemaVersion,
13 | Schema => SSchema
14 | }
15 |
16 | trait SchemaType extends GraphQLCustomTypes {
17 | lazy implicit val SchemaTypeType = EnumType[SSchemaType](
18 | "SchemaType",
19 | Some("Supported schema types"),
20 | List(
21 | EnumValue("Avro", value = SSchemaType.Avro),
22 | EnumValue("Csv", value = SSchemaType.Csv),
23 | EnumValue("Json", value = SSchemaType.Json),
24 | EnumValue("ParquetAvro", value = SSchemaType.ParquetAvro),
25 | EnumValue("ParquetCsv", value = SSchemaType.ParquetCsv),
26 | EnumValue("ParquetJson", value = SSchemaType.ParquetJson)
27 | )
28 | )
29 | lazy implicit val IdArg = Argument("id", UUIDType)
30 | lazy implicit val FirstArg = Argument("first", OptionInputType(IntType))
31 | lazy implicit val AfterArg = Argument("after", OptionInputType(StringType))
32 | lazy implicit val LastArg = Argument("last", OptionInputType(IntType))
33 | lazy implicit val BeforeArg = Argument("before", OptionInputType(StringType))
34 | lazy implicit val PageInfo: ObjectType[Unit, PageInfo] = deriveObjectType()
35 | lazy implicit val SchemaVersionType: ObjectType[Unit, SchemaVersion] = deriveObjectType()
36 | lazy implicit val SchemaSchemaVersionEdgeType: ObjectType[Unit, SchemaSchemaVersionEdge] = deriveObjectType()
37 | lazy implicit val SchemaSchemaVersionConnectionType: ObjectType[Unit, SchemaSchemaVersionConnection] =
38 | deriveObjectType()
39 |
40 | val SchemaType: ObjectType[Unit, SSchema] = ObjectType(
41 | "Schema",
42 | "Schema",
43 | fields[Unit, SSchema](
44 | Field(
45 | "id",
46 | UUIDType,
47 | resolve = _.value.id
48 | ),
49 | Field(
50 | "name",
51 | StringType,
52 | resolve = _.value.name
53 | ),
54 | Field(
55 | "namespace",
56 | StringType,
57 | resolve = _.value.namespace
58 | ),
59 | Field(
60 | "type",
61 | SchemaTypeType,
62 | resolve = ctx => SSchemaType.supportedTypes.find(_.`type` == ctx.value.`type`).get
63 | ),
64 | Field(
65 | "createdOn",
66 | DateTimeType,
67 | resolve = _.value.createdOn
68 | ),
69 | Field(
70 | "createdBy",
71 | StringType,
72 | resolve = _.value.createdBy
73 | ),
74 | Field(
75 | "versions",
76 | ListType(SchemaSchemaVersionConnectionType),
77 | resolve = ctx =>
78 | SchemaVersionsDeferred(ctx.value.id, ctx arg FirstArg, ctx arg AfterArg, ctx arg LastArg, ctx arg BeforeArg),
79 | complexity = constantComplexity(200),
80 | arguments = List(FirstArg, AfterArg, LastArg, BeforeArg)
81 | ),
82 | Field(
83 | "latestVersion",
84 | OptionType(SchemaVersionType),
85 | resolve = ctx => SchemaVersionLatestDeferred(ctx.value.id),
86 | complexity = constantComplexity(200)
87 | )
88 | )
89 | )
90 |
91 | }
92 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/models/Schema.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.models
2 |
3 | import java.util.UUID
4 |
5 | import org.joda.time.DateTime
6 |
7 | case class Schema(
8 | id: UUID,
9 | name: String,
10 | namespace: String,
11 | `type`: String,
12 | createdOn: DateTime,
13 | createdBy: String
14 | )
15 |
16 | object Schema {
17 | def apply(name: String, namespace: String, `type`: String, createdOn: DateTime, createdBy: String) =
18 | new Schema(null, name, namespace, `type`, createdOn, createdBy)
19 | }
20 |
21 | case class SchemaVersion(
22 | id: UUID,
23 | schemaId: UUID,
24 | version: String,
25 | schema: String,
26 | createdOn: DateTime,
27 | createdBy: String
28 | )
29 | case class PageInfo(hasNextPage: Boolean, hasPreviousPage: Boolean) {
30 | def hasMore = hasNextPage || hasPreviousPage
31 | }
32 | case class SchemaSchemaVersionEdge(cursor: String, node: SchemaVersion)
33 | case class SchemaSchemaVersionConnection(pageInfo: PageInfo, edges: List[SchemaSchemaVersionEdge])
34 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/package.scala:
--------------------------------------------------------------------------------
1 | package schemer
2 |
3 | package object registry {
4 | type Cursor = String
5 | }
6 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/routes/GraphQLRoutes.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.routes
2 |
3 | import akka.http.scaladsl.model.StatusCodes.{BadRequest, InternalServerError, OK}
4 | import akka.http.scaladsl.server.Directives.{as, complete, entity, get, getFromResource, path, post}
5 | import sangria.execution._
6 | import sangria.parser.QueryParser
7 | import sangria.schema.Schema
8 | import spray.json.{JsObject, JsString, JsValue}
9 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._
10 | import akka.http.scaladsl.server.Directives._
11 | import sangria.marshalling.sprayJson._
12 | import schemer.registry.exceptions.SchemerException
13 | import schemer.registry.graphql.{CustomGraphQLResolver, GraphQLService}
14 | import schemer.registry.graphql.schema.SchemaDefinition
15 |
16 | import scala.util.{Failure, Success}
17 | import scala.concurrent.ExecutionContext.Implicits.global
18 |
19 | trait GraphQLRoutes {
20 | val graphQLService: GraphQLService
21 |
22 | case object TooComplexQuery extends Exception
23 | val rejectComplexQueries = QueryReducer.rejectComplexQueries(
24 | 1000,
25 | (_: Double, _: GraphQLService) => TooComplexQuery
26 | )
27 |
28 | val graphQLExceptionHandler: Executor.ExceptionHandler = {
29 | case (_, TooComplexQuery) => HandledException("Too complex query. Please reduce the field selection.")
30 | case (_, e: SchemerException) => HandledException(e.getMessage)
31 | }
32 |
33 | def executeGraphQLQuery(schema: Schema[GraphQLService, Unit], requestJson: JsValue) = {
34 | val JsObject(fields) = requestJson
35 |
36 | val JsString(query) = fields("query")
37 |
38 | val operation = fields.get("operationName") collect {
39 | case JsString(op) => op
40 | }
41 |
42 | val vars = fields.get("variables") match {
43 | case Some(obj: JsObject) => obj
44 | case _ => JsObject.empty
45 | }
46 |
47 | QueryParser.parse(query) match {
48 |
49 | case Success(queryDocument) =>
50 | complete(
51 | Executor
52 | .execute(
53 | schema,
54 | queryDocument,
55 | graphQLService,
56 | deferredResolver = CustomGraphQLResolver.deferredResolver,
57 | variables = vars,
58 | operationName = operation,
59 | queryReducers = rejectComplexQueries :: Nil,
60 | exceptionHandler = graphQLExceptionHandler
61 | )
62 | .map(OK -> _)
63 | .recover {
64 | case error: QueryAnalysisError => BadRequest -> error.resolveError
65 | case error: ErrorWithResolver => InternalServerError -> error.resolveError
66 | }
67 | )
68 |
69 | case Failure(error) =>
70 | complete(BadRequest -> JsObject("error" -> JsString(error.getMessage)))
71 | }
72 | }
73 |
74 | val graphQLRoutes = path("graphql") {
75 | post {
76 | entity(as[JsValue]) { requestJson =>
77 | executeGraphQLQuery(SchemaDefinition.schema, requestJson)
78 | }
79 | } ~ get {
80 | getFromResource("graphql/graphiql.html")
81 | }
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/routes/HealthRoutes.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.routes
2 |
3 | import java.io.{StringWriter, Writer}
4 | import java.util
5 |
6 | import akka.http.scaladsl.model.{HttpCharsets, HttpEntity, MediaType}
7 | import akka.http.scaladsl.server.Directives._
8 | import io.prometheus.client.Collector.MetricFamilySamples
9 | import io.prometheus.client.CollectorRegistry
10 | import io.prometheus.client.exporter.common.TextFormat
11 | import io.prometheus.client.hotspot.DefaultExports
12 |
13 | trait HealthRoutes {
14 |
15 | DefaultExports.initialize()
16 | private val collectorRegistry = CollectorRegistry.defaultRegistry
17 | private val metricsMediaTypeParams = Map("version" -> "0.0.4")
18 | private val metricsMediaType =
19 | MediaType.customWithFixedCharset("text", "plain", HttpCharsets.`UTF-8`, params = metricsMediaTypeParams)
20 |
21 | def toPrometheusTextFormat(e: util.Enumeration[MetricFamilySamples]): String = {
22 | val writer: Writer = new StringWriter()
23 | TextFormat.write004(writer, e)
24 |
25 | writer.toString
26 | }
27 |
28 | val healthRoutes = path("health") {
29 | get {
30 | complete {
31 | "OK"
32 | }
33 | }
34 | } ~ path("metrics") {
35 | get {
36 | complete {
37 | HttpEntity(metricsMediaType, toPrometheusTextFormat(collectorRegistry.metricFamilySamples()))
38 | }
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/routes/Routes.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.routes
2 |
3 | import akka.http.scaladsl.model.{StatusCodes, Uri}
4 | import akka.http.scaladsl.server.Directives._
5 | import akka.http.scaladsl.server.{ExceptionHandler, RejectionHandler}
6 | import com.typesafe.scalalogging.StrictLogging
7 |
8 | trait Routes extends GraphQLRoutes with HealthRoutes with StrictLogging {
9 | private val exceptionHandler = ExceptionHandler {
10 | case e: Exception =>
11 | logger.error(s"Exception during client request processing: ${e.getMessage}", e)
12 | _.complete((StatusCodes.InternalServerError, "Internal server error"))
13 | }
14 | val rejectionHandler = RejectionHandler.default
15 | val logBlackListPaths = Seq("health")
16 | private def isBlacklistedPath(uri: Uri) =
17 | logBlackListPaths
18 |       .map("/" + _)
19 | .exists(uri.toString().contains)
20 | val logDuration = extractRequestContext.flatMap { ctx =>
21 | val start = System.currentTimeMillis()
22 | mapResponse { resp =>
23 | val d = System.currentTimeMillis() - start
24 | if (!isBlacklistedPath(ctx.request.uri)) {
25 | logger.info(s"[${resp.status.intValue()}] ${ctx.request.method.name} ${ctx.request.uri} took: ${d}ms")
26 | }
27 | resp
28 | } & handleRejections(rejectionHandler)
29 | }
30 | val routes = logDuration {
31 | handleExceptions(exceptionHandler) {
32 | encodeResponse {
33 | graphQLRoutes ~ healthRoutes
34 | }
35 | }
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/routes/SwaggerRoutes.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.routes
2 |
3 | import akka.http.scaladsl.model.StatusCodes
4 | import akka.http.scaladsl.server.Directives._
5 |
6 | trait SwaggerRoutes {
7 |
8 | val swaggerRoutes = pathPrefix("swagger") {
9 | pathEnd {
10 | extractUri { uri =>
11 | redirect(uri + "/", StatusCodes.TemporaryRedirect)
12 | }
13 | } ~
14 | pathSingleSlash {
15 | getFromResource("swagger-ui/index.html")
16 | } ~
17 | getFromResourceDirectory("swagger-ui")
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/server/ConfigWithDefault.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.server
2 |
3 | import java.net.InetAddress
4 | import java.util.concurrent.TimeUnit
5 |
6 | import com.typesafe.config.{Config, ConfigFactory}
7 |
8 | trait ConfigWithDefault {
9 |
10 | def rootConfig: Config
11 |
12 | def getBoolean(path: String, default: Boolean) = ifHasPath(path, default) { _.getBoolean(path) }
13 | def getString(path: String, default: String) = ifHasPath(path, default) { _.getString(path) }
14 | def getInt(path: String, default: Int) = ifHasPath(path, default) { _.getInt(path) }
15 | def getConfig(path: String, default: Config) = ifHasPath(path, default) { _.getConfig(path) }
16 | def getMilliseconds(path: String, default: Long) = ifHasPath(path, default) {
17 | _.getDuration(path, TimeUnit.MILLISECONDS)
18 | }
19 | def getOptionalString(path: String, default: Option[String] = None) = getOptional(path) { _.getString(path) }
20 |
21 | def loadDefault(rootName: String, loadEnvConf: Boolean = true) =
22 | if (loadEnvConf) {
23 | ConfigFactory
24 | .parseResources(s"env-conf/$getHostname.conf")
25 | .withFallback(ConfigFactory.load())
26 | .getConfig(rootName)
27 | } else {
28 | ConfigFactory.load().getConfig(rootName)
29 | }
30 |
31 | protected def getHostname = InetAddress.getLocalHost.getHostName
32 |
33 | private def ifHasPath[T](path: String, default: T)(get: Config => T): T =
34 | if (rootConfig.hasPath(path)) get(rootConfig) else default
35 |
36 | private def getOptional[T](fullPath: String, default: Option[T] = None)(get: Config => T) =
37 | if (rootConfig.hasPath(fullPath)) {
38 | Some(get(rootConfig))
39 | } else {
40 | default
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/server/InferenceConfig.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.server
2 |
3 | import com.typesafe.config.Config
4 | import java.util.concurrent.TimeUnit.SECONDS
5 | import scala.concurrent.duration._
6 |
7 | trait InferenceConfig extends ConfigWithDefault {
8 | def rootConfig: Config
9 | lazy val inferenceConfig = rootConfig.getConfig("inference")
10 | lazy val inferTimeout = inferenceConfig.getDuration("timeout", SECONDS).seconds
11 | }
12 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/server/Main.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.server
2 |
3 | import akka.actor.ActorSystem
4 | import akka.http.scaladsl.Http
5 | import akka.http.scaladsl.Http.ServerBinding
6 | import akka.stream.ActorMaterializer
7 | import com.typesafe.scalalogging.StrictLogging
8 | import schemer.registry.routes.Routes
9 |
10 | import scala.concurrent.ExecutionContext.Implicits.global
11 | import scala.concurrent.Future
12 | import scala.util.{Failure, Success}
13 |
14 | class Main() extends StrictLogging {
15 |
16 | def start(): (Future[ServerBinding], Modules) = {
17 |
18 | implicit val _system: ActorSystem = ActorSystem("main")
19 | implicit val _materializer: ActorMaterializer = ActorMaterializer()
20 |
21 | val modules = new Modules with Routes {
22 | implicit lazy val ec = _system.dispatcher
23 | implicit lazy val mat = _materializer
24 | lazy val system = _system
25 |
26 | }
27 |
28 | (Http().bindAndHandle(modules.routes, modules.config.serverHost, modules.config.serverPort), modules)
29 | }
30 | }
31 |
32 | object Main extends App with StrictLogging {
33 | val (startFuture, modules) = new Main().start()
34 |
35 | val host = modules.config.serverHost
36 | val port = modules.config.serverPort
37 |
38 | val system = modules.system
39 |
40 | startFuture.onComplete {
41 | case Success(b) =>
42 | logger.info(s"Server started on $host:$port")
43 | sys.addShutdownHook {
44 | b.unbind()
45 | shutdown()
46 | }
47 | case Failure(e) =>
48 | logger.error(s"Cannot start server on $host:$port", e)
49 | sys.addShutdownHook {
50 | shutdown()
51 | }
52 | }
53 |
54 | def shutdown(): Unit = {
55 | modules.system.terminate()
56 | logger.info("Server stopped")
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/server/Modules.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.server
2 |
3 | import akka.actor.{ActorSystem, Props}
4 | import akka.routing.BalancingPool
5 | import akka.stream.Materializer
6 | import akka.util.Timeout
7 | import com.typesafe.config.Config
8 | import org.apache.spark.SparkConf
9 | import org.apache.spark.sql.SparkSession
10 | import schemer.registry.actors.InferActor
11 | import schemer.registry.dao.SchemaDao
12 | import schemer.registry.graphql.GraphQLService
13 | import schemer.registry.sql.{DatabaseConfig, SqlDatabase}
14 | import schemer.registry.utils.RealTimeClock
15 |
16 | import scala.concurrent.ExecutionContext
17 | import scala.concurrent.duration._
18 |
19 | trait Modules {
20 |
21 | implicit def system: ActorSystem
22 |
23 | implicit def ec: ExecutionContext
24 |
25 | implicit def mat: Materializer
26 |
27 | lazy val config = new ServerConfig with DatabaseConfig with InferenceConfig {
28 | override def rootConfig: Config = loadDefault("registry")
29 | }
30 |
31 | implicit lazy val clock = RealTimeClock
32 |
33 | implicit val spark: SparkSession = SparkSession.builder
34 | .config(new SparkConf())
35 | .master("local[*]")
36 | .getOrCreate()
37 |
38 | val hadoopConf = spark.sparkContext.hadoopConfiguration
39 |
40 | val sqlDatabase = SqlDatabase(config)
41 | sqlDatabase.updateSchema()
42 |
43 | lazy val schemaDao = new SchemaDao(sqlDatabase)
44 | lazy val inferActor = locally {
45 | implicit lazy val inferTimeout = Timeout(config.inferTimeout)
46 | system.actorOf(Props(new InferActor()).withRouter(BalancingPool(nrOfInstances = 10)), name = "InferActor")
47 | }
48 | lazy val graphQLService = locally {
49 | implicit lazy val inferActorTimeout = Timeout(config.inferTimeout + 20.seconds)
50 | new GraphQLService(schemaDao, inferActor)
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
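
One design note on the timeouts above: the GraphQL-side Timeout is padded 20 seconds beyond the actor's own inference timeout, presumably so the ask to InferActor can complete (or surface the actor's failure) before the outer Future gives up. A minimal sketch of that ask, with a hypothetical request message:

import akka.actor.ActorRef
import akka.pattern.ask
import akka.util.Timeout
import scala.concurrent.Future

// `request` stands in for whatever inference message GraphQLService sends;
// the padded Timeout built in Modules is what gets passed implicitly here.
def infer(inferActor: ActorRef, request: Any)(implicit timeout: Timeout): Future[Any] =
  inferActor ? request
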
/schemer-registry/src/main/scala/schemer/registry/server/ServerConfig.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.server
2 |
3 | import com.typesafe.config.Config
4 |
5 | trait ServerConfig extends ConfigWithDefault {
6 |
7 | def rootConfig: Config
8 |
9 | lazy val serverHost: String = rootConfig.getString("server.host")
10 | lazy val serverPort: Int = rootConfig.getInt("server.port")
11 | }
12 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/sql/DatabaseConfig.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.sql
2 |
3 | import com.typesafe.config.Config
4 | import schemer.registry.server.ConfigWithDefault
5 |
6 | trait DatabaseConfig extends ConfigWithDefault {
7 | def rootConfig: Config
8 |
9 | val h2config = rootConfig.getConfig("h2")
10 | val postgresConfig = rootConfig.getConfig("postgres")
11 | }
12 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/sql/SqlDatabase.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.sql
2 |
3 | import io.getquill.{PostgresAsyncContext, SnakeCase}
4 | import org.apache.commons.lang3.StringUtils
5 | import org.flywaydb.core.Flyway
6 | import org.joda.time.DateTime
7 |
8 | trait Quotes { this: PostgresAsyncContext[_] =>
9 | implicit class DateTimeQuotes(l: DateTime) {
10 | def >(r: DateTime) = quote(infix"$l > $r".as[Boolean])
11 | def <(r: DateTime) = quote(infix"$l < $r".as[Boolean])
12 | }
13 |
14 | implicit class OptDateTimeQuotes(l: Option[DateTime]) {
15 | def >(r: DateTime) = quote(infix"($l::timestamptz is null or $l > $r)".as[Boolean])
16 | def <(r: DateTime) = quote(infix"($l::timestamptz is null or $l < $r)".as[Boolean])
17 | }
18 | }
19 |
20 | case class SqlDatabase(config: DatabaseConfig) {
21 | lazy val ctx = new PostgresAsyncContext(SnakeCase, config.postgresConfig) with Quotes
22 |
23 | def updateSchema() = {
24 | val postgresUrl = config.postgresConfig.getString("url")
25 | if (StringUtils.isNotEmpty(postgresUrl)) {
26 | val flyway = new Flyway()
27 | flyway.setOutOfOrder(true)
28 | flyway.setDataSource(s"jdbc:$postgresUrl", "", "")
29 | flyway.migrate()
30 | }
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
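
A minimal sketch of standing up the database layer above; the connection values are placeholders (the real ones live in the postgres block of application.conf), and the migration naturally needs a reachable Postgres whose URL both the async driver and Flyway accept:

import com.typesafe.config.{Config, ConfigFactory}
import schemer.registry.sql.{DatabaseConfig, SqlDatabase}

val dbConfig = new DatabaseConfig {
  override def rootConfig: Config = ConfigFactory.parseString(
    """
      |h2 {}
      |postgres {
      |  url = "postgresql://schemer:schemer@localhost:5432/schemer"
      |}
    """.stripMargin
  )
}

val db = SqlDatabase(dbConfig)
db.updateSchema() // Flyway applies db/migration/V1__creates_schemas.sql against jdbc:<url>
// import db.ctx._  // then query through the Quill context, with the DateTime quotes in scope
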
/schemer-registry/src/main/scala/schemer/registry/sql/package.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry
2 |
3 | import io.getquill.{PostgresAsyncContext, SnakeCase}
4 |
5 | package object sql {
6 | type DbContext = PostgresAsyncContext[SnakeCase]
7 | }
8 |
--------------------------------------------------------------------------------
/schemer-registry/src/main/scala/schemer/registry/utils/Clock.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.utils
2 |
3 | import org.joda.time.{DateTime, DateTimeZone, Duration}
4 | import org.joda.time.format.PeriodFormatterBuilder
5 |
6 | trait Clock {
7 | def now: DateTime
8 | def nowUtc: DateTime
9 | def nowMillis: Long
10 | }
11 |
12 | object RealTimeClock extends Clock with Serializable {
13 | def now = DateTime.now()
14 | def nowUtc = DateTime.now(DateTimeZone.UTC)
15 | def nowMillis = System.currentTimeMillis()
16 | }
17 |
18 | class FixtureTimeClock(millis: Long) extends Clock with Serializable {
19 | def now = new DateTime(millis)
20 | def nowUtc = new DateTime(millis, DateTimeZone.UTC)
21 | def nowMillis = millis
22 | }
23 |
24 | class FormatDuration() {
25 | def format(time: Duration): String = {
26 | val period = time.toPeriod()
27 | val hms = new PeriodFormatterBuilder()
28 | .printZeroAlways()
29 | .appendHours()
30 | .appendSeparator(" hours ")
31 | .appendMinutes()
32 | .appendSeparator(" minutes ")
33 | .appendSeconds()
34 | .appendSuffix(" seconds")
35 | .toFormatter()
36 | hms.print(period)
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
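
A small sketch exercising the two clocks and the duration formatter above; the millisecond values are arbitrary:

import org.joda.time.Duration
import schemer.registry.utils.{FixtureTimeClock, FormatDuration}

val fixed = new FixtureTimeClock(1514764800000L) // 2018-01-01T00:00:00Z
fixed.nowMillis                                  // always 1514764800000L, which keeps tests deterministic

val formatted = new FormatDuration().format(new Duration(2 * 3600 * 1000L + 5 * 60 * 1000L))
// "2 hours 5 minutes 0 seconds"

Modules binds RealTimeClock as the implicit Clock; FixtureTimeClock is the natural drop-in for tests.
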
/schemer-registry/src/main/scala/schemer/registry/utils/DateTimeUtils.scala:
--------------------------------------------------------------------------------
1 | package schemer.registry.utils
2 |
3 | import java.nio.charset.StandardCharsets
4 | import java.util.Base64
5 |
6 | import org.joda.time.DateTime
7 | import schemer.registry.Cursor
8 |
9 | object DateTimeUtils {
10 | implicit class DateTimeCursor(val dt: DateTime) {
11 | def toCursor: Cursor = Base64.getEncoder.encodeToString(dt.getMillis.toString.getBytes(StandardCharsets.UTF_8))
12 | }
13 |
14 | implicit class CursorDateTime(val cursor: Cursor) {
15 | def toDateTime: DateTime =
16 | new DateTime(new String(Base64.getDecoder.decode(cursor), StandardCharsets.UTF_8).toLong)
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
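
A round-trip sketch for the cursor helpers above; Cursor here is the alias from the schemer.registry package object, effectively a base64-encoded String:

import org.joda.time.DateTime
import schemer.registry.utils.DateTimeUtils._

val createdOn = new DateTime(1514764800000L)
val cursor    = createdOn.toCursor  // Base64 of "1514764800000"
val restored  = cursor.toDateTime   // decoded back to the same instant

assert(restored.getMillis == createdOn.getMillis)
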
/schemer-ui.md:
--------------------------------------------------------------------------------
1 | # Schemer UI screens
2 |
3 | ## Browse schemas
4 |
5 |
6 |
7 |
8 |
9 | ## Schema Details
10 |
11 |
12 |
13 |
14 |
15 | ## JSON representation of Schema
16 |
17 |
18 |
19 |
20 |
21 | ## Create Schema
22 |
23 |
24 |
25 |
26 |
27 | ## Create Schema Version
28 |
29 |
30 |
31 |
32 |
33 | ## Field definition Wizard
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/secring.gpg.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/secring.gpg.enc
--------------------------------------------------------------------------------
/sonatype.sbt:
--------------------------------------------------------------------------------
1 | credentials += Credentials(
2 | "Sonatype Nexus Repository Manager",
3 | "oss.sonatype.org",
4 | System.getenv("SONATYPE_USERNAME"),
5 | System.getenv("SONATYPE_PASSWORD")
6 | )
7 |
--------------------------------------------------------------------------------