├── .gitignore ├── .scalafmt.conf ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── docker-compose.yml ├── project ├── Dependencies.scala ├── build.properties ├── plugins.sbt └── project │ └── plugins.sbt ├── publish.sh ├── pubring.gpg.enc ├── resources └── images │ ├── 001.png │ ├── 002.png │ ├── 003.png │ ├── 004.png │ ├── 005.png │ ├── 006.png │ ├── 007.png │ ├── 008.png │ ├── schemer-logo-text-wide.png │ ├── schemer-logo-text-wide.svg │ ├── schemer-logo-text.png │ ├── schemer-logo-text.svg │ ├── schemer-logo.png │ └── schemer-logo.svg ├── schemer-core └── src │ ├── main │ └── scala │ │ └── schemer │ │ ├── AvroSchema.scala │ │ ├── CSVSchema.scala │ │ ├── JSONSchema.scala │ │ ├── ParquetSchema.scala │ │ ├── SchemaLike.scala │ │ ├── Schemer.scala │ │ └── utils │ │ ├── JSONUtil.scala │ │ └── JsonSchemaValidationUtil.scala │ └── test │ ├── resources │ ├── test.csv │ ├── test.json │ └── test.tsv │ └── scala │ └── schemer │ ├── AvroSchemaSpec.scala │ ├── CSVSchemaSpec.scala │ ├── Helpers.scala │ ├── JSONSchemaSpec.scala │ └── ParquetSchemaSpec.scala ├── schemer-registry └── src │ └── main │ ├── resources │ ├── application.conf │ ├── aws-core-site.xml │ ├── db │ │ └── migration │ │ │ └── V1__creates_schemas.sql │ └── graphql │ │ └── graphiql.html │ └── scala │ └── schemer │ └── registry │ ├── actors │ └── InferActor.scala │ ├── dao │ └── SchemaDao.scala │ ├── exceptions │ ├── SchemerException.scala │ ├── SchemerInferenceException.scala │ ├── SchemerSchemaCreationException.scala │ └── SchemerSchemaVersionCreationException.scala │ ├── graphql │ ├── CustomGraphQLResolver.scala │ ├── GraphQLService.scala │ └── schema │ │ ├── GraphQLCustomTypes.scala │ │ ├── InferType.scala │ │ ├── MetadataType.scala │ │ ├── MutationType.scala │ │ ├── SchemaDefinition.scala │ │ └── SchemaType.scala │ ├── models │ └── Schema.scala │ ├── package.scala │ ├── routes │ ├── GraphQLRoutes.scala │ ├── HealthRoutes.scala │ ├── Routes.scala │ └── SwaggerRoutes.scala │ ├── server │ ├── ConfigWithDefault.scala │ ├── InferenceConfig.scala │ ├── Main.scala │ ├── Modules.scala │ └── ServerConfig.scala │ ├── sql │ ├── DatabaseConfig.scala │ ├── SqlDatabase.scala │ └── package.scala │ └── utils │ ├── Clock.scala │ └── DateTimeUtils.scala ├── schemer-ui.md ├── secring.gpg.enc └── sonatype.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | *.log 4 | schemer_db/ 5 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | style = defaultWithAlign 2 | maxColumn = 120 3 | align.openParenCallSite = false 4 | align.openParenDefnSite = false 5 | danglingParentheses = true 6 | 7 | rewrite.rules = [RedundantBraces, RedundantParens, SortImports, PreferCurlyFors] 8 | rewrite.redundantBraces.includeUnitMethods = true 9 | rewrite.redundantBraces.stringInterpolation = true 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.11 4 | jdk: 5 | - oraclejdk8 6 | sudo: required 7 | services: 8 | - docker 9 | before_cache: 10 | - find $HOME/.sbt -name "*.lock" | xargs rm 11 | - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm 12 | cache: 13 | directories: 14 | - "$HOME/.ivy2/cache" 15 | - "$HOME/.sbt/boot/" 16 | before_deploy: 17 | - openssl aes-256-cbc -pass 
pass:$ENCRYPTION_PASSWORD -in secring.gpg.enc -out local.secring.gpg 18 | -d 19 | - openssl aes-256-cbc -pass pass:$ENCRYPTION_PASSWORD -in pubring.gpg.enc -out local.pubring.gpg 20 | -d 21 | deploy: 22 | - provider: script 23 | script: "./publish.sh" 24 | skip_cleanup: true 25 | on: 26 | tags: true 27 | jdk: oraclejdk8 28 | scala: 2.11.11 29 | env: 30 | global: 31 | - BUILD_LABEL=1.0.${TRAVIS_BUILD_NUMBER} 32 | - secure: KUG4LdZBUn11TWMysOvF7jbwkHZzWRUIJqz6HlcXEOIdN4rzt2An0L2KB9L5L+DG6M2e0aHyl6ajeopSF8SYV3OYmPe2RKO2pJifhfn9Z301gpV3nKQKp2maZHTeoOSeWNOHV1APJAd5WyHsmPp1zKGHc7ePLcse9f45KuK0YxOnM+i1APUzWsRdS3PSaDfByP++LYNh4V8Bz4Xl75TgLs/kUtY2xxmFakWZe0PE87qjdadAI42Q7/b6sWHb17WNIzMg/8Vfn//C7XsQjmhXS5dwa2JilyFTZKLJ0N89h2KoMGgF4rJQWS7DLo5mFccqnAvx1iN4gZvzmvyQgZ2JCK4QxAxxC1umcJvJ4Yf7pzZaHqMrthCLz8eNixFSYtoCskJzyrHyow99MhOTnv9NcsOZxdT25j20wI2E2JxoYRDLOZcBd7xSiXCmyzQE3/V+FEaqCUcY2dFHdA8V6GnOmetycgxyWd23S7dCM4IQEB8UAnCM/UgIWGIhTAMbxUtJKEIMfA+VVQwJUPvKNkVB2HjUUK5q+//+B9H9WJrvR16urcyC8MpgWJWex5J7mwlvs1999t1S/EUPUmHlJQX7XQuN5kSOyQ2B+L+r0FW8LyHx4ffxy1qW9x9kFzGsWNKTKJd3PWJONKlcdWLYU/T56EFBzWZwPt74sVCRhtQ0G30= 33 | - secure: 2hLgeG61N9TZ8xNRSMCMlMiVatsoxsTT6Gj1ohbZC2cMTsEmMVPvYxOkBrjqu2VsJBcmyyiKqw8LX1FQBvU3eDIOss+mIQndjTvJIZIw1jx2KK6TNliT4nK0sORyHFDAmXR43AZPjpJVsg28uR80/Nwr6+7OoZ6Cy8kA2w+iO33vW2V/jXhD3H1XQRd+T7NE9RQigcY7OVjq1SqHE+6sH8CVQL94cqzV7fhHVeC5oBoze4bn+KcmK/iRl6zW9cXlyS+kjhPkVrS0NOADcXnCjUkkw+5N1TWSABXSBBO3VxkuE6W311T/NeNYFDP7CrjzrH1XEyTXjf9A3RVyJVRIEaDX4SHadb6eLRWSAgTO1d6tRxl8IcsnDoYghiMw61ggx4tkV6OQa5nF0U+0GJOYC/HLnqSIkv7/YM6LMp9AprWlWD+wB9wQ4l/09Ssed7hz/VoWnL7ymDIj/fJnsFnoOIJebciAo2pLWyftjjBIxEnQbGVyXSRdXVgxKpTyIeQY57xZPBugi4Q2KmBrLl+l2ean49eb827mMhM8FBdyLqbM8FKFln8E7Hokzw835/HKn23pqzLGO8/H/6rHOBkqwmjoL06FbiqHZyV16XCr0Jy/Bh6xoQaCYoE8ZhcTYzeBkTDzJIcfXqWIvNRUQzwHu+EogEXju/3c5G/2nITdz+g= 34 | - secure: yyiWyS++Q/ZUvRcRRi8Md0u7VqwQvA74cn57eSLJuUFEHsr4fDjVqkA9HhwUCWdCJC5UIm72c9gqkuyVjq9Vxb3M+YPF6h+sCsOfH3bDowq8ozWBIC1W8giORNAoANVcaFnDgd+9RkXqkIAarVgyff1r5Cdks5EfQ0lsCnaphdyrO0h1jQx8A5UMCZU6za52NDviyn92TUH5Z+XEBJzWg7/WV18zfr8/5VPK5ErYzcNk6s9Gq8WfLtf3b9aCdOkjBB1HzqB/pqEMOfxtERjSBtw66lB5Fzn6xk0O0cn4jfzyI/DpFfzi9Ecwuwm/kMX+ZSPUOe/riEt5/D5wJKEMio0G2Zt77Ulkq72ed2h801gLzjuNLvN3b+zk38Va0bPgWGARFVGVbAytve1xcpCJxQ8ZG+hgFq95WoTYguWUT2ny3xo+0aZm+r/jqT8o05p/UjbzPsTAeTsL0m8Qxr/+JPwvuLZIekzIg23JBe5PPHaIp5PdgYfCv+s02TdZNGEIYa9K3jo6yUsnZXClJaZov9tmmkvyw95/2bhgZfWn5dDqyNu/YrpsQ9k+sDfC01UmXXHl10jAWBy+wjAYw2F7UTR9/MVXC041j6vZmOrfVLdYvXvJslmVjR2qUq9zBrF6uZIOCaARMoy8YCcJFTruHFemdUJ0ElCOtrfpuZkDnkY= 35 | - secure: vvDoGnUlOIDqvN1+zKjxXUeOCEQ9AFURhXV1y8lUDBqZXHshoEV6MZKe7cL2vkBrCGLgEAOlX7vplW44spIxkL78zLtyxmUCsla4VcCT41iYkrBCM4LOKlzsKM3U0xoo76EP9zYXul6O/vIL9W9RvL78RyiC3C4vC8tUahuoxaWpyYvOFcKEI7sVf1O7/HeAKqaRRitsAuk8nfZDkTMU8FRjvxp2aY7I3hGaQVVMKe3lpns3R7T74FuQvnkfyr7T/O0/GnGKJELl8lBtdQlsWoZOXYQL7ZzaEWVjFsmWTNqtKU7DMv9j5eC/4JWqnQK2A3Eda0bPx0sgbymM79IAqBw5fwauyfUFMOsvmA/bEDoDF8cD/moPJmZBkxa2HC1DKgpsWE46fFJqqTZKaaaDZulkviGfqBUbwmZqGDlm1Xo3vrYBCOm1tXgzkuPRoksoGN3K3u2gUL9IeE0kiR0xx0qp62R/Kh51cycY1/w8yaNNOfB+1YdcMb7wv3SWGWohE0R/ke7x5luOvV5TCyqOJjmQb1OkzT4AeqACYjTBK6JhgcE5XqsveaF4NuKSA6CtfV19jgyFDEKjU8202SWeLzQew/RFFTSUea3xD/PfBQcKkGizfmCHF1tJmEqFgP09u7uqt0X8q3xls3882CGU46MB6GNmkB+nwXu2DCD1WyQ= 36 | - secure: 
loYtA4a1MMNZDyezwFf/2PNkJeli0E9mpmzeHeWQ1oB5+CVHE/vruOhXPHKDBU/RUBqM2Y09d00fA1Jz0N+0KFhj2nAJDZz2UR6LysOo56Xps4E+NSmMd72qntQtwP3qmWa6kqUYjr31y4i4O2zwlG/gRmq1BNBoouco3BIat8CMOram3WKgjyQSnvLIq9jtrq45uRc0po+nCzqnyGwmwG2sZKkajPTQpO4VjTPu+fmW8bbGhSC62UI2JpWnWev/8CNKaERvzs7s4iwf2ksMoS55iQSyixMp5k1L/qKtee6dYkbZli8gSuhdyNYaIIzTfV42YEUT7Jmwdxcw24sdgHcrCcQNtnVAGvW/EyMq+f1lwoJTabekfAcij30W+lwShnbS/69itl7gozm5dcClmihrtMUgQbb50b8Err+HHayy8xGDipCvtpoAku9NrSG+aX5BZA6BXmZw0VfUiPZXMQFMWA/n40cqzZclPb3cKQSbDd0WfukkmL/nWh2RKxyOLZgH0FLgzoDt3U8pUdR2YmDqZidUoVG9Iz4Lt5c7CTzNSoEaUhWiSFC+tytwn8OPzYq1uJtwv8lPaWvqQA/5+c9Tni5G2LsfRrrBlxSkHkCWqPuhVze5ByxLRank7CBOwBjLOJmv9tZt4NJX7YJhdYAs/jV7ya4jJBVmw1zUdhk= 37 | - secure: G0Wv6Odzqh6GrkyxbV6v8Xq1seQD97rX9C+NfuzwTAh2MVUopnPZGh/Oakiwdb/kTAIpb70dKrZaYqv+mTz/1oTQMg4T2e1OAgItZwR/d6SB/cOKjkxtFgI/q0N5/ukw1N5LE7LrTwoBWtoYIdTO4EqoxTcTkbx+VJczzMAlD3VRICzMsvpxXoOw0BkvESTxEU/yrX3AxRA5VziVvFeSb8cpmgvNYAWmlczaujChPedWOWoO0Z2lFhFU8LYBBzsIwBSIOd6f93vlOCOGQtYOCx4FybRkHs7B1Yjihfz68prR047vDvuT4y+evI9xIpvk3AjN2zkV92EXCo26/fqbepooiQ2cVSpQs+RtTT5P2jPasWFwm1K9G6a/5Ir1dFB7CXKl9EmEUxBnRR0+ZWYWssVc6sTduxvSpMx80CBlWWsHkSlbD/by5fWQptgQp4DYG63Pifc+MuPEHjhl3z1PvyhtccWtGbrYsCNoWlGRT497E8QaBQ8BValqg9sgZ/He9RzzHB6UBve8co92W8rgfOdE+JlgJEr+oIkbB5jnnJIGqC6cDW10aWfjCzIotfmk/MACvaaLTkXT4U786iktf0AR06kPOKssFGrvrqSFqui/ugbHsggBYbfgRC0D76VLQdBPFSuJNplgJamGvFGeoQHk6v+v08dNrQ70S/qp894= 38 | - secure: q5a1a4lDIQLaU32xilGkP91zGiKqW+m5KwuzwAhAIirmHZ9sD6cky5zoOPc6oYCeE4UGIowv9gkGJyMPMD+Jl7pom47/FfmvsS/pcMCzasgWueEsjF2Y9TRHdRPyeGFejJzLgOWu78ss8HOobmlHZPrXzl3Zxx+HkLfwFJ23bH7IIyBYmhC54XGCBuIxU/sbkUctCICOUNXxpzLey63dV/O4cGLzYkpXMtS/6CbwwQj/Hj+TKzaaonqeOAMY4hPj9WE0Vc/QnhstGGHHDIAhBYkNdvs1NLNvxRa1R0i+uGGjnQVhWXOfrKnFhn5qWHOYXvk58f+iW2Ey69Tg21iqOA9SBgrjtDUQLRhP9iAQJbu6EevKLFZcXqfeFJtDfDiUTxpycLWj8GD4k+sz40O2DcIGiPRvzI5v1ZN87baeojUGCjP7I3c2Mv4XigylZzhPJ7m9rPV/G5WG8aKaK/qUpJ7ynl4XpFAMcC7QGb+eQXOw6BqFMhDw4sGR2qu3LQ/flxmpzXZPkHyLK6m2oSGXVbk0w2UvO9pnF+XillQ4YOzB1O+OqbqjZtDu+ZmrB4NK/QzvyGyUS6C/7ancDdlgHZhcoLw50jp27msD+JBv5NcR0y1IAvPwwWqiyg/teaOxl/hZ80BT0AwpC0nNc83UbORGYGcLr6o46OoZNP5xlQ4= 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # schemer 2 | [![Build Status](https://travis-ci.org/indix/schemer.svg?branch=master)](https://travis-ci.org/indix/schemer) [![Maven](https://maven-badges.herokuapp.com/maven-central/com.indix/schemer-core_2.11/badge.svg)](http://repo1.maven.org/maven2/com/indix/schemer-core_2.11/) [![Docker Pulls](https://img.shields.io/docker/pulls/indix/schemer-registry.svg)](https://hub.docker.com/r/indix/schemer-registry/) 3 | 4 |

5 | 6 |

7 | 8 | Schema registry with support for CSV, TSV, AVRO, JSON and Parquet. It can also infer schemas from a given data source. 9 | 10 | ## Schemer UI [WIP] 11 | 12 |

13 | 14 |

15 | 16 | Schemer UI is the wizard-based frontend for Schemer. It provides a wizard-driven schema creation and versioning workflow, along with browsing and search capabilities. It is a work in progress. [More screens](schemer-ui.md) 17 | 18 | ## Schemer Core 19 | 20 | `schemer-core` is the core library that implements most of the logic needed to understand the supported schema types, along with schema inference. To use `schemer-core` directly, just add it to your dependencies: 21 | 22 | ``` 23 | libraryDependencies += "com.indix" %% "schemer-core" % "v0.2.3" 24 | ``` 25 | 26 | ## Schemer Registry 27 | 28 | `schemer-registry` is a schema registry for storing metadata about schemas and schema versions. It provides a GraphQL API for adding, viewing and inferring schemas. 29 | 30 | Schemer Registry is available as a [Docker image on Docker Hub](https://hub.docker.com/r/indix/schemer-registry/). 31 | 32 | ### Running Locally 33 | 34 | A local Docker-based PostgreSQL instance can be run as follows: 35 | 36 | ``` 37 | docker run -e POSTGRES_USER=schemer -e POSTGRES_PASSWORD=schemer -e PGDATA=/var/lib/postgresql/data/pgdata -e POSTGRES_DB=schemer -v $(pwd)/schemer_db:/var/lib/postgresql/data/pgdata -p 5432:5432 postgres:9.5.0 38 | ``` 39 | 40 | Remove the `schemer_db` folder to clear all data and start from scratch. 41 | 42 | The registry service can be run using `sbt`: 43 | 44 | ```bash 45 | sbt "project registry" ~reStart 46 | ``` 47 | 48 | 49 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | import com.typesafe.sbt.packager.Keys.{daemonUser, dockerBaseImage, dockerExposedPorts, dockerRepository, packageName} 3 | import spray.revolver.RevolverPlugin 4 | import spray.revolver.RevolverPlugin.autoImport.Revolver 5 | 6 | val libVersion = sys.env.get("TRAVIS_TAG") orElse sys.env.get("BUILD_LABEL") getOrElse s"1.0.0-${System.currentTimeMillis / 1000}-SNAPSHOT" 7 | 8 | lazy val publishSettings = Seq( 9 | publishMavenStyle := true, 10 | pgpSecretRing := file("local.secring.gpg"), 11 | pgpPublicRing := file("local.pubring.gpg"), 12 | pgpPassphrase := Some(sys.env.getOrElse("GPG_PASSPHRASE", "").toCharArray), 13 | credentials += Credentials( 14 | "Sonatype Nexus Repository Manager", 15 | "oss.sonatype.org", 16 | System.getenv("SONATYPE_USERNAME"), 17 | System.getenv("SONATYPE_PASSWORD") 18 | ), 19 | publishTo := { 20 | val nexus = "https://oss.sonatype.org/" 21 | if (isSnapshot.value) 22 | Some("snapshots" at nexus + "content/repositories/snapshots") 23 | else 24 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 25 | }, 26 | publishArtifact in Test := false, 27 | pomIncludeRepository := { _ => 28 | false 29 | }, 30 | pomExtra := 31 | <url>https://github.com/indix/schemer</url> 32 | <licenses> 33 | <license> 34 | <name>Apache License</name> 35 | <url>https://raw.githubusercontent.com/indix/schemer/master/LICENSE</url> 36 | <distribution>repo</distribution> 37 | </license> 38 | </licenses> 39 | <scm> 40 | <url>git@github.com:indix/schemer.git</url> 41 | <connection>scm:git:git@github.com:indix/schemer.git</connection> 42 | </scm> 43 | <developers> 44 | <developer> 45 | <id>indix</id> 46 | <name>Indix</name> 47 | <url>http://www.indix.com</url> 48 | </developer> 49 | </developers> 50 | ) 51 | 52 | lazy val schemer = Project( 53 | id = "schemer", 54 | base = file(".") 55 | ) aggregate (core, registry) 56 | 57 | lazy val core = (project in file("schemer-core")) 58 | .settings( 59 | inThisBuild( 60 | List( 61 | organization := "com.indix", 62 | scalaVersion := "2.11.11", 63 | crossScalaVersions := Seq("2.11.11"), 64 | version := libVersion, 65 | scalafmtOnCompile := true 66 | ) 67 | ), 68 | name
:= "schemer-core", 69 | libraryDependencies ++= sparkStackProvided ++ Seq(jsonSchemaValidator, scalaTest) 70 | ) 71 | .settings(publishSettings: _*) 72 | 73 | lazy val registry = (project in file("schemer-registry")) 74 | .enablePlugins(BuildInfoPlugin) 75 | .enablePlugins(AshScriptPlugin) 76 | .enablePlugins(JavaAppPackaging) 77 | .enablePlugins(DockerPlugin) 78 | .settings( 79 | dockerBaseImage := "anapsix/alpine-java:8u131b11_server-jre_unlimited", 80 | packageName in Docker := "schemer-registry", 81 | dockerExposedPorts := Seq(9000), 82 | version in Docker := libVersion, 83 | daemonUser in Docker := "root", 84 | dockerRepository := Some("indix"), 85 | Revolver.enableDebugging(port = 5005, suspend = false) 86 | ) 87 | .settings( 88 | inThisBuild( 89 | List( 90 | organization := "com.indix", 91 | scalaVersion := "2.11.11", 92 | version := libVersion, 93 | scalafmtOnCompile := true 94 | ) 95 | ), 96 | name := "schemer-registry", 97 | libraryDependencies ++= sparkStack ++ akkaStack ++ loggingStack ++ Seq( 98 | hadoopAws, 99 | sangria, 100 | sangriaSpray, 101 | postgres, 102 | quill, 103 | quillAsyncPostgres, 104 | flyway, 105 | prometheusClient, 106 | prometheusClientCommon, 107 | prometheusHotspot, 108 | scalaTest 109 | ), 110 | excludeDependencies ++= Seq( 111 | ExclusionRule("com.typesafe.scala-logging", "scala-logging-slf4j_2.11") 112 | ) 113 | ) dependsOn core 114 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | postgres: 5 | image: postgres:9.5.0 6 | ports: 7 | - 5432:5432 8 | environment: 9 | - POSTGRES_USER=schemer 10 | - POSTGRES_PASSWORD=schemer 11 | - PGDATA=/var/lib/postgresql/data/pgdata 12 | - POSTGRES_DB=schemer 13 | volumes: 14 | - ./schemer_db:/var/lib/postgresql/data/pgdata 15 | schemer: 16 | image: indix/schemer-registry:latest 17 | restart: always 18 | ports: 19 | - 9000:9000 20 | depends_on: 21 | - postgres 22 | environment: 23 | - POSTGRES_URL=postgresql://postgres:5432/schemer?user=schemer&password=schemer 24 | 25 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt.{ExclusionRule, _} 2 | 3 | object Versions { 4 | val sparkVersion = "2.3.1" 5 | val akkaHttpVersion = "10.0.10" 6 | } 7 | 8 | object Dependencies { 9 | lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.0.3" % Test 10 | lazy val sparkCore = "org.apache.spark" %% "spark-core" % Versions.sparkVersion 11 | lazy val sparkSql = "org.apache.spark" %% "spark-sql" % Versions.sparkVersion 12 | lazy val sparkAvro = "com.databricks" %% "spark-avro" % "4.0.0" 13 | 14 | lazy val sparkStack = Seq(sparkCore, sparkSql, sparkAvro) 15 | lazy val sparkStackProvided = sparkStack.map(_ % Provided) 16 | 17 | lazy val hadoopAws = "org.apache.hadoop" % "hadoop-aws" % "2.6.0" 18 | 19 | lazy val jsonSchemaValidator = "com.github.fge" % "json-schema-validator" % "2.2.6" excludeAll { 20 | ExclusionRule("javax.mail") 21 | } 22 | 23 | lazy val prometheusClient = "io.prometheus" % "simpleclient" % "0.2.0" 24 | lazy val prometheusClientCommon = "io.prometheus" % "simpleclient_common" % "0.2.0" 25 | lazy val prometheusHotspot = "io.prometheus" % "simpleclient_hotspot" % "0.2.0" 26 | 27 | lazy val akkaHttpCore = "com.typesafe.akka" %% "akka-http-core" % Versions.akkaHttpVersion 28 | lazy val akkaHttp = 
"com.typesafe.akka" %% "akka-http" % Versions.akkaHttpVersion 29 | lazy val sprayJsonAkka = "com.typesafe.akka" %% "akka-http-spray-json" % Versions.akkaHttpVersion 30 | lazy val akkaHttpTestkit = "com.typesafe.akka" %% "akka-http-testkit" % Versions.akkaHttpVersion % Test 31 | lazy val akkaStack = 32 | Seq(akkaHttpCore, akkaHttp, sprayJsonAkka, akkaHttpTestkit) 33 | 34 | lazy val sangria = "org.sangria-graphql" %% "sangria" % "1.2.0" 35 | lazy val sangriaSpray = "org.sangria-graphql" %% "sangria-spray-json" % "1.0.0" 36 | 37 | val logbackClassic = "ch.qos.logback" % "logback-classic" % "1.2.3" 38 | val scalaLogging = "com.typesafe.scala-logging" %% "scala-logging" % "3.7.2" 39 | 40 | val loggingStack = Seq(logbackClassic, scalaLogging) 41 | 42 | val postgres = "org.postgresql" % "postgresql" % "9.4.1208" 43 | val quill = "io.getquill" %% "quill-jdbc" % "2.3.1" 44 | val quillAsyncPostgres = "io.getquill" %% "quill-async-postgres" % "2.3.1" 45 | val flyway = "org.flywaydb" % "flyway-core" % "4.1.1" 46 | } 47 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.0.4 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.lucidchart" % "sbt-scalafmt" % "1.12") 2 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.7.0") 3 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.2") 4 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0") 5 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.0") 6 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 7 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.9.1") 8 | scalafmtOnCompile in ThisBuild := true -------------------------------------------------------------------------------- /project/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.lucidchart" % "sbt-scalafmt" % "1.12") -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | sbt "project core" +publishSigned 6 | sbt sonatypeReleaseAll 7 | 8 | docker login -u "$DOCKER_USERNAME" -p "$DOCKER_PASSWORD" 9 | sbt docker:publishLocal 10 | docker push indix/schemer-registry:${TRAVIS_TAG} 11 | docker tag indix/schemer-registry:${TRAVIS_TAG} indix/schemer-registry:latest 12 | docker push indix/schemer-registry:latest 13 | -------------------------------------------------------------------------------- /pubring.gpg.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/pubring.gpg.enc -------------------------------------------------------------------------------- /resources/images/001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/001.png -------------------------------------------------------------------------------- /resources/images/002.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/002.png -------------------------------------------------------------------------------- /resources/images/003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/003.png -------------------------------------------------------------------------------- /resources/images/004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/004.png -------------------------------------------------------------------------------- /resources/images/005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/005.png -------------------------------------------------------------------------------- /resources/images/006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/006.png -------------------------------------------------------------------------------- /resources/images/007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/007.png -------------------------------------------------------------------------------- /resources/images/008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/008.png -------------------------------------------------------------------------------- /resources/images/schemer-logo-text-wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo-text-wide.png -------------------------------------------------------------------------------- /resources/images/schemer-logo-text-wide.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | schemer 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /resources/images/schemer-logo-text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo-text.png -------------------------------------------------------------------------------- /resources/images/schemer-logo-text.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | schemer 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /resources/images/schemer-logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/resources/images/schemer-logo.png -------------------------------------------------------------------------------- /resources/images/schemer-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/AvroSchema.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | import java.io.IOException 3 | 4 | import com.databricks.spark.avro.SchemaConverters 5 | import org.apache.avro.Schema.Parser 6 | import org.apache.avro.SchemaBuilder 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.types.StructType 9 | 10 | import scala.util.Random 11 | 12 | case class AvroSchemaBase() extends SchemaLikeBase[AvroSchema] { 13 | override def infer(paths: String*)(implicit spark: SparkSession) = { 14 | val schema = spark.read.format("com.databricks.spark.avro").load(paths: _*).schema 15 | 16 | AvroSchema(schema) 17 | } 18 | } 19 | 20 | case class AvroSchema(schema: String) extends SchemaLike { 21 | 22 | private def avroSchema() = new Parser().parse(schema) 23 | 24 | override def validate = 25 | try { 26 | sparkSchema() 27 | List.empty 28 | } catch { 29 | case e: IOException => List(s"Error while consuming Avro schema: ${e.getMessage}") 30 | } 31 | 32 | override def sparkSchema() = SchemaConverters.toSqlType(avroSchema()).dataType.asInstanceOf[StructType] 33 | 34 | override def toDf(paths: String*)(implicit spark: SparkSession) = 35 | spark.read.format("com.databricks.spark.avro").load(paths: _*) 36 | } 37 | 38 | object AvroSchema { 39 | def apply(): AvroSchemaBase = AvroSchemaBase() 40 | 41 | def apply(schema: StructType): AvroSchema = 42 | apply(schema, s"SchemerInferred_${Random.alphanumeric take 12 mkString ""}", "schemer") 43 | 44 | def apply(schema: StructType, record: String, namespace: String): AvroSchema = { 45 | val builder = SchemaBuilder.record(record).namespace(namespace) 46 | val avroSchema = SchemaConverters.convertStructToAvro(schema, builder, namespace).toString(true) 47 | new AvroSchema(avroSchema) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/CSVSchema.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import com.fasterxml.jackson.annotation.JsonProperty 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.{DataFrame, SparkSession} 6 | import schemer.utils.JSONUtil 7 | 8 | case class CSVOptions( 9 | header: Boolean = true, 10 | headerBasedParser: Boolean = false, 11 | separator: String = ",", 12 | quoteChar: String = "\"", 13 | escapeChar: String = "\\" 14 | ) 15 | 16 | case class CSVSchemaBase(csvOptions: CSVOptions) extends SchemaLikeBase[CSVSchema] { 17 | override def infer(paths: String*)(implicit @transient spark: SparkSession) = { 18 | val schema = spark.read 19 | .option("header", csvOptions.header.toString) 20 | .option("delimiter", csvOptions.separator) 21 | .option("quote", csvOptions.quoteChar) 22 | .option("escape", csvOptions.escapeChar) 23 | .option("nullValue", null) 24 | .option("inferSchema", "true") 25 | .csv(paths: _*) 26 | .schema 27 | 28 | CSVSchema(schema, csvOptions) 29 | } 30 | } 31 | 
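// Illustrative sketch (not part of the original file): inferring a CSV schema and reading the data
// back with it. Assumes an implicit SparkSession and a hypothetical input path "data/products.csv";
// the calls mirror those exercised in CSVSchemaSpec.
//
//   implicit val spark: SparkSession = SparkSession.builder.master("local[*]").getOrCreate()
//   val inferred = CSVSchema(CSVOptions(header = true)).infer("data/products.csv") // infer field names and types
//   val df       = inferred.toDf("data/products.csv")                              // DataFrame read with the inferred schema
//   val json     = inferred.schema()                                               // JSON representation of fields and options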
32 | case class CSVSchema( 33 | @JsonProperty(required = true) fields: List[CSVField], 34 | options: CSVOptions = CSVOptions() 35 | ) extends SchemaLike { 36 | 37 | override def validate: List[String] = 38 | validateFields ++ validateMetaFields 39 | 40 | override def sparkSchema() = { 41 | val structFields = this.fields.map(field => StructField(field.name, getDataType(field.`type`), field.nullable)) 42 | StructType(structFields) 43 | } 44 | 45 | def toDf(paths: String*)(implicit @transient spark: SparkSession) = { 46 | val csvDF = spark.read 47 | .option("delimiter", options.separator) 48 | .option("quote", options.quoteChar) 49 | .option("escape", options.escapeChar) 50 | .option("nullValue", null) 51 | .csv(paths: _*) 52 | val orderedSchema = reconcileSchemaFieldOrder(sparkSchema(), csvDF) 53 | 54 | spark.read 55 | .option("header", options.header.toString) 56 | .option("delimiter", options.separator) 57 | .option("quote", options.quoteChar) 58 | .option("escape", options.escapeChar) 59 | .option("nullValue", null) 60 | .schema(orderedSchema) 61 | .csv(paths: _*) 62 | } 63 | 64 | private def reconcileSchemaFieldOrder(sparkSchema: StructType, csvDF: DataFrame) = 65 | if (options.headerBasedParser && options.header) { 66 | val actualHeaders = csvDF 67 | .first() 68 | .toSeq 69 | .map(_.toString) 70 | StructType(actualHeaders.map(field => sparkSchema(sparkSchema.fieldIndex(field)))) 71 | } else { 72 | sparkSchema 73 | } 74 | 75 | private def getDataType(csvFieldType: String) = 76 | csvFieldType.toLowerCase match { 77 | case "int" | "integer" => IntegerType 78 | case "long" => LongType 79 | case "double" => DoubleType 80 | case "float" => FloatType 81 | case "string" => StringType 82 | case "datetime" => DateType 83 | case "boolean" => BooleanType 84 | case _ => StringType 85 | } 86 | 87 | private def validateFields = 88 | if (fields.nonEmpty) { 89 | List.empty 90 | } else { 91 | List("fields can't be empty in a CSVSchema") 92 | } 93 | 94 | private def validateMetaFields = 95 | if (options.header && fields.exists(_.position.isEmpty)) { 96 | List("CSVSchema with hasHeader=false should have valid position numbers on all fields") 97 | } else { 98 | List.empty 99 | } 100 | 101 | override def schema() = 102 | JSONUtil.toJson(this) 103 | } 104 | 105 | object CSVSchema { 106 | def apply(schema: String): CSVSchema = 107 | JSONUtil.fromJson[CSVSchema](schema) 108 | 109 | def apply(options: CSVOptions): CSVSchemaBase = 110 | CSVSchemaBase(options) 111 | 112 | def apply(): CSVSchemaBase = 113 | CSVSchemaBase(CSVOptions()) 114 | def apply( 115 | schema: StructType, 116 | options: CSVOptions 117 | ): CSVSchema = { 118 | val fields = schema.fields.zipWithIndex.map { 119 | case (f: StructField, i: Int) => CSVField(f.name, f.nullable, getCsvType(f.dataType), Some(i)) 120 | }.toList 121 | 122 | new CSVSchema(fields, options) 123 | } 124 | 125 | def apply( 126 | schema: StructType, 127 | options: Map[String, String] 128 | ): CSVSchema = { 129 | val fields = schema.fields.zipWithIndex.map { 130 | case (f: StructField, i: Int) => CSVField(f.name, f.nullable, getCsvType(f.dataType), Some(i)) 131 | }.toList 132 | 133 | val csvOptions = CSVOptions( 134 | options.getOrElse("header", "true").toBoolean, 135 | options.getOrElse("headerBasedParser", "true").toBoolean, 136 | options.getOrElse("separator", ","), 137 | options.getOrElse("quoteChar", "\""), 138 | options.getOrElse("escapeChar", "\\") 139 | ) 140 | 141 | new CSVSchema(fields, csvOptions) 142 | } 143 | 144 | private def getCsvType(sparkType: DataType) = 
sparkType match { 145 | case IntegerType => "int" 146 | case LongType => "long" 147 | case DoubleType => "double" 148 | case FloatType => "float" 149 | case StringType => "string" 150 | case DateType => "datetime" 151 | case BooleanType => "boolean" 152 | case _ => "string" 153 | } 154 | } 155 | 156 | case class CSVField( 157 | name: String, 158 | nullable: Boolean, 159 | `type`: String, 160 | position: Option[Int] 161 | ) 162 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/JSONSchema.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import com.fasterxml.jackson.databind.JsonNode 4 | import com.github.fge.jackson.JsonLoader 5 | import com.github.fge.jsonschema.main.JsonSchemaFactory 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.types._ 8 | import schemer.utils.{JSONUtil, JsonSchemaValidationUtil} 9 | 10 | import scala.annotation.tailrec 11 | import scala.collection.JavaConverters._ 12 | 13 | abstract trait JSONSchemaNode { 14 | def toJSON: String = JSONUtil.toJson(this) 15 | } 16 | 17 | case class ObjectSchema( 18 | `type`: String = "object", 19 | properties: Map[String, JSONSchemaNode], 20 | additionalProperties: Boolean = false, 21 | $schema: Option[String] = None 22 | ) extends JSONSchemaNode 23 | 24 | case class StringSchema( 25 | `type`: String = "string", 26 | format: Option[String] = None, 27 | pattern: Option[String] = None, 28 | minLength: Option[Int] = None, 29 | maxLength: Option[Int] = None 30 | ) extends JSONSchemaNode 31 | 32 | case class IntegerSchema(`type`: String = "integer", minimum: Option[BigInt] = None, maximum: Option[BigInt] = None) 33 | extends JSONSchemaNode 34 | 35 | case class NumberSchema(`type`: String = "number", minimum: Option[Double] = None, maximum: Option[Double] = None) 36 | extends JSONSchemaNode 37 | 38 | case class BooleanSchema(`type`: String = "boolean") extends JSONSchemaNode 39 | 40 | case class ArraySchema(`type`: String = "array", items: JSONSchemaNode) extends JSONSchemaNode 41 | 42 | case class JSONSchemaBase() extends SchemaLikeBase[JSONSchema] { 43 | 44 | @tailrec 45 | private def processStructFields( 46 | fields: List[StructField], 47 | accum: List[(String, JSONSchemaNode)] = Nil 48 | ): List[(String, JSONSchemaNode)] = 49 | fields match { 50 | case x :: xs => 51 | processStructFields(xs, accum ++ List(processField(x))) 52 | case Nil => accum 53 | } 54 | 55 | private def processField(x: StructField) = 56 | (x.name, processDataType(x.dataType)) 57 | 58 | private def processDataType(dataType: DataType): JSONSchemaNode = dataType match { 59 | case StringType => StringSchema() 60 | case LongType | IntegerType => IntegerSchema() 61 | case DoubleType => NumberSchema() 62 | case BooleanType => BooleanSchema() 63 | case f if f.isInstanceOf[StructType] => convertSparkToJsonSchema(dataType.asInstanceOf[StructType]) 64 | case f if f.isInstanceOf[ArrayType] => 65 | ArraySchema(items = processDataType(dataType.asInstanceOf[ArrayType].elementType)) 66 | } 67 | 68 | def convertSparkToJsonSchema(schema: StructType, draft: Option[String] = None) = 69 | ObjectSchema(properties = processStructFields(schema.fields.toList).toMap, $schema = draft) 70 | 71 | override def infer(paths: String*)(implicit spark: SparkSession) = { 72 | val sampleJsonData = spark.read.textFile(paths: _*).limit(1000) 73 | val schema = spark.read.json(sampleJsonData.rdd).schema 74 | val jsonSchema = 
convertSparkToJsonSchema(schema, Some("http://json-schema.org/draft-06/schema#")).toJSON 75 | JSONSchema(jsonSchema) 76 | } 77 | } 78 | 79 | case class JSONSchema(schema: String) extends SchemaLike { 80 | 81 | private val jsonSchema = JsonLoader.fromString(schema) 82 | 83 | override def validate: List[String] = { 84 | val validator = JsonSchemaFactory.byDefault().getSyntaxValidator 85 | val report = validator.validateSchema(jsonSchema) 86 | val syntaxErrors = JsonSchemaValidationUtil.process(report) 87 | if (syntaxErrors.isEmpty) { 88 | try { 89 | sparkSchema() 90 | List.empty 91 | } catch { 92 | case e: UnsupportedOperationException => List(e.getMessage) 93 | } 94 | } else { 95 | syntaxErrors 96 | } 97 | } 98 | 99 | override def sparkSchema(): StructType = jsonToStructType(jsonSchema).asInstanceOf[StructType] 100 | 101 | def toDf(paths: String*)(implicit spark: SparkSession) = 102 | spark.read 103 | .schema(sparkSchema()) 104 | .json(paths: _*) 105 | 106 | private def getRequiredProps(jsonSchema: JsonNode) = 107 | if (jsonSchema.has("required") && jsonSchema.get("required").isArray) { 108 | Some(jsonSchema.get("required").elements().asScala.map(_.asText())) 109 | } else { 110 | None 111 | } 112 | 113 | private def toArrayType(field: JsonNode) = { 114 | val itemsNode = field.get("items") 115 | if (itemsNode != null && itemsNode.isArray) { 116 | ArrayType(jsonToStructType(itemsNode.get(0))) 117 | } else if (itemsNode != null && itemsNode.isObject) { 118 | ArrayType(jsonToStructType(itemsNode)) 119 | } else { 120 | ArrayType(StringType) 121 | } 122 | } 123 | 124 | private def toObjectType(jsonSchema: JsonNode) = { 125 | val requiredFields = getRequiredProps(jsonSchema).getOrElse(List.empty) 126 | if (jsonSchema.has("patternProperties")) { 127 | MapType( 128 | StringType, 129 | jsonToStructType(jsonSchema.get("patternProperties").fields().asScala.toList.head.getValue) 130 | ) 131 | } else { 132 | StructType( 133 | jsonSchema 134 | .get("properties") 135 | .fields() 136 | .asScala 137 | .toList 138 | .map(field => { 139 | val fieldType = jsonToStructType(field.getValue) 140 | StructField(field.getKey, fieldType, nullable = !requiredFields.toList.contains(field.getKey)) 141 | }) 142 | ) 143 | } 144 | } 145 | 146 | private def jsonToStructType(jsonSchema: JsonNode): DataType = 147 | jsonSchema.get("type").asText() match { 148 | case "array" => toArrayType(jsonSchema) 149 | case "object" => toObjectType(jsonSchema) 150 | case "boolean" => BooleanType 151 | case "string" => StringType 152 | case "integer" => LongType 153 | case "number" => DoubleType 154 | case _ => 155 | throw new UnsupportedOperationException( 156 | s"Trying to convert a unsupported type ${jsonSchema.get("type").asText()}. 
Types other than (boolean, string, integer, number, object, array) aren't supported" 157 | ) 158 | 159 | } 160 | } 161 | 162 | object JSONSchema { 163 | def apply(): JSONSchemaBase = JSONSchemaBase() 164 | def apply(schema: StructType): JSONSchema = JSONSchema(JSONSchemaBase().convertSparkToJsonSchema(schema).toJSON) 165 | } 166 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/ParquetSchema.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.StructType 5 | 6 | import scala.reflect.runtime.universe._ 7 | 8 | sealed trait ParquetSchemaType { 9 | val `type`: String 10 | } 11 | 12 | object ParquetSchemaType { 13 | case object Avro extends ParquetSchemaType { 14 | override val `type`: String = "avro" 15 | } 16 | case object Csv extends ParquetSchemaType { 17 | override val `type`: String = "csv" 18 | } 19 | case object Json extends ParquetSchemaType { 20 | override val `type`: String = "json" 21 | } 22 | 23 | val supportedTypes = List(Avro, Csv, Json).map(_.`type`) 24 | } 25 | 26 | case class ParquetSchemaBase[T <: SchemaLike: TypeTag](override val options: Map[String, String] = Map()) 27 | extends SchemaLikeBase[ParquetSchema] { 28 | override def infer(paths: String*)(implicit spark: SparkSession) = { 29 | val schema = spark.read.parquet(paths: _*).schema 30 | val underlyingSchema = typeOf[T] match { 31 | case t if t =:= typeOf[AvroSchema] => (ParquetSchemaType.Avro, AvroSchema(schema)) 32 | case t if t =:= typeOf[JSONSchema] => (ParquetSchemaType.Json, JSONSchema(schema)) 33 | case t if t =:= typeOf[CSVSchema] => (ParquetSchemaType.Csv, CSVSchema(schema, options)) 34 | } 35 | 36 | ParquetSchema(underlyingSchema._2.schema(), underlyingSchema._1) 37 | } 38 | } 39 | 40 | case class ParquetSchema(schema: String, `type`: ParquetSchemaType) extends SchemaLike { 41 | 42 | val schemaType = `type` match { 43 | case ParquetSchemaType.Avro => AvroSchema(schema) 44 | case ParquetSchemaType.Csv => CSVSchema(schema) 45 | case ParquetSchemaType.Json => JSONSchema(schema) 46 | } 47 | 48 | override def validate = schemaType.validate 49 | 50 | def toDf(paths: String*)(implicit spark: SparkSession) = 51 | spark.read 52 | .schema(sparkSchema()) 53 | .parquet(paths: _*) 54 | 55 | override def sparkSchema(): StructType = schemaType.sparkSchema() 56 | } 57 | 58 | object ParquetSchema { 59 | def apply[T <: SchemaLike: TypeTag]() = ParquetSchemaBase[T]() 60 | def apply(`type`: String) = `type` match { 61 | case ParquetSchemaType.Avro.`type` => apply[AvroSchema]() 62 | case ParquetSchemaType.Csv.`type` => apply[CSVSchema]() 63 | case ParquetSchemaType.Json.`type` => apply[JSONSchema]() 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/SchemaLike.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import org.apache.spark.sql.types.StructType 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | private[schemer] trait SchemaLikeBase[T <: SchemaLike] { 7 | val options: Map[String, String] = Map() 8 | def infer(paths: String*)(implicit @transient spark: SparkSession): T 9 | } 10 | 11 | private[schemer] trait SchemaLike { 12 | def validate: List[String] 13 | 14 | def sparkSchema(): StructType 15 | 16 | def schema(): String 17 | 18 | def toDf(paths: 
String*)(implicit @transient spark: SparkSession): DataFrame 19 | } 20 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/Schemer.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | sealed trait SchemaType { 4 | val `type`: String 5 | } 6 | 7 | object SchemaType { 8 | case object Avro extends SchemaType { 9 | override val `type`: String = "avro" 10 | } 11 | case object Csv extends SchemaType { 12 | override val `type`: String = "csv" 13 | } 14 | case object Json extends SchemaType { 15 | override val `type`: String = "json" 16 | } 17 | case object ParquetAvro extends SchemaType { 18 | override val `type`: String = "parquet_avro" 19 | } 20 | case object ParquetCsv extends SchemaType { 21 | override val `type`: String = "parquet_csv" 22 | } 23 | case object ParquetJson extends SchemaType { 24 | override val `type`: String = "parquet_json" 25 | } 26 | val supportedTypes = List(Avro, Csv, Json, ParquetAvro, ParquetCsv, ParquetJson) 27 | } 28 | 29 | object Schemer { 30 | def from(`type`: String, config: String): SchemaLike = `type` match { 31 | case "avro" => AvroSchema(config) 32 | case "csv" => CSVSchema(config) 33 | case "json" => JSONSchema(config) 34 | case "parquet_avro" => ParquetSchema(config, ParquetSchemaType.Avro) 35 | case "parquet_csv" => ParquetSchema(config, ParquetSchemaType.Csv) 36 | case "parquet_json" => ParquetSchema(config, ParquetSchemaType.Json) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/utils/JSONUtil.scala: -------------------------------------------------------------------------------- 1 | package schemer.utils 2 | 3 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} 4 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 5 | import com.fasterxml.jackson.annotation.JsonInclude.Include 6 | 7 | import scala.reflect.ClassTag 8 | 9 | private[schemer] object JSONUtil { 10 | private val mapper = new ObjectMapper() 11 | 12 | mapper.registerModule(DefaultScalaModule) 13 | 14 | mapper.setSerializationInclusion(Include.NON_NULL) 15 | 16 | def toJson(value: Any) = mapper.writeValueAsString(value) 17 | 18 | def prettyJson(value: Any) = mapper.enable(SerializationFeature.INDENT_OUTPUT).writeValueAsString(value) 19 | 20 | def fromJson[T: ClassTag](json: String) = { 21 | val classType = implicitly[ClassTag[T]].runtimeClass.asInstanceOf[Class[T]] 22 | mapper.readValue[T](json, classType) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /schemer-core/src/main/scala/schemer/utils/JsonSchemaValidationUtil.scala: -------------------------------------------------------------------------------- 1 | package schemer.utils 2 | 3 | import com.github.fge.jsonschema.core.report.ProcessingReport 4 | import scala.collection.JavaConverters._ 5 | 6 | object JsonSchemaValidationUtil { 7 | def process(report: ProcessingReport): List[String] = 8 | if (!report.isSuccess) { 9 | getErrorsFromReport(report) 10 | } else { 11 | List.empty 12 | } 13 | 14 | private def getErrorsFromReport(report: ProcessingReport) = { 15 | val errorList = report.iterator.asScala.toList 16 | .map { message => 17 | message.asJson() 18 | } 19 | .filter { json => 20 | json.get("level").asText == "error" 21 | } 22 | .map { json => 23 | json.get("message").asText 24 | } 25 | errorList 26 | } 27 | } 28 | 
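// Illustrative sketch (not part of the original file): validating the syntax of a JSON Schema string
// and collecting error messages. Assumes a schema string `schemaJson`; this is the same validation
// path used by JSONSchema.validate above.
//
//   val node   = com.github.fge.jackson.JsonLoader.fromString(schemaJson)
//   val report = com.github.fge.jsonschema.main.JsonSchemaFactory.byDefault().getSyntaxValidator.validateSchema(node)
//   val errors = JsonSchemaValidationUtil.process(report) // empty list when the schema syntax is valid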
-------------------------------------------------------------------------------- /schemer-core/src/test/resources/test.csv: -------------------------------------------------------------------------------- 1 | title,url,storeId 2 | iphone,http://indix.com/iphone,42 3 | galaxy,http://indix.com/galaxy,43 4 | lumia,http://indix.com/lumia,44 -------------------------------------------------------------------------------- /schemer-core/src/test/resources/test.json: -------------------------------------------------------------------------------- 1 | {"title": "iphone", "url": "http://indix.com/iphone", "imageUrls": ["http://indix.com/iphone.jpg"], "storeId": 42, "price": {"min": 10.0, "max": 100.0 }, "isAvailable": false} -------------------------------------------------------------------------------- /schemer-core/src/test/resources/test.tsv: -------------------------------------------------------------------------------- 1 | title url storeId 2 | iphone http://indix.com/iphone 42 3 | galaxy http://indix.com/galaxy 43 4 | lumia http://indix.com/lumia 44 -------------------------------------------------------------------------------- /schemer-core/src/test/scala/schemer/AvroSchemaSpec.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import com.databricks.spark.avro._ 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.types.{IntegerType, StringType} 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | import org.scalatest.{FlatSpec, Matchers} 8 | 9 | class AvroSchemaSpec extends FlatSpec with Matchers { 10 | implicit val spark: SparkSession = SparkSession.builder 11 | .config(new SparkConf()) 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | "AvroSchema" should "infer avro schema from given path" in { 16 | import spark.implicits._ 17 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF 18 | 19 | try { 20 | df.write.mode(SaveMode.Overwrite).avro("test") 21 | val schema = AvroSchema().infer("test") 22 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be( 23 | "{\n \"type\" : \"record\",\n \"name\" : \"SchemerInferred\",\n \"namespace\" : \"schemer\",\n \"fields\" : [ {\n \"name\" : \"title\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"url\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"storeId\",\n \"type\" : [ \"int\", \"null\" ]\n } ]\n}" 24 | ) 25 | } finally { 26 | Helpers.cleanOutputPath("test") 27 | } 28 | } 29 | 30 | it should "get spark schema" in { 31 | val schema = AvroSchema( 32 | "{\n \"type\" : \"record\",\n \"name\" : \"SchemerInferred\",\n \"namespace\" : \"schemer\",\n \"fields\" : [ {\n \"name\" : \"title\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"url\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"storeId\",\n \"type\" : [ \"int\", \"null\" ]\n } ]\n}" 33 | ) 34 | val schemaFields = schema.sparkSchema().fields 35 | schemaFields.length should be(3) 36 | 37 | schemaFields(0).name should be("title") 38 | schemaFields(0).dataType should be(StringType) 39 | 40 | schemaFields(1).name should be("url") 41 | schemaFields(1).dataType should be(StringType) 42 | 43 | schemaFields(2).name should be("storeId") 44 | schemaFields(2).dataType should be(IntegerType) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /schemer-core/src/test/scala/schemer/CSVSchemaSpec.scala: 
-------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | import org.scalatest._ 7 | 8 | import scala.util.Try 9 | 10 | class CSVSchemaSpec extends FlatSpec with Matchers { 11 | implicit val spark: SparkSession = SparkSession.builder 12 | .config(new SparkConf()) 13 | .master("local[*]") 14 | .getOrCreate() 15 | 16 | "CSVSchema" should "infer schema from given path" in { 17 | val path = getClass.getClassLoader.getResource("test.csv").getPath 18 | 19 | val inferredSchema = CSVSchema().infer(path) 20 | val fields = inferredSchema.fields 21 | 22 | fields.length should be(3) 23 | fields(0).name should be("title") 24 | fields(0).`type` should be("string") 25 | 26 | fields(1).name should be("url") 27 | fields(1).`type` should be("string") 28 | 29 | fields(2).name should be("storeId") 30 | fields(2).`type` should be("int") 31 | } 32 | 33 | it should "infer schema without header from file" in { 34 | val path = getClass.getClassLoader.getResource("test.csv").getPath 35 | 36 | val inferredSchema = CSVSchema(CSVOptions(false)).infer(path) 37 | val fields = inferredSchema.fields 38 | 39 | fields.length should be(3) 40 | fields(0).name should be("_c0") 41 | fields(0).`type` should be("string") 42 | 43 | fields(1).name should be("_c1") 44 | fields(1).`type` should be("string") 45 | 46 | fields(2).name should be("_c2") 47 | fields(2).`type` should be("string") 48 | } 49 | 50 | it should "infer schema and read" in { 51 | val path = getClass.getClassLoader.getResource("test.csv").getPath 52 | 53 | val inferredSchema = CSVSchema().infer(path) 54 | import spark.implicits._ 55 | val output = inferredSchema.toDf(path).as[TestRecord].collect() 56 | 57 | output.length should be(3) 58 | output(0).title should be("iphone") 59 | output(0).url should be("http://indix.com/iphone") 60 | output(0).storeId should be(42) 61 | } 62 | 63 | it should "infer schema and read from TSV" in { 64 | val path = getClass.getClassLoader.getResource("test.tsv").getPath 65 | 66 | val inferredSchema = CSVSchema(CSVOptions(headerBasedParser = true, separator = "\t")).infer(path) 67 | import spark.implicits._ 68 | val output = inferredSchema.toDf(path).as[TestRecord].collect() 69 | 70 | output.length should be(3) 71 | output(0).title should be("iphone") 72 | output(0).url should be("http://indix.com/iphone") 73 | output(0).storeId should be(42) 74 | } 75 | 76 | it should "infer schema and get schema json" in { 77 | val path = getClass.getClassLoader.getResource("test.csv").getPath 78 | 79 | val inferredSchema = CSVSchema().infer(path) 80 | 81 | inferredSchema.schema() should be( 82 | "{\"fields\":[{\"name\":\"title\",\"nullable\":true,\"type\":\"string\",\"position\":0},{\"name\":\"url\",\"nullable\":true,\"type\":\"string\",\"position\":1},{\"name\":\"storeId\",\"nullable\":true,\"type\":\"int\",\"position\":2}],\"options\":{\"header\":true,\"headerBasedParser\":false,\"separator\":\",\",\"quoteChar\":\"\\\"\",\"escapeChar\":\"\\\\\"}}" 83 | ) 84 | } 85 | 86 | it should "get schema from json" in { 87 | val schema = CSVSchema( 88 | 
"{\"fields\":[{\"name\":\"title\",\"nullable\":true,\"type\":\"string\",\"position\":0},{\"name\":\"url\",\"nullable\":true,\"type\":\"string\",\"position\":1},{\"name\":\"storeId\",\"nullable\":true,\"type\":\"int\",\"position\":2}],\"options\":{\"header\":true,\"headerBasedParser\":false,\"separator\":\",\",\"quoteChar\":\"\\\"\",\"escapeChar\":\"\\\\\"}}" 89 | ) 90 | 91 | schema.sparkSchema() should be( 92 | StructType( 93 | Seq( 94 | StructField("title", StringType, true), 95 | StructField("url", StringType, true), 96 | StructField("storeId", IntegerType, true) 97 | ) 98 | ) 99 | ) 100 | } 101 | 102 | it should "handle empty fields" in { 103 | val schema = CSVSchema( 104 | "{\"fields\":[], \"options\": {}}" 105 | ) 106 | 107 | schema.sparkSchema() should be( 108 | StructType(List()) 109 | ) 110 | } 111 | 112 | it should "handle error parsing json" in { 113 | Try(CSVSchema("{}")).failed.get.getMessage should startWith("Missing required creator property 'fields'") 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /schemer-core/src/test/scala/schemer/Helpers.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import java.net.URI 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs.{FileSystem, Path} 7 | 8 | case class TestRecord(title: String, url: String, storeId: Int) 9 | 10 | object Helpers { 11 | 12 | def cleanOutputPath(output: String) { 13 | val outputPath = new Path(output) 14 | if (fileExists(output)) 15 | outputPath.getFileSystem(new Configuration()).delete(outputPath, true) 16 | } 17 | 18 | def fileExists(fileLocation: String) = { 19 | val fs = FileSystem.get(new URI(fileLocation), new Configuration()) 20 | fs.exists(new Path(fileLocation)) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /schemer-core/src/test/scala/schemer/JSONSchemaSpec.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types._ 6 | import org.scalatest.{FlatSpec, Matchers} 7 | 8 | class JSONSchemaSpec extends FlatSpec with Matchers { 9 | implicit val spark: SparkSession = SparkSession.builder 10 | .config(new SparkConf()) 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | "JSONSchema" should "infer json schema" in { 15 | val path = getClass.getClassLoader.getResource("test.json").getPath 16 | 17 | val inferredSchema = JSONSchema().infer(path) 18 | inferredSchema.schema should be( 19 | "{\"type\":\"object\",\"properties\":{\"imageUrls\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}},\"url\":{\"type\":\"string\"},\"price\":{\"type\":\"object\",\"properties\":{\"max\":{\"type\":\"number\"},\"min\":{\"type\":\"number\"}},\"additionalProperties\":false},\"storeId\":{\"type\":\"integer\"},\"isAvailable\":{\"type\":\"boolean\"},\"title\":{\"type\":\"string\"}},\"additionalProperties\":false}" 20 | ) 21 | 22 | val fields = inferredSchema.sparkSchema().fields 23 | fields.length should be(6) 24 | fields.map(f => (f.name, f.dataType)) should contain allElementsOf List( 25 | ("title", StringType), 26 | ("url", StringType), 27 | ("storeId", LongType), 28 | ("price", StructType(Seq(StructField("max", DoubleType), StructField("min", DoubleType)))), 29 | ("isAvailable", BooleanType), 30 | ("imageUrls", ArrayType(StringType)) 31 | ) 32 | } 33 | } 
34 | -------------------------------------------------------------------------------- /schemer-core/src/test/scala/schemer/ParquetSchemaSpec.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.{SaveMode, SparkSession} 5 | import org.scalatest.{FlatSpec, Matchers} 6 | 7 | class ParquetSchemaSpec extends FlatSpec with Matchers { 8 | implicit val spark: SparkSession = SparkSession.builder 9 | .config(new SparkConf()) 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | "ParquetSchema" should "infer avro schema" in { 14 | import spark.implicits._ 15 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF 16 | 17 | val dataDir = "test_parquet_avro" 18 | 19 | try { 20 | df.write.mode(SaveMode.Overwrite).parquet(dataDir) 21 | val schema = ParquetSchema[AvroSchema]().infer(dataDir) 22 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be( 23 | "{\n \"type\" : \"record\",\n \"name\" : \"SchemerInferred\",\n \"namespace\" : \"schemer\",\n \"fields\" : [ {\n \"name\" : \"title\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"url\",\n \"type\" : [ \"string\", \"null\" ]\n }, {\n \"name\" : \"storeId\",\n \"type\" : [ \"int\", \"null\" ]\n } ]\n}" 24 | ) 25 | } finally { 26 | Helpers.cleanOutputPath(dataDir) 27 | } 28 | } 29 | 30 | it should "infer json schema" in { 31 | import spark.implicits._ 32 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF 33 | 34 | val dataDir = "test_parquet_json" 35 | 36 | try { 37 | df.write.mode(SaveMode.Overwrite).parquet(dataDir) 38 | val schema = ParquetSchema[JSONSchema]().infer(dataDir) 39 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be( 40 | "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"},\"url\":{\"type\":\"string\"},\"storeId\":{\"type\":\"integer\"}},\"additionalProperties\":false}" 41 | ) 42 | } finally { 43 | Helpers.cleanOutputPath(dataDir) 44 | } 45 | } 46 | 47 | it should "infer csv schema" in { 48 | import spark.implicits._ 49 | val df = Seq(TestRecord("iphone", "http://indix.com/iphone", 42)).toDF 50 | 51 | val dataDir = "test_parquet_csv" 52 | 53 | try { 54 | df.write.mode(SaveMode.Overwrite).parquet(dataDir) 55 | val schema = ParquetSchema[CSVSchema]().infer(dataDir) 56 | schema.schema.replaceAll("SchemerInferred_[^\"]+", "SchemerInferred") should be( 57 | "{\"fields\":[{\"name\":\"title\",\"nullable\":true,\"type\":\"string\",\"position\":0},{\"name\":\"url\",\"nullable\":true,\"type\":\"string\",\"position\":1},{\"name\":\"storeId\",\"nullable\":true,\"type\":\"int\",\"position\":2}],\"options\":{\"header\":true,\"headerBasedParser\":true,\"separator\":\",\",\"quoteChar\":\"\\\"\",\"escapeChar\":\"\\\\\"}}" 58 | ) 59 | } finally { 60 | Helpers.cleanOutputPath(dataDir) 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /schemer-registry/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | http { 3 | server { 4 | request-timeout = 90s 5 | idle-timeout = 120s 6 | } 7 | } 8 | } 9 | 10 | registry { 11 | server { 12 | host = "0.0.0.0" 13 | port = 9000 14 | port = ${?SCHEMER_REGISTRY_PORT} 15 | } 16 | inference { 17 | timeout = 60s 18 | } 19 | h2 { 20 | dataSourceClassName = "org.h2.jdbcx.JdbcDataSource" 21 | dataSource { 22 | url = "jdbc:h2:mem:registry" 23 | } 
24 | } 25 | postgres { 26 | url = "postgresql://localhost:5432/schemer?user=schemer&password=schemer" 27 | url = ${?POSTGRES_URL} 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /schemer-registry/src/main/resources/aws-core-site.xml: -------------------------------------------------------------------------------- [XML markup stripped in this dump; the file is a Hadoop core-site configuration that maps fs.s3.impl and fs.s3n.impl to org.apache.hadoop.fs.s3a.S3AFileSystem, and fs.AbstractFileSystem.s3.impl and fs.AbstractFileSystem.s3n.impl to org.apache.hadoop.fs.s3a.S3A] -------------------------------------------------------------------------------- /schemer-registry/src/main/resources/db/migration/V1__creates_schemas.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; 2 | 3 | CREATE TABLE "namespaces"( 4 | "id" UUID NOT NULL DEFAULT uuid_generate_v4(), 5 | "name" VARCHAR NOT NULL 6 | ); 7 | 8 | ALTER TABLE "namespaces" ADD CONSTRAINT "namespaces_id" PRIMARY KEY("id"); 9 | CREATE UNIQUE INDEX "namespaces_name" ON "namespaces"("name"); 10 | 11 | INSERT INTO "namespaces"("name") VALUES('default'); 12 | 13 | CREATE TABLE "schemas"( 14 | "id" UUID NOT NULL DEFAULT uuid_generate_v4(), 15 | "name" VARCHAR NOT NULL, 16 | "namespace" VARCHAR NOT NULL, 17 | "type" VARCHAR NOT NULL, 18 | "created_on" TIMESTAMP WITH TIME ZONE NOT NULL, 19 | "created_by" VARCHAR NOT NULL 20 | 21 | ); 22 | 23 | ALTER TABLE "schemas" ADD CONSTRAINT "schemas_id" PRIMARY KEY("id"); 24 | CREATE UNIQUE INDEX "schemas_name_namespace" ON "schemas"("name","namespace"); 25 | ALTER TABLE "schemas" ADD CONSTRAINT "schemas_namespace_fk" FOREIGN KEY("namespace") REFERENCES "namespaces"("name"); 26 | 27 | CREATE TABLE "schema_versions" ( 28 | "id" UUID NOT NULL DEFAULT uuid_generate_v4(), 29 | "schema_id" UUID NOT NULL, 30 | "version" VARCHAR NOT NULL, 31 | "schema" VARCHAR NOT NULL, 32 | "created_on" TIMESTAMP WITH TIME ZONE NOT NULL, 33 | "created_by" VARCHAR NOT NULL 34 | ); 35 | ALTER TABLE "schema_versions" ADD CONSTRAINT "schema_versions_id" PRIMARY KEY("id"); 36 | CREATE UNIQUE INDEX "schema_versions_version" ON "schema_versions"("schema_id", "version"); 37 | ALTER TABLE "schema_versions" ADD CONSTRAINT "schema_versions_schema_fk" FOREIGN KEY("schema_id") REFERENCES "schemas"("id"); -------------------------------------------------------------------------------- /schemer-registry/src/main/resources/graphql/graphiql.html: -------------------------------------------------------------------------------- [HTML markup stripped in this dump; the file is the static GraphiQL explorer page served on GET /graphql by GraphQLRoutes]
-------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/actors/InferActor.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.actors 2 | 3 | import akka.actor.{Actor, ActorRef, ActorSystem, Status} 4 | import akka.event.Logging 5 | import akka.util.Timeout 6 | import org.apache.spark.sql.SparkSession 7 | import schemer._ 8 | import schemer.registry.exceptions.SchemerInferenceException 9 | 10 | import scala.concurrent.Future 11 | import scala.util.{Failure, Random, Success} 12 | 13 | case class JSONSchemaInferenceRequest(paths: Seq[String]) 14 | case class AvroSchemaInferenceRequest(paths: Seq[String]) 15 | case class ParquetSchemaInferenceRequest(`type`: String, paths: Seq[String]) 16 | case class CSVSchemaInferenceRequest(options: CSVOptions, paths: Seq[String]) 17 | 18 | class InferActor( 19 | implicit val spark: SparkSession, 20 | implicit val system: ActorSystem, 21 | implicit val inferTimeout: Timeout 22 | ) extends Actor { 23 | import context.dispatcher 24 | val logger = Logging(context.system, this) 25 | 26 | def receive = { 27 | case JSONSchemaInferenceRequest(paths) => 28 | inferSchema(sender()) { 29 | JSONSchema().infer(paths: _*) 30 | } 31 | case AvroSchemaInferenceRequest(paths) => 32 | inferSchema(sender()) { 33 | AvroSchema().infer(paths: _*) 34 | } 35 | case CSVSchemaInferenceRequest(options, paths) => 36 | inferSchema(sender()) { 37 | CSVSchema(options).infer(paths: _*) 38 | } 39 | case ParquetSchemaInferenceRequest(t, paths) => 40 | inferSchema(sender()) { 41 | ParquetSchema(t).infer(paths: _*) 42 | } 43 | case _ => logger.info("Unsupported infer request") 44 | } 45 | 46 | def inferSchema(sender: ActorRef)(block: => Any) = { 47 | val jobGroup = Random.alphanumeric take 12 mkString "" 48 | logger.info(s"Starting inference for jobGroup $jobGroup") 49 | 50 | val inferFuture = Future { 51 | spark.sparkContext.setJobGroup(jobGroup, jobGroup, true) 52 | block 53 | } recoverWith { 54 | case ex => 55 | logger.info(s"Inference for jobGroup $jobGroup failed - ${ex.getMessage}") 56 | Future.failed(SchemerInferenceException(ex.getMessage)) 57 | } 58 | 59 | inferFuture onComplete { 60 | case Success(r) => 61 | logger.info(s"Completing inference for jobGroup $jobGroup") 62 | sender ! r 63 | case Failure(f) => 64 | sender ! 
Status.Failure(f) 65 | } 66 | 67 | system.scheduler.scheduleOnce(inferTimeout.duration) { 68 | logger.info(s"Cancelling jobGroup $jobGroup") 69 | spark.sparkContext.cancelJobGroup(jobGroup) 70 | } 71 | 72 | } 73 | 74 | override def preStart(): Unit = 75 | logger.info(s"Starting infer actor") 76 | } 77 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/dao/SchemaDao.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.dao 2 | 3 | import java.util.UUID 4 | 5 | import org.joda.time.DateTime 6 | import schemer.registry.models.{Schema, SchemaVersion} 7 | import schemer.registry.sql.SqlDatabase 8 | 9 | import scala.concurrent.{ExecutionContext, Future} 10 | 11 | case class PaginatedFilter( 12 | id: Option[UUID], 13 | first: Option[Int], 14 | after: Option[DateTime], 15 | last: Option[Int], 16 | before: Option[DateTime] 17 | ) { 18 | def take = (last orElse first).filter(_ <= 10).getOrElse(10) + 1 19 | } 20 | 21 | class SchemaDao(val db: SqlDatabase)(implicit val ec: ExecutionContext) { 22 | import db.ctx._ 23 | 24 | val schemas = quote(querySchema[Schema]("schemas")) 25 | def find(id: UUID) = run(schemas.filter(c => c.id == lift(id))).map(_.headOption) 26 | def create(schema: Schema): Future[UUID] = run(schemas.insert(lift(schema)).returning(_.id)) 27 | def all() = run(schemas) 28 | 29 | val schemaVersions = quote(querySchema[SchemaVersion]("schema_versions")) 30 | 31 | def createVersion(schemaVersion: SchemaVersion): Future[UUID] = 32 | run(schemaVersions.insert(lift(schemaVersion)).returning(_.id)) 33 | 34 | def findFirstVersions(filter: PaginatedFilter) = { 35 | val query = quote { 36 | applyCursors(lift(filter)).sortBy(_.createdOn)(Ord.descNullsLast).take(lift(filter.take)) 37 | } 38 | 39 | run(query) 40 | } 41 | 42 | def findLastVersions(filter: PaginatedFilter) = { 43 | val query = quote { 44 | applyCursors(lift(filter)).sortBy(_.createdOn)(Ord.ascNullsLast).take(lift(filter.take)) 45 | } 46 | 47 | run(query) 48 | } 49 | 50 | private def applyCursors = 51 | quote { (filter: PaginatedFilter) => 52 | schemaVersions 53 | .filter( 54 | (version: SchemaVersion) => 55 | filter.id.forall(_ == version.schemaId) 56 | && filter.after > version.createdOn 57 | && filter.before < version.createdOn 58 | ) 59 | } 60 | 61 | def findLatestVersion(id: UUID) = { 62 | val query = quote { 63 | schemaVersions 64 | .filter(_.schemaId == lift(id)) 65 | .filter { v1 => 66 | schemaVersions 67 | .filter(_.schemaId == lift(id)) 68 | .filter { v2 => 69 | v1.id != v2.id && v1.createdOn < v2.createdOn 70 | } 71 | .isEmpty 72 | } 73 | } 74 | 75 | run(query).map(_.headOption) 76 | } 77 | 78 | def findVersion(id: UUID, version: String) = { 79 | val query = quote { 80 | schemaVersions.filter(_.version == lift(version)).filter(_.schemaId == lift(id)) 81 | } 82 | 83 | run(query).map(_.headOption) 84 | } 85 | 86 | def findVersion(id: UUID) = run(schemaVersions.filter(c => c.id == lift(id))).map(_.headOption) 87 | } 88 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerException.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.exceptions 2 | 3 | class SchemerException(message: String) extends Exception(message) 4 | -------------------------------------------------------------------------------- 
/schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerInferenceException.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.exceptions 2 | 3 | case class SchemerInferenceException(message: String) 4 | extends SchemerException(s"Error while trying to infer schema - $message") 5 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerSchemaCreationException.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.exceptions 2 | 3 | case class SchemerSchemaCreationException(message: String) 4 | extends SchemerException(s"Error while trying to create new schema - $message") 5 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/exceptions/SchemerSchemaVersionCreationException.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.exceptions 2 | 3 | case class SchemerSchemaVersionCreationException(message: String) 4 | extends SchemerException(s"Error while trying to create new schema version - $message") 5 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/CustomGraphQLResolver.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql 2 | 3 | import java.util.UUID 4 | 5 | import sangria.execution.deferred.{Deferred, DeferredResolver} 6 | import schemer._ 7 | import schemer.registry.models.{SchemaSchemaVersionConnection, SchemaVersion} 8 | 9 | import scala.concurrent.ExecutionContext 10 | 11 | case class InferCSVSchemaDeferred(options: CSVOptions, paths: Seq[String]) extends Deferred[CSVSchema] 12 | case class InferJSONSchemaDeferred(paths: Seq[String]) extends Deferred[JSONSchema] 13 | case class InferParquetSchemaDeferred(`type`: String, paths: Seq[String]) extends Deferred[ParquetSchema] 14 | case class InferAvroSchemaDeferred(paths: Seq[String]) extends Deferred[AvroSchema] 15 | 16 | case class SchemaVersionsDeferred( 17 | id: UUID, 18 | first: Option[Int], 19 | after: Option[String], 20 | last: Option[Int], 21 | before: Option[String] 22 | ) extends Deferred[Seq[SchemaSchemaVersionConnection]] 23 | case class SchemaVersionLatestDeferred(id: UUID) extends Deferred[Option[SchemaVersion]] 24 | 25 | class CustomGraphQLResolver extends DeferredResolver[GraphQLService] { 26 | override def resolve(deferred: Vector[Deferred[Any]], ctx: GraphQLService, queryState: Any)( 27 | implicit ec: ExecutionContext 28 | ) = { 29 | val defMap = deferred.collect { 30 | case InferCSVSchemaDeferred(options, paths) => "csvSchemaInference" -> ctx.inferCSVSchema(options, paths) 31 | case InferJSONSchemaDeferred(paths) => "jsonSchemaInference" -> ctx.inferJSONSchema(paths) 32 | case InferParquetSchemaDeferred(t, paths) => "parquetSchemaInference" -> ctx.inferParquetSchema(t, paths) 33 | case InferAvroSchemaDeferred(paths) => "avroSchemaInference" -> ctx.inferAvroSchema(paths) 34 | case SchemaVersionsDeferred(id, first, after, last, before) => 35 | "schemaVersions" -> ctx.schemaVersions(id, first, after, last, before) 36 | case SchemaVersionLatestDeferred(id) => "schemaVersionLatest" -> ctx.latestSchemaVersion(id) 37 | } 38 | 39 | deferred flatMap { 40 | case InferCSVSchemaDeferred(_, _) => 
defMap.filter(_._1 == "csvSchemaInference").map(_._2) 41 | case InferJSONSchemaDeferred(_) => defMap.filter(_._1 == "jsonSchemaInference").map(_._2) 42 | case InferParquetSchemaDeferred(_, _) => defMap.filter(_._1 == "parquetSchemaInference").map(_._2) 43 | case InferAvroSchemaDeferred(_) => defMap.filter(_._1 == "avroSchemaInference").map(_._2) 44 | case SchemaVersionsDeferred(_, _, _, _, _) => defMap.filter(_._1 == "schemaVersions").map(_._2) 45 | case SchemaVersionLatestDeferred(_) => defMap.filter(_._1 == "schemaVersionLatest").map(_._2) 46 | } 47 | } 48 | } 49 | 50 | object CustomGraphQLResolver { 51 | val deferredResolver: DeferredResolver[GraphQLService] = 52 | DeferredResolver.fetchersWithFallback( 53 | new CustomGraphQLResolver 54 | ) 55 | } 56 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/GraphQLService.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql 2 | 3 | import java.util.UUID 4 | 5 | import akka.actor.{ActorRef, ActorSystem} 6 | import akka.pattern.{ask, AskTimeoutException} 7 | import akka.util.Timeout 8 | import com.github.mauricio.async.db.postgresql.exceptions.GenericDatabaseException 9 | import org.apache.spark.sql.SparkSession 10 | import sangria.macros.derive.GraphQLField 11 | import schemer._ 12 | import schemer.registry.Cursor 13 | import schemer.registry.actors._ 14 | import schemer.registry.dao.{PaginatedFilter, SchemaDao} 15 | import schemer.registry.exceptions.{ 16 | SchemerException, 17 | SchemerInferenceException, 18 | SchemerSchemaCreationException, 19 | SchemerSchemaVersionCreationException 20 | } 21 | import schemer.registry.models._ 22 | import schemer.registry.utils.Clock 23 | 24 | import scala.concurrent.{ExecutionContext, Future} 25 | import scala.language.postfixOps 26 | 27 | class GraphQLService( 28 | schemaDao: SchemaDao, 29 | inferActor: ActorRef 30 | )( 31 | implicit val spark: SparkSession, 32 | implicit val clock: Clock, 33 | implicit val ec: ExecutionContext, 34 | implicit val system: ActorSystem, 35 | implicit val inferActorTimeout: Timeout 36 | ) { 37 | 38 | def inferCSVSchema(options: CSVOptions, paths: Seq[String]) = 39 | inferWithActor(CSVSchemaInferenceRequest(options, paths)) 40 | 41 | def inferJSONSchema(paths: Seq[String]) = 42 | inferWithActor(JSONSchemaInferenceRequest(paths)) 43 | 44 | def inferParquetSchema(`type`: String, paths: Seq[String]) = 45 | inferWithActor(ParquetSchemaInferenceRequest(`type`, paths)) 46 | 47 | def inferAvroSchema(paths: Seq[String]) = 48 | inferWithActor(AvroSchemaInferenceRequest(paths)) 49 | 50 | @GraphQLField 51 | def addSchema(name: String, namespace: String, `type`: SchemaType, user: String) = 52 | schemaDao.create(Schema(name, namespace, `type`.`type`, clock.nowUtc, user)).recoverWith { 53 | case ex: GenericDatabaseException => 54 | Future.failed(SchemerSchemaCreationException(ex.asInstanceOf[GenericDatabaseException].errorMessage.message)) 55 | case ex => 56 | Future.failed(SchemerSchemaCreationException(ex.getMessage)) 57 | } 58 | 59 | @GraphQLField 60 | def addSchemaVersion(schemaId: UUID, version: String, schemaConfig: String, user: String) = 61 | schemaDao 62 | .find(schemaId) 63 | .flatMap { 64 | case Some(schema) => 65 | val errors = Schemer.from(schema.`type`, schemaConfig).validate 66 | if (errors.isEmpty) { 67 | schemaDao.createVersion(SchemaVersion(null, schema.id, version, schemaConfig, clock.nowUtc, user)) 68 | } else { 
69 | Future.failed( 70 | SchemerSchemaVersionCreationException( 71 | s"Error(s) validating schema config - ${errors.mkString("[", ", ", "]")}" 72 | ) 73 | ) 74 | } 75 | case None => Future.failed(SchemerSchemaVersionCreationException(s"Schema with id $schemaId not found")) 76 | } 77 | .recoverWith { 78 | case ex: GenericDatabaseException => 79 | Future.failed( 80 | SchemerSchemaVersionCreationException(ex.asInstanceOf[GenericDatabaseException].errorMessage.message) 81 | ) 82 | case ex => 83 | Future.failed(SchemerSchemaVersionCreationException(ex.getMessage)) 84 | } 85 | 86 | def allSchemas = schemaDao.all() 87 | 88 | def schema(id: UUID) = schemaDao.find(id) 89 | 90 | def schemaVersion(id: UUID) = schemaDao.findVersion(id) 91 | 92 | def schemaVersions(id: UUID, first: Option[Int], after: Option[Cursor], last: Option[Int], before: Option[Cursor]) = 93 | if (first.nonEmpty && last.nonEmpty) { 94 | Future.failed(new SchemerException("Both first and last cannot be specified")) 95 | } else { 96 | import schemer.registry.utils.DateTimeUtils._ 97 | val filter = 98 | PaginatedFilter( 99 | Some(id), 100 | first, 101 | after.map(_.toDateTime), 102 | last, 103 | before.map(_.toDateTime) 104 | ) 105 | 106 | last 107 | .fold(schemaDao.findFirstVersions(filter))(_ => schemaDao.findLastVersions(filter)) 108 | .map { versions => 109 | val pageInfo: PageInfo = buildPageInfo(first, last, versions.length) 110 | val finalVersions = Option(pageInfo.hasMore).filter(identity).fold(versions)(_ => versions.dropRight(1)) 111 | SchemaSchemaVersionConnection( 112 | pageInfo, 113 | finalVersions.map { version => 114 | SchemaSchemaVersionEdge(version.createdOn.toCursor, version) 115 | } 116 | ) 117 | } 118 | } 119 | 120 | private def buildPageInfo(first: Option[Int], last: Option[Int], count: Int) = 121 | PageInfo(first.exists(count > _), last.exists(count > _)) 122 | 123 | def latestSchemaVersion(id: UUID) = schemaDao.findLatestVersion(id) 124 | 125 | def inferWithActor(message: Any) = 126 | (inferActor ? 
message).recoverWith { 127 | case ex: SchemerInferenceException => 128 | Future.failed(ex) 129 | case _: AskTimeoutException => 130 | Future.failed(SchemerInferenceException("Timeout while trying to infer schema")) 131 | case ex => 132 | Future.failed(SchemerInferenceException(ex.getMessage)) 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/schema/GraphQLCustomTypes.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql.schema 2 | 3 | import java.util.UUID 4 | 5 | import org.joda.time.format.ISODateTimeFormat 6 | import org.joda.time.{DateTime, DateTimeZone} 7 | import sangria.ast 8 | import sangria.schema.ScalarType 9 | import sangria.validation.ValueCoercionViolation 10 | 11 | import scala.util.{Failure, Success, Try} 12 | 13 | trait GraphQLCustomTypes { 14 | case object DateCoercionViolation extends ValueCoercionViolation("Date value expected") 15 | 16 | def parseDate(s: String) = Try(new DateTime(s, DateTimeZone.UTC)) match { 17 | case Success(date) => Right(date) 18 | case Failure(_) => Left(DateCoercionViolation) 19 | } 20 | 21 | def parseUUID(s: String) = Try(UUID.fromString(s)) match { 22 | case Success(uuid) => Right(uuid) 23 | case Failure(_) => Left(DateCoercionViolation) 24 | } 25 | 26 | implicit val DateTimeType = ScalarType[DateTime]( 27 | "DateTime", 28 | coerceOutput = (date: DateTime, _) => ast.StringValue(ISODateTimeFormat.dateTime().print(date)), 29 | coerceUserInput = { 30 | case s: String => parseDate(s) 31 | case _ => Left(DateCoercionViolation) 32 | }, 33 | coerceInput = { 34 | case ast.StringValue(s, _, _) => parseDate(s) 35 | case _ => Left(DateCoercionViolation) 36 | } 37 | ) 38 | 39 | implicit val UUIDType = ScalarType[UUID]( 40 | "UUID", 41 | coerceOutput = (uuid: UUID, _) => ast.StringValue(uuid.toString), 42 | coerceUserInput = { 43 | case s: String => parseUUID(s) 44 | case _ => Left(DateCoercionViolation) 45 | }, 46 | coerceInput = { 47 | case ast.StringValue(s, _, _) => parseUUID(s) 48 | case _ => Left(DateCoercionViolation) 49 | } 50 | ) 51 | } 52 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/schema/InferType.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql.schema 2 | 3 | import sangria.schema._ 4 | import schemer.registry.graphql.schema.SchemaDefinition.constantComplexity 5 | import sangria.macros.derive.{deriveInputObjectType, deriveObjectType, InputObjectTypeName} 6 | import schemer.registry.graphql._ 7 | import spray.json.DefaultJsonProtocol 8 | import sangria.marshalling.sprayJson._ 9 | import schemer._ 10 | 11 | trait JSONSchemaType { 12 | implicit val JSONSchemaType = ObjectType( 13 | "JSONSchema", 14 | "JSON Schema", 15 | fields[Unit, JSONSchema]( 16 | Field( 17 | "schema", 18 | StringType, 19 | description = Some("CSV Schema as JSON string"), 20 | complexity = constantComplexity(10), 21 | resolve = ctx => ctx.value.schema 22 | ), 23 | Field( 24 | "sparkSchema", 25 | StringType, 26 | description = Some("Spark Schema as JSON string"), 27 | complexity = constantComplexity(100), 28 | resolve = ctx => ctx.value.sparkSchema().prettyJson 29 | ) 30 | ) 31 | ) 32 | } 33 | 34 | trait InferType extends JSONSchemaType with DefaultJsonProtocol { 35 | lazy implicit val TypeArg = Argument("type", 
ParquetSchemaUnderlyingType) 36 | lazy implicit val PathsArg = Argument("paths", ListInputType(StringType)) 37 | implicit val CSVOptionsFormat = jsonFormat5(CSVOptions.apply) 38 | lazy implicit val CSVOptionsInputType = deriveInputObjectType[CSVOptions](InputObjectTypeName("CSVOptionsInput")) 39 | lazy implicit val CSVOptionsArg = Argument("csvOptions", OptionInputType(CSVOptionsInputType), CSVOptions()) 40 | 41 | lazy implicit val CSVFieldType = deriveObjectType[Unit, CSVField]() 42 | lazy implicit val CSVOptionsType = deriveObjectType[Unit, CSVOptions]() 43 | lazy val CSVSchemaType = ObjectType( 44 | "CSVSchema", 45 | "CSV Schema", 46 | fields[Unit, CSVSchema]( 47 | Field( 48 | "fields", 49 | ListType(CSVFieldType), 50 | description = Some("Fields of the CSV Schema"), 51 | complexity = constantComplexity(1), 52 | resolve = ctx => ctx.value.fields 53 | ), 54 | Field( 55 | "options", 56 | CSVOptionsType, 57 | description = Some("Options of the CSV Schema"), 58 | complexity = constantComplexity(1), 59 | resolve = ctx => ctx.value.options 60 | ), 61 | Field( 62 | "schema", 63 | StringType, 64 | description = Some("CSV Schema as JSON string"), 65 | complexity = constantComplexity(100), 66 | resolve = ctx => ctx.value.schema() 67 | ), 68 | Field( 69 | "sparkSchema", 70 | StringType, 71 | description = Some("Spark Schema as JSON string"), 72 | complexity = constantComplexity(100), 73 | resolve = ctx => ctx.value.sparkSchema().prettyJson 74 | ) 75 | ) 76 | ) 77 | 78 | lazy val ParquetSchemaUnderlyingType = EnumType( 79 | "ParquetSchemaType", 80 | Some("Supported schema types for Parquet"), 81 | List( 82 | EnumValue("Avro", value = schemer.ParquetSchemaType.Avro.`type`), 83 | EnumValue("Csv", value = schemer.ParquetSchemaType.Csv.`type`), 84 | EnumValue("Json", value = schemer.ParquetSchemaType.Json.`type`) 85 | ) 86 | ) 87 | 88 | lazy val ParquetSchemaType = ObjectType( 89 | "ParquetSchema", 90 | "Parquet Schema", 91 | fields[Unit, ParquetSchema]( 92 | Field( 93 | "type", 94 | ParquetSchemaUnderlyingType, 95 | description = Some("Parquet Schema type"), 96 | complexity = constantComplexity(10), 97 | resolve = ctx => ctx.value.`type`.`type` 98 | ), 99 | Field( 100 | "schema", 101 | StringType, 102 | description = Some("Parquet Schema as JSON string"), 103 | complexity = constantComplexity(10), 104 | resolve = ctx => ctx.value.schema 105 | ), 106 | Field( 107 | "sparkSchema", 108 | StringType, 109 | description = Some("Spark Schema as JSON string"), 110 | complexity = constantComplexity(100), 111 | resolve = ctx => ctx.value.sparkSchema().prettyJson 112 | ) 113 | ) 114 | ) 115 | 116 | lazy val AvroSchemaType = ObjectType( 117 | "AvroSchema", 118 | "Avro Schema", 119 | fields[Unit, AvroSchema]( 120 | Field( 121 | "schema", 122 | StringType, 123 | description = Some("Avro Schema as string"), 124 | complexity = constantComplexity(10), 125 | resolve = ctx => ctx.value.schema 126 | ), 127 | Field( 128 | "sparkSchema", 129 | StringType, 130 | description = Some("Spark Schema as JSON string"), 131 | complexity = constantComplexity(100), 132 | resolve = ctx => ctx.value.sparkSchema().prettyJson 133 | ) 134 | ) 135 | ) 136 | 137 | lazy val InferType = ObjectType( 138 | "Inference", 139 | "Schema Inference", 140 | fields[GraphQLService, Unit]( 141 | Field( 142 | "csv", 143 | CSVSchemaType, 144 | description = Some("CSV Schema inference"), 145 | complexity = constantComplexity(500), 146 | resolve = ctx => InferCSVSchemaDeferred(ctx arg CSVOptionsArg, ctx arg PathsArg), 147 | arguments = List(CSVOptionsArg, 
PathsArg) 148 | ), 149 | Field( 150 | "json", 151 | JSONSchemaType, 152 | description = Some("JSON Schema inference"), 153 | complexity = constantComplexity(500), 154 | resolve = ctx => InferJSONSchemaDeferred(ctx arg PathsArg), 155 | arguments = List(PathsArg) 156 | ), 157 | Field( 158 | "parquet", 159 | ParquetSchemaType, 160 | description = Some("Parquet Schema inference"), 161 | complexity = constantComplexity(500), 162 | resolve = ctx => InferParquetSchemaDeferred(ctx arg TypeArg, ctx arg PathsArg), 163 | arguments = List(TypeArg, PathsArg) 164 | ), 165 | Field( 166 | "avro", 167 | AvroSchemaType, 168 | description = Some("Avro Schema inference"), 169 | complexity = constantComplexity(500), 170 | resolve = ctx => InferAvroSchemaDeferred(ctx arg PathsArg), 171 | arguments = List(PathsArg) 172 | ) 173 | ) 174 | ) 175 | } 176 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/schema/MetadataType.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql.schema 2 | 3 | import buildinfo.BuildInfo 4 | import sangria.macros.derive.deriveObjectType 5 | import sangria.schema.ObjectType 6 | 7 | case class Metadata(version: String = BuildInfo.version) 8 | 9 | trait MetadataType { 10 | lazy val MetadataType: ObjectType[Unit, Metadata] = deriveObjectType() 11 | } 12 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/schema/MutationType.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql.schema 2 | 3 | import sangria.macros.derive.deriveContextObjectType 4 | import schemer.registry.graphql.GraphQLService 5 | 6 | trait MutationType extends JSONSchemaType with SchemaType with GraphQLCustomTypes { 7 | val MutationType = deriveContextObjectType[GraphQLService, GraphQLService, Unit](identity) 8 | } 9 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/schema/SchemaDefinition.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql.schema 2 | 3 | import sangria.schema.{fields, Args, Field, ListType, ObjectType, OptionType, Schema} 4 | import schemer.registry.graphql.GraphQLService 5 | import schemer.registry.models.{Schema => SSchema} 6 | 7 | object SchemaDefinition extends InferType with MetadataType with MutationType with SchemaType with GraphQLCustomTypes { 8 | 9 | def constantComplexity[Ctx](complexity: Double) = 10 | Some((_: Ctx, _: Args, child: Double) => child + complexity) 11 | 12 | val QueryType = ObjectType( 13 | "Query", 14 | "Root", 15 | fields[GraphQLService, Unit]( 16 | Field( 17 | "schema", 18 | OptionType(SchemaType), 19 | description = Some("Schema"), 20 | resolve = ctx => ctx.ctx.schema(ctx arg IdArg), 21 | arguments = List(IdArg) 22 | ), 23 | Field( 24 | "schemas", 25 | ListType(SchemaType), 26 | description = Some("All Schemas"), 27 | resolve = ctx => ctx.ctx.allSchemas 28 | ), 29 | Field( 30 | "schemaVersion", 31 | OptionType(SchemaVersionType), 32 | description = Some("Schema Version"), 33 | resolve = ctx => ctx.ctx.schemaVersion(ctx arg IdArg), 34 | arguments = List(IdArg) 35 | ), 36 | Field( 37 | "infer", 38 | InferType, 39 | description = Some("Schema Inference"), 40 | resolve = _ => () 41 | ), 42 | Field( 43 | 
"metadata", 44 | MetadataType, 45 | description = Some("Metadata"), 46 | complexity = constantComplexity(100), 47 | resolve = _ => Metadata() 48 | ) 49 | ) 50 | ) 51 | val schema = Schema(QueryType, Some(MutationType)) 52 | } 53 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/graphql/schema/SchemaType.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.graphql.schema 2 | 3 | import sangria.macros.derive.deriveObjectType 4 | import sangria.schema.{Field, ObjectType, _} 5 | import schemer.{SchemaType => SSchemaType} 6 | import schemer.registry.graphql.{SchemaVersionLatestDeferred, SchemaVersionsDeferred} 7 | import schemer.registry.graphql.schema.SchemaDefinition.constantComplexity 8 | import schemer.registry.models.{ 9 | PageInfo, 10 | SchemaSchemaVersionConnection, 11 | SchemaSchemaVersionEdge, 12 | SchemaVersion, 13 | Schema => SSchema 14 | } 15 | 16 | trait SchemaType extends GraphQLCustomTypes { 17 | lazy implicit val SchemaTypeType = EnumType[SSchemaType]( 18 | "SchemaType", 19 | Some("Supported schema types"), 20 | List( 21 | EnumValue("Avro", value = SSchemaType.Avro), 22 | EnumValue("Csv", value = SSchemaType.Csv), 23 | EnumValue("Json", value = SSchemaType.Json), 24 | EnumValue("ParquetAvro", value = SSchemaType.ParquetAvro), 25 | EnumValue("ParquetCsv", value = SSchemaType.ParquetCsv), 26 | EnumValue("ParquetJson", value = SSchemaType.ParquetJson) 27 | ) 28 | ) 29 | lazy implicit val IdArg = Argument("id", UUIDType) 30 | lazy implicit val FirstArg = Argument("first", OptionInputType(IntType)) 31 | lazy implicit val AfterArg = Argument("after", OptionInputType(StringType)) 32 | lazy implicit val LastArg = Argument("last", OptionInputType(IntType)) 33 | lazy implicit val BeforeArg = Argument("before", OptionInputType(StringType)) 34 | lazy implicit val PageInfo: ObjectType[Unit, PageInfo] = deriveObjectType() 35 | lazy implicit val SchemaVersionType: ObjectType[Unit, SchemaVersion] = deriveObjectType() 36 | lazy implicit val SchemaSchemaVersionEdgeType: ObjectType[Unit, SchemaSchemaVersionEdge] = deriveObjectType() 37 | lazy implicit val SchemaSchemaVersionConnectionType: ObjectType[Unit, SchemaSchemaVersionConnection] = 38 | deriveObjectType() 39 | 40 | val SchemaType: ObjectType[Unit, SSchema] = ObjectType( 41 | "Schema", 42 | "Schema", 43 | fields[Unit, SSchema]( 44 | Field( 45 | "id", 46 | UUIDType, 47 | resolve = _.value.id 48 | ), 49 | Field( 50 | "name", 51 | StringType, 52 | resolve = _.value.name 53 | ), 54 | Field( 55 | "namespace", 56 | StringType, 57 | resolve = _.value.namespace 58 | ), 59 | Field( 60 | "type", 61 | SchemaTypeType, 62 | resolve = ctx => SSchemaType.supportedTypes.find(_.`type` == ctx.value.`type`).get 63 | ), 64 | Field( 65 | "createdOn", 66 | DateTimeType, 67 | resolve = _.value.createdOn 68 | ), 69 | Field( 70 | "createdBy", 71 | StringType, 72 | resolve = _.value.createdBy 73 | ), 74 | Field( 75 | "versions", 76 | ListType(SchemaSchemaVersionConnectionType), 77 | resolve = ctx => 78 | SchemaVersionsDeferred(ctx.value.id, ctx arg FirstArg, ctx arg AfterArg, ctx arg LastArg, ctx arg BeforeArg), 79 | complexity = constantComplexity(200), 80 | arguments = List(FirstArg, AfterArg, LastArg, BeforeArg) 81 | ), 82 | Field( 83 | "latestVersion", 84 | OptionType(SchemaVersionType), 85 | resolve = ctx => SchemaVersionLatestDeferred(ctx.value.id), 86 | complexity = constantComplexity(200) 87 | ) 88 | ) 89 | ) 90 | 
91 | } 92 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/models/Schema.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.models 2 | 3 | import java.util.UUID 4 | 5 | import org.joda.time.DateTime 6 | 7 | case class Schema( 8 | id: UUID, 9 | name: String, 10 | namespace: String, 11 | `type`: String, 12 | createdOn: DateTime, 13 | createdBy: String 14 | ) 15 | 16 | object Schema { 17 | def apply(name: String, namespace: String, `type`: String, createdOn: DateTime, createdBy: String) = 18 | new Schema(null, name, namespace, `type`, createdOn, createdBy) 19 | } 20 | 21 | case class SchemaVersion( 22 | id: UUID, 23 | schemaId: UUID, 24 | version: String, 25 | schema: String, 26 | createdOn: DateTime, 27 | createdBy: String 28 | ) 29 | case class PageInfo(hasNextPage: Boolean, hasPreviousPage: Boolean) { 30 | def hasMore = hasNextPage || hasPreviousPage 31 | } 32 | case class SchemaSchemaVersionEdge(cursor: String, node: SchemaVersion) 33 | case class SchemaSchemaVersionConnection(pageInfo: PageInfo, edges: List[SchemaSchemaVersionEdge]) 34 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/package.scala: -------------------------------------------------------------------------------- 1 | package schemer 2 | 3 | package object registry { 4 | type Cursor = String 5 | } 6 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/routes/GraphQLRoutes.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.routes 2 | 3 | import akka.http.scaladsl.model.StatusCodes.{BadRequest, InternalServerError, OK} 4 | import akka.http.scaladsl.server.Directives.{as, complete, entity, get, getFromResource, path, post} 5 | import sangria.execution._ 6 | import sangria.parser.QueryParser 7 | import sangria.schema.Schema 8 | import spray.json.{JsObject, JsString, JsValue} 9 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._ 10 | import akka.http.scaladsl.server.Directives._ 11 | import sangria.marshalling.sprayJson._ 12 | import schemer.registry.exceptions.SchemerException 13 | import schemer.registry.graphql.{CustomGraphQLResolver, GraphQLService} 14 | import schemer.registry.graphql.schema.SchemaDefinition 15 | 16 | import scala.util.{Failure, Success} 17 | import scala.concurrent.ExecutionContext.Implicits.global 18 | 19 | trait GraphQLRoutes { 20 | val graphQLService: GraphQLService 21 | 22 | case object TooComplexQuery extends Exception 23 | val rejectComplexQueries = QueryReducer.rejectComplexQueries( 24 | 1000, 25 | (_: Double, _: GraphQLService) => TooComplexQuery 26 | ) 27 | 28 | val graphQLExceptionHandler: Executor.ExceptionHandler = { 29 | case (_, TooComplexQuery) => HandledException("Too complex query. 
Please reduce the field selection.") 30 | case (_, e: SchemerException) => HandledException(e.getMessage) 31 | } 32 | 33 | def executeGraphQLQuery(schema: Schema[GraphQLService, Unit], requestJson: JsValue) = { 34 | val JsObject(fields) = requestJson 35 | 36 | val JsString(query) = fields("query") 37 | 38 | val operation = fields.get("operationName") collect { 39 | case JsString(op) => op 40 | } 41 | 42 | val vars = fields.get("variables") match { 43 | case Some(obj: JsObject) => obj 44 | case _ => JsObject.empty 45 | } 46 | 47 | QueryParser.parse(query) match { 48 | 49 | case Success(queryDocument) => 50 | complete( 51 | Executor 52 | .execute( 53 | schema, 54 | queryDocument, 55 | graphQLService, 56 | deferredResolver = CustomGraphQLResolver.deferredResolver, 57 | variables = vars, 58 | operationName = operation, 59 | queryReducers = rejectComplexQueries :: Nil, 60 | exceptionHandler = graphQLExceptionHandler 61 | ) 62 | .map(OK -> _) 63 | .recover { 64 | case error: QueryAnalysisError => BadRequest -> error.resolveError 65 | case error: ErrorWithResolver => InternalServerError -> error.resolveError 66 | } 67 | ) 68 | 69 | case Failure(error) => 70 | complete(BadRequest -> JsObject("error" -> JsString(error.getMessage))) 71 | } 72 | } 73 | 74 | val graphQLRoutes = path("graphql") { 75 | post { 76 | entity(as[JsValue]) { requestJson => 77 | executeGraphQLQuery(SchemaDefinition.schema, requestJson) 78 | } 79 | } ~ get { 80 | getFromResource("graphql/graphiql.html") 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/routes/HealthRoutes.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.routes 2 | 3 | import java.io.{StringWriter, Writer} 4 | import java.util 5 | 6 | import akka.http.scaladsl.model.{HttpCharsets, HttpEntity, MediaType} 7 | import akka.http.scaladsl.server.Directives._ 8 | import io.prometheus.client.Collector.MetricFamilySamples 9 | import io.prometheus.client.CollectorRegistry 10 | import io.prometheus.client.exporter.common.TextFormat 11 | import io.prometheus.client.hotspot.DefaultExports 12 | 13 | trait HealthRoutes { 14 | 15 | DefaultExports.initialize() 16 | private val collectorRegistry = CollectorRegistry.defaultRegistry 17 | private val metricsMediaTypeParams = Map("version" -> "0.0.4") 18 | private val metricsMediaType = 19 | MediaType.customWithFixedCharset("text", "plain", HttpCharsets.`UTF-8`, params = metricsMediaTypeParams) 20 | 21 | def toPrometheusTextFormat(e: util.Enumeration[MetricFamilySamples]): String = { 22 | val writer: Writer = new StringWriter() 23 | TextFormat.write004(writer, e) 24 | 25 | writer.toString 26 | } 27 | 28 | val healthRoutes = path("health") { 29 | get { 30 | complete { 31 | "OK" 32 | } 33 | } 34 | } ~ path("metrics") { 35 | get { 36 | complete { 37 | HttpEntity(metricsMediaType, toPrometheusTextFormat(collectorRegistry.metricFamilySamples())) 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/routes/Routes.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.routes 2 | 3 | import akka.http.scaladsl.model.{StatusCodes, Uri} 4 | import akka.http.scaladsl.server.Directives._ 5 | import akka.http.scaladsl.server.{ExceptionHandler, RejectionHandler} 6 | import 
com.typesafe.scalalogging.StrictLogging 7 | 8 | trait Routes extends GraphQLRoutes with HealthRoutes with StrictLogging { 9 | private val exceptionHandler = ExceptionHandler { 10 | case e: Exception => 11 | logger.error(s"Exception during client request processing: ${e.getMessage}", e) 12 | _.complete((StatusCodes.InternalServerError, "Internal server error")) 13 | } 14 | val rejectionHandler = RejectionHandler.default 15 | val logBlackListPaths = Seq("health") 16 | private def isBlacklistedPath(uri: Uri) = 17 | logBlackListPaths 18 | .map(s"/" + _) 19 | .exists(uri.toString().contains) 20 | val logDuration = extractRequestContext.flatMap { ctx => 21 | val start = System.currentTimeMillis() 22 | mapResponse { resp => 23 | val d = System.currentTimeMillis() - start 24 | if (!isBlacklistedPath(ctx.request.uri)) { 25 | logger.info(s"[${resp.status.intValue()}] ${ctx.request.method.name} ${ctx.request.uri} took: ${d}ms") 26 | } 27 | resp 28 | } & handleRejections(rejectionHandler) 29 | } 30 | val routes = logDuration { 31 | handleExceptions(exceptionHandler) { 32 | encodeResponse { 33 | graphQLRoutes ~ healthRoutes 34 | } 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/routes/SwaggerRoutes.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.routes 2 | 3 | import akka.http.scaladsl.model.StatusCodes 4 | import akka.http.scaladsl.server.Directives._ 5 | 6 | trait SwaggerRoutes { 7 | 8 | val swaggerRoutes = pathPrefix("swagger") { 9 | pathEnd { 10 | extractUri { uri => 11 | redirect(uri + "/", StatusCodes.TemporaryRedirect) 12 | } 13 | } ~ 14 | pathSingleSlash { 15 | getFromResource("swagger-ui/index.html") 16 | } ~ 17 | getFromResourceDirectory("swagger-ui") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/server/ConfigWithDefault.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.server 2 | 3 | import java.net.InetAddress 4 | import java.util.concurrent.TimeUnit 5 | 6 | import com.typesafe.config.{Config, ConfigFactory} 7 | 8 | trait ConfigWithDefault { 9 | 10 | def rootConfig: Config 11 | 12 | def getBoolean(path: String, default: Boolean) = ifHasPath(path, default) { _.getBoolean(path) } 13 | def getString(path: String, default: String) = ifHasPath(path, default) { _.getString(path) } 14 | def getInt(path: String, default: Int) = ifHasPath(path, default) { _.getInt(path) } 15 | def getConfig(path: String, default: Config) = ifHasPath(path, default) { _.getConfig(path) } 16 | def getMilliseconds(path: String, default: Long) = ifHasPath(path, default) { 17 | _.getDuration(path, TimeUnit.MILLISECONDS) 18 | } 19 | def getOptionalString(path: String, default: Option[String] = None) = getOptional(path) { _.getString(path) } 20 | 21 | def loadDefault(rootName: String, loadEnvConf: Boolean = true) = 22 | if (loadEnvConf) { 23 | ConfigFactory 24 | .parseResources(s"env-conf/$getHostname.conf") 25 | .withFallback(ConfigFactory.load()) 26 | .getConfig(rootName) 27 | } else { 28 | ConfigFactory.load().getConfig(rootName) 29 | } 30 | 31 | protected def getHostname = InetAddress.getLocalHost.getHostName 32 | 33 | private def ifHasPath[T](path: String, default: T)(get: Config => T): T = 34 | if (rootConfig.hasPath(path)) get(rootConfig) else default 35 | 36 | private def 
getOptional[T](fullPath: String, default: Option[T] = None)(get: Config => T) = 37 | if (rootConfig.hasPath(fullPath)) { 38 | Some(get(rootConfig)) 39 | } else { 40 | default 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/server/InferenceConfig.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.server 2 | 3 | import com.typesafe.config.Config 4 | import java.util.concurrent.TimeUnit.SECONDS 5 | import scala.concurrent.duration._ 6 | 7 | trait InferenceConfig extends ConfigWithDefault { 8 | def rootConfig: Config 9 | lazy val inferenceConfig = rootConfig.getConfig("inference") 10 | lazy val inferTimeout = inferenceConfig.getDuration("timeout", SECONDS).seconds 11 | } 12 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/server/Main.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.server 2 | 3 | import akka.actor.ActorSystem 4 | import akka.http.scaladsl.Http 5 | import akka.http.scaladsl.Http.ServerBinding 6 | import akka.stream.ActorMaterializer 7 | import com.typesafe.scalalogging.StrictLogging 8 | import schemer.registry.routes.Routes 9 | 10 | import scala.concurrent.ExecutionContext.Implicits.global 11 | import scala.concurrent.Future 12 | import scala.util.{Failure, Success} 13 | 14 | class Main() extends StrictLogging { 15 | 16 | def start(): (Future[ServerBinding], Modules) = { 17 | 18 | implicit val _system: ActorSystem = ActorSystem("main") 19 | implicit val _materializer: ActorMaterializer = ActorMaterializer() 20 | 21 | val modules = new Modules with Routes { 22 | implicit lazy val ec = _system.dispatcher 23 | implicit lazy val mat = _materializer 24 | lazy val system = _system 25 | 26 | } 27 | 28 | (Http().bindAndHandle(modules.routes, modules.config.serverHost, modules.config.serverPort), modules) 29 | } 30 | } 31 | 32 | object Main extends App with StrictLogging { 33 | val (startFuture, modules) = new Main().start() 34 | 35 | val host = modules.config.serverHost 36 | val port = modules.config.serverPort 37 | 38 | val system = modules.system 39 | 40 | startFuture.onComplete { 41 | case Success(b) => 42 | logger.info(s"Server started on $host:$port") 43 | sys.addShutdownHook { 44 | b.unbind() 45 | shutdown() 46 | } 47 | case Failure(e) => 48 | logger.error(s"Cannot start server on $host:$port", e) 49 | sys.addShutdownHook { 50 | shutdown() 51 | } 52 | } 53 | 54 | def shutdown() { 55 | modules.system.terminate() 56 | logger.info("Server stopped") 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/server/Modules.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.server 2 | 3 | import akka.actor.{ActorSystem, Props} 4 | import akka.routing.BalancingPool 5 | import akka.stream.Materializer 6 | import akka.util.Timeout 7 | import com.typesafe.config.Config 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.sql.SparkSession 10 | import schemer.registry.actors.InferActor 11 | import schemer.registry.dao.SchemaDao 12 | import schemer.registry.graphql.GraphQLService 13 | import schemer.registry.sql.{DatabaseConfig, SqlDatabase} 14 | import schemer.registry.utils.RealTimeClock 15 | 16 | import 
scala.concurrent.ExecutionContext 17 | import scala.concurrent.duration._ 18 | 19 | trait Modules { 20 | 21 | implicit def system: ActorSystem 22 | 23 | implicit def ec: ExecutionContext 24 | 25 | implicit def mat: Materializer 26 | 27 | lazy val config = new ServerConfig with DatabaseConfig with InferenceConfig { 28 | override def rootConfig: Config = loadDefault("registry") 29 | } 30 | 31 | implicit lazy val clock = RealTimeClock 32 | 33 | implicit val spark: SparkSession = SparkSession.builder 34 | .config(new SparkConf()) 35 | .master("local[*]") 36 | .getOrCreate() 37 | 38 | val hadoopConf = spark.sparkContext.hadoopConfiguration 39 | 40 | val sqlDatabase = SqlDatabase(config) 41 | sqlDatabase.updateSchema() 42 | 43 | lazy val schemaDao = new SchemaDao(sqlDatabase) 44 | lazy val inferActor = locally { 45 | implicit lazy val inferTimeout = Timeout(config.inferTimeout) 46 | system.actorOf(Props(new InferActor()).withRouter(BalancingPool(nrOfInstances = 10)), name = "InferActor") 47 | } 48 | lazy val graphQLService = locally { 49 | implicit lazy val inferActorTimeout = Timeout(config.inferTimeout + 20.seconds) 50 | new GraphQLService(schemaDao, inferActor) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/server/ServerConfig.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.server 2 | 3 | import com.typesafe.config.Config 4 | 5 | trait ServerConfig extends ConfigWithDefault { 6 | 7 | def rootConfig: Config 8 | 9 | lazy val serverHost: String = rootConfig.getString("server.host") 10 | lazy val serverPort: Int = rootConfig.getInt("server.port") 11 | } 12 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/sql/DatabaseConfig.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.sql 2 | 3 | import com.typesafe.config.Config 4 | import schemer.registry.server.ConfigWithDefault 5 | 6 | trait DatabaseConfig extends ConfigWithDefault { 7 | def rootConfig: Config 8 | 9 | val h2config = rootConfig.getConfig("h2") 10 | val postgresConfig = rootConfig.getConfig("postgres") 11 | } 12 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/sql/SqlDatabase.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.sql 2 | 3 | import io.getquill.{PostgresAsyncContext, SnakeCase} 4 | import org.apache.commons.lang3.StringUtils 5 | import org.flywaydb.core.Flyway 6 | import org.joda.time.DateTime 7 | 8 | trait Quotes { this: PostgresAsyncContext[_] => 9 | implicit class DateTimeQuotes(l: DateTime) { 10 | def >(r: DateTime) = quote(infix"$l > $r".as[Boolean]) 11 | def <(r: DateTime) = quote(infix"$l < $r".as[Boolean]) 12 | } 13 | 14 | implicit class OptDateTimeQuotes(l: Option[DateTime]) { 15 | def >(r: DateTime) = quote(infix"($l::timestamptz is null or $l > $r)".as[Boolean]) 16 | def <(r: DateTime) = quote(infix"($l::timestamptz is null or $l < $r)".as[Boolean]) 17 | } 18 | } 19 | 20 | case class SqlDatabase(config: DatabaseConfig) { 21 | lazy val ctx = new PostgresAsyncContext(SnakeCase, config.postgresConfig) with Quotes 22 | 23 | def updateSchema() = { 24 | val postgresUrl = config.postgresConfig.getString("url") 25 | if (StringUtils.isNotEmpty(postgresUrl)) { 26 
| val flyway = new Flyway() 27 | flyway.setOutOfOrder(true) 28 | flyway.setDataSource(s"jdbc:$postgresUrl", "", "") 29 | flyway.migrate() 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/sql/package.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry 2 | 3 | import io.getquill.{PostgresAsyncContext, SnakeCase} 4 | 5 | package object sql { 6 | type DbContext = PostgresAsyncContext[SnakeCase] 7 | } 8 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/utils/Clock.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.utils 2 | 3 | import org.joda.time.{DateTime, DateTimeZone, Duration} 4 | import org.joda.time.format.PeriodFormatterBuilder 5 | 6 | trait Clock { 7 | def now: DateTime 8 | def nowUtc: DateTime 9 | def nowMillis: Long 10 | } 11 | 12 | object RealTimeClock extends Clock with Serializable { 13 | def now = DateTime.now() 14 | def nowUtc = DateTime.now(DateTimeZone.UTC) 15 | def nowMillis = System.currentTimeMillis() 16 | } 17 | 18 | class FixtureTimeClock(millis: Long) extends Clock with Serializable { 19 | def now = new DateTime(millis) 20 | def nowUtc = new DateTime(millis, DateTimeZone.UTC) 21 | def nowMillis = millis 22 | } 23 | 24 | class FormatDuration() { 25 | def format(time: Duration): String = { 26 | val period = time.toPeriod() 27 | val hms = new PeriodFormatterBuilder() 28 | .printZeroAlways() 29 | .appendHours() 30 | .appendSeparator(" hours ") 31 | .appendMinutes() 32 | .appendSeparator(" minutes ") 33 | .appendSeconds() 34 | .appendSuffix(" seconds") 35 | .toFormatter() 36 | hms.print(period) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /schemer-registry/src/main/scala/schemer/registry/utils/DateTimeUtils.scala: -------------------------------------------------------------------------------- 1 | package schemer.registry.utils 2 | 3 | import java.nio.charset.StandardCharsets 4 | import java.util.Base64 5 | 6 | import org.joda.time.DateTime 7 | import schemer.registry.Cursor 8 | 9 | object DateTimeUtils { 10 | implicit class DateTimeCursor(val dt: DateTime) { 11 | def toCursor: Cursor = Base64.getEncoder.encodeToString(dt.getMillis.toString.getBytes(StandardCharsets.UTF_8)) 12 | } 13 | 14 | implicit class CursorDateTime(val cursor: Cursor) { 15 | def toDateTime: DateTime = 16 | new DateTime(new String(Base64.getDecoder.decode(cursor), StandardCharsets.UTF_8).toLong) 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /schemer-ui.md: -------------------------------------------------------------------------------- 1 | # Schemer UI screens 2 | 3 | ## Browse schemas 4 | 5 |
[screenshot]
6 | 
7 | [screenshot]
8 | 
9 | ## Schema Details
10 | 
11 | [screenshot]
12 | 
13 | [screenshot]
14 | 
15 | ## JSON representation of Schema
16 | 
17 | [screenshot]
18 | 
19 | [screenshot]
20 | 
21 | ## Create Schema
22 | 
23 | [screenshot]
24 | 
25 | [screenshot]
26 | 
27 | ## Create Schema Version
28 | 
29 | [screenshot]
30 | 
31 | [screenshot]
32 | 
33 | ## Field definition Wizard
34 | 
35 | [screenshot]
36 | 
37 | [screenshot]
-------------------------------------------------------------------------------- /secring.gpg.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/indix/schemer/16069813fa20b652f8e7735b64a47519e3ee8f14/secring.gpg.enc -------------------------------------------------------------------------------- /sonatype.sbt: -------------------------------------------------------------------------------- 1 | credentials += Credentials( 2 | "Sonatype Nexus Repository Manager", 3 | "oss.sonatype.org", 4 | System.getenv("SONATYPE_USERNAME"), 5 | System.getenv("SONATYPE_PASSWORD") 6 | ) 7 | --------------------------------------------------------------------------------
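
A minimal standalone sketch (not part of the repository) of the pagination-cursor encoding used by `DateTimeUtils` above: a cursor is simply the Base64-encoded millisecond timestamp of a `DateTime`, so encoding and decoding are symmetric. The object name and the sample timestamp are illustrative only.

```scala
import java.nio.charset.StandardCharsets
import java.util.Base64

import org.joda.time.DateTime

object CursorRoundTrip extends App {
  // Arbitrary test value: 2018-01-01T00:00:00Z in epoch millis.
  val original = new DateTime(1514764800000L)

  // Encode (DateTime -> cursor), mirroring DateTimeCursor.toCursor:
  // Base64 of the decimal millis string.
  val cursor: String =
    Base64.getEncoder.encodeToString(original.getMillis.toString.getBytes(StandardCharsets.UTF_8))

  // Decode (cursor -> DateTime), mirroring CursorDateTime.toDateTime:
  // Base64-decode, parse the millis, rebuild the DateTime.
  val restored =
    new DateTime(new String(Base64.getDecoder.decode(cursor), StandardCharsets.UTF_8).toLong)

  assert(restored.getMillis == original.getMillis)
  println(s"cursor=$cursor restored=$restored") // cursor=MTUxNDc2NDgwMDAwMA==
}
```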