├── .credo.exs ├── .formatter.exs ├── .gitignore ├── .semaphore └── semaphore.yml ├── .tool-versions ├── LICENSE ├── README.md ├── config ├── config.exs └── test.exs ├── docs └── todo.md ├── lib ├── adapters │ ├── csv.ex │ ├── file_manager │ │ └── file_manager.ex │ └── identity.ex ├── data_quacker.ex ├── data_quacker │ ├── adapter.ex │ ├── builder.ex │ ├── context.ex │ ├── matcher.ex │ ├── skipper.ex │ ├── sourcer.ex │ ├── transformer.ex │ └── validator.ex └── schema │ ├── error.ex │ ├── helpers │ ├── fun_wrapper.ex │ └── wrapped_fun.ex │ ├── schema.ex │ └── state.ex ├── mix.exs ├── mix.lock ├── priv └── plts │ └── .gitignore └── test ├── adapters ├── csv_test.exs └── identity_test.exs ├── data_quacker └── skipper_test.exs ├── examples ├── pond_example_test.exs ├── pricing_example_test.exs └── students_example_test.exs ├── helpers_tests └── fun_wrapper_test.exs ├── schema └── state_test.exs ├── support ├── case.ex └── mock_file_manager.ex └── test_helper.exs /.credo.exs: -------------------------------------------------------------------------------- 1 | %{ 2 | configs: [ 3 | %{ 4 | name: "default", 5 | files: %{ 6 | included: [ 7 | "lib/", 8 | "test/", 9 | "priv/" 10 | ], 11 | excluded: [~r"/_build/", ~r"/deps/", ~r"/node_modules/", ~r"/priv/repo/migrations/"] 12 | }, 13 | plugins: [], 14 | requires: [], 15 | strict: true, 16 | parse_timeout: 5000, 17 | color: true, 18 | checks: [ 19 | {Credo.Check.Consistency.ExceptionNames, []}, 20 | {Credo.Check.Consistency.LineEndings, []}, 21 | {Credo.Check.Consistency.ParameterPatternMatching, []}, 22 | {Credo.Check.Consistency.SpaceAroundOperators, []}, 23 | {Credo.Check.Consistency.SpaceInParentheses, []}, 24 | {Credo.Check.Consistency.TabsOrSpaces, []}, 25 | {Credo.Check.Design.AliasUsage, 26 | [priority: :low, if_nested_deeper_than: 3, if_called_more_often_than: 1]}, 27 | {Credo.Check.Design.TagTODO, false}, 28 | {Credo.Check.Design.TagFIXME, []}, 29 | {Credo.Check.Readability.AliasOrder, []}, 30 | 
{Credo.Check.Readability.FunctionNames, []}, 31 | {Credo.Check.Readability.LargeNumbers, []}, 32 | {Credo.Check.Readability.MaxLineLength, [priority: :low, max_length: 120]}, 33 | {Credo.Check.Readability.ModuleAttributeNames, []}, 34 | {Credo.Check.Readability.ModuleDoc, false}, 35 | {Credo.Check.Readability.ModuleNames, []}, 36 | {Credo.Check.Readability.ParenthesesInCondition, []}, 37 | {Credo.Check.Readability.ParenthesesOnZeroArityDefs, false}, 38 | {Credo.Check.Readability.PredicateFunctionNames, []}, 39 | {Credo.Check.Readability.PreferImplicitTry, []}, 40 | {Credo.Check.Readability.RedundantBlankLines, []}, 41 | {Credo.Check.Readability.Semicolons, []}, 42 | {Credo.Check.Readability.SpaceAfterCommas, []}, 43 | {Credo.Check.Readability.StringSigils, []}, 44 | {Credo.Check.Readability.TrailingBlankLine, []}, 45 | {Credo.Check.Readability.TrailingWhiteSpace, []}, 46 | {Credo.Check.Readability.UnnecessaryAliasExpansion, []}, 47 | {Credo.Check.Readability.VariableNames, []}, 48 | {Credo.Check.Refactor.CondStatements, []}, 49 | {Credo.Check.Refactor.CyclomaticComplexity, false}, 50 | {Credo.Check.Refactor.FunctionArity, []}, 51 | {Credo.Check.Refactor.LongQuoteBlocks, []}, 52 | {Credo.Check.Refactor.MatchInCondition, []}, 53 | {Credo.Check.Refactor.NegatedConditionsInUnless, []}, 54 | {Credo.Check.Refactor.NegatedConditionsWithElse, []}, 55 | {Credo.Check.Refactor.Nesting, []}, 56 | {Credo.Check.Refactor.UnlessWithElse, []}, 57 | {Credo.Check.Refactor.WithClauses, []}, 58 | {Credo.Check.Warning.ApplicationConfigInModuleAttribute, []}, 59 | {Credo.Check.Warning.BoolOperationOnSameValues, []}, 60 | {Credo.Check.Warning.ExpensiveEmptyEnumCheck, []}, 61 | {Credo.Check.Warning.IExPry, []}, 62 | {Credo.Check.Warning.IoInspect, []}, 63 | {Credo.Check.Warning.MixEnv, false}, 64 | {Credo.Check.Warning.OperationOnSameValues, []}, 65 | {Credo.Check.Warning.OperationWithConstantResult, []}, 66 | {Credo.Check.Warning.RaiseInsideRescue, []}, 67 | 
{Credo.Check.Warning.UnusedEnumOperation, []}, 68 | {Credo.Check.Warning.UnusedFileOperation, []}, 69 | {Credo.Check.Warning.UnusedKeywordOperation, []}, 70 | {Credo.Check.Warning.UnusedListOperation, []}, 71 | {Credo.Check.Warning.UnusedPathOperation, []}, 72 | {Credo.Check.Warning.UnusedRegexOperation, []}, 73 | {Credo.Check.Warning.UnusedStringOperation, []}, 74 | {Credo.Check.Warning.UnusedTupleOperation, []}, 75 | {Credo.Check.Warning.UnsafeExec, []}, 76 | {Credo.Check.Consistency.MultiAliasImportRequireUse, []}, 77 | {Credo.Check.Consistency.UnusedVariableNames, []}, 78 | {Credo.Check.Design.DuplicatedCode, false}, 79 | {Credo.Check.Readability.AliasAs, []}, 80 | {Credo.Check.Readability.BlockPipe, false}, 81 | {Credo.Check.Readability.ImplTrue, []}, 82 | {Credo.Check.Readability.MultiAlias, []}, 83 | {Credo.Check.Readability.SeparateAliasRequire, false}, 84 | {Credo.Check.Readability.SinglePipe, []}, 85 | {Credo.Check.Readability.Specs, false}, 86 | {Credo.Check.Readability.StrictModuleLayout, false}, 87 | {Credo.Check.Readability.WithCustomTaggedTuple, false}, 88 | {Credo.Check.Refactor.ABCSize, false}, 89 | {Credo.Check.Refactor.AppendSingleItem, []}, 90 | {Credo.Check.Refactor.DoubleBooleanNegation, []}, 91 | {Credo.Check.Refactor.ModuleDependencies, []}, 92 | {Credo.Check.Refactor.NegatedIsNil, false}, 93 | {Credo.Check.Refactor.PipeChainStart, []}, 94 | {Credo.Check.Refactor.VariableRebinding, false}, 95 | {Credo.Check.Warning.LeakyEnvironment, false}, 96 | {Credo.Check.Warning.MapGetUnsafePass, false}, 97 | {Credo.Check.Warning.UnsafeToAtom, []} 98 | ] 99 | } 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where third-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 23 | data_quacker-*.tar 24 | 25 | -------------------------------------------------------------------------------- /.semaphore/semaphore.yml: -------------------------------------------------------------------------------- 1 | version: "v1.0" 2 | name: Tests and checks 3 | agent: 4 | machine: 5 | type: e1-standard-2 6 | os_image: ubuntu1804 7 | containers: 8 | - name: main 9 | image: semaphoreci/elixir:1.12.0 10 | 11 | blocks: 12 | - name: Install dependencies 13 | task: 14 | jobs: 15 | - name: Install dependencies 16 | commands: 17 | - checkout 18 | - mix local.hex --force 19 | - mix local.rebar --force 20 | - cache restore 21 | - cache restore dialyzer-plt 22 | - mix do deps.get, compile, dialyzer --plt 23 | - MIX_ENV=test mix compile 24 | - cache store 25 | - cache store dialyzer-plt priv/plts/ 26 | - name: Run checks 27 | task: 28 | prologue: 29 | commands: 30 | - checkout 31 | - mix local.hex --force 32 | - mix local.rebar --force 33 | - cache restore 34 | - cache restore dialyzer-plt 35 | jobs: 36 | - name: Run formatter 37 | commands: 38 | - mix format --check-formatted 39 | - name: Run credo 40 | commands: 41 | - mix credo --strict 42 | - name: Run dialyzer 43 | commands: 44 | - mix dialyzer 45 | - cache store 46 | - 
name: Run tests 47 | task: 48 | env_vars: 49 | - name: MIX_ENV 50 | value: test 51 | prologue: 52 | commands: 53 | - checkout 54 | - mix local.hex --force 55 | - mix local.rebar --force 56 | - cache restore 57 | jobs: 58 | - name: Run tests 59 | commands: 60 | - mix test 61 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | elixir 1.12.2-otp-24 2 | erlang 24.0.5 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataQuacker 2 | 3 | DataQuacker is a library which aims to help validate, transform, and parse non-sandboxed data, like CSV files. 4 | 5 | It features a simple DSL similar to that of Ecto, which allows the user to declaratively describe the rules for mapping columns in the source into a desired structure. It also makes it easy to specify rules for validating, transforming and skipping specific fields and rows. 6 | 7 | The documentation along with usage examples can be found at [hexdocs.pm](https://hexdocs.pm/data_quacker/DataQuacker.html) 8 | 9 | To see the next steps for this library take a look at: [todo.md](./docs/todo.md) 10 | 11 | ## Installation 12 | 13 | To install the library, add it to your `mix.exs` deps. 14 | 15 | ```elixir 16 | def deps do 17 | [ 18 | {:data_quacker, "~> 0.1.1"} 19 | ] 20 | end 21 | ``` 22 | 23 | ## Contribution 24 | 25 | Any contribution is greatly appreciated. 
If you find anything working incorrectly or missing in this library or its documentation, please open an issue or a pull request. 26 | 27 | Issues inquiring about usage and best practices are also welcome. 28 | 29 | ## Testimonials 30 | 31 | "..." ~ the rubber duck on my desk 32 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | if Mix.env() == :test do 4 | import_config("test.exs") 5 | end 6 | -------------------------------------------------------------------------------- /config/test.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | config :data_quacker, :file_manager, DataQuacker.MockFileManager 4 | -------------------------------------------------------------------------------- /docs/todo.md: -------------------------------------------------------------------------------- 1 | # TODOs 2 | - [x] Rename `parse` to `transform` 3 | - [x] Throw a meaningful exception if a transformer or validator returns an unexpected data type 4 | - [x] Basic documentation 5 | - [x] Generalise parsing, allow adapters 6 | - [x] Guard against fields or schemas with non-atom names 7 | - [x] Guard against two fields of identical name in the same parent, two schemas of identical name within the same module 8 | - [x] Example tests 9 | - [ ] Tests for Schema 10 | - [ ] Tests for DataQuacker (core) 11 | - [ ] Tests for Adapters 12 | - [ ] Typespecs, Dialyzer 13 | - [ ] Full documentation 14 | -------------------------------------------------------------------------------- /lib/adapters/csv.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Adapters.CSV do 2 | @moduledoc ~S""" 3 | This is a CSV adapter which can parse CSV files 4 | from the local filesystem or fetched over http. 
5 | 6 | It is the default used if no adapter is specified. 7 | 8 | ## Example source 9 | 10 | - Local file path: `"path/to/csv/file.csv"` 11 | - Remote file url: `"https://remote_file.com/file/abc"` 12 | """ 13 | 14 | @behaviour DataQuacker.Adapter 15 | 16 | @impl DataQuacker.Adapter 17 | @doc ~S""" 18 | Takes in a string with the path or url to the file, and a keyword list of options. 19 | 20 | ## Options 21 | - `:separator` - the ASCII value of the column separator in the CSV file; usually retrieved with the `?*` notation where "*" is the character, for example: `?,` for a comma, `?;` for a semicolon, etc. 22 | - `:local?` - a boolean flag representing whether the file is present on the local file system or on a remote server 23 | """ 24 | def parse_source(file_path_or_url, opts) do 25 | case get_file(file_path_or_url, opts) do 26 | {:ok, raw_data} -> decode_source(raw_data, get_separator(opts)) 27 | error -> error 28 | end 29 | end 30 | 31 | defp get_file(file_path_or_url, opts) do 32 | case Keyword.get(opts, :local?, true) do 33 | true -> {:ok, file_manager().stream!(file_path_or_url)} 34 | false -> {:ok, file_manager().read_link!(file_path_or_url)} 35 | end 36 | rescue 37 | _error -> {:error, "File does not exist or is corrupted"} 38 | end 39 | 40 | defp decode_source(source_stream, separator) do 41 | source_stream 42 | |> CSV.decode(separator: separator) 43 | |> Enum.into([]) 44 | |> case do 45 | [headers | rows] -> {:ok, %{headers: headers, rows: rows}} 46 | error -> error 47 | end 48 | end 49 | 50 | defp get_separator(opts) do 51 | Keyword.get(opts, :separator, ?,) 52 | end 53 | 54 | @impl DataQuacker.Adapter 55 | def get_headers(%{headers: headers}), do: headers 56 | 57 | @impl DataQuacker.Adapter 58 | def get_rows(%{rows: rows}), do: {:ok, rows} 59 | 60 | @impl DataQuacker.Adapter 61 | def get_row(row), do: row 62 | 63 | defp file_manager() do 64 | Application.get_env(:data_quacker, :file_manager) || DataQuacker.FileManager 65 | end 66 | end 67 | 
-------------------------------------------------------------------------------- /lib/adapters/file_manager/file_manager.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.FileManager do 2 | @moduledoc false 3 | 4 | @callback stream!(Path.t()) :: Enumerable.t() | File.Stream.t() | {:error, String.t()} 5 | @callback read_link!(Path.t()) :: {:ok, binary()} | {:error, String.t()} 6 | 7 | defdelegate stream!(path), to: File 8 | defdelegate read_link!(url), to: File 9 | end 10 | -------------------------------------------------------------------------------- /lib/adapters/identity.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Adapters.Identity do 2 | @moduledoc ~S""" 3 | This is an "identity adapter". 4 | It takes in a map with `:headers` and `:rows` as the keys. 5 | 6 | This adapter is very useful for testing a particular schema, 7 | but can also be used as the actual adapter if needed. 8 | 9 | ## Example source 10 | 11 | ```elixir 12 | %{ 13 | headers: ["First name", "Last name", "Age"], 14 | rows: [ 15 | ["John", "Smith", "21"], 16 | # ... 17 | ] 18 | } 19 | ``` 20 | """ 21 | 22 | @behaviour DataQuacker.Adapter 23 | 24 | @impl DataQuacker.Adapter 25 | @doc ~S""" 26 | Takes in a map with `:headers` and `:rows` keys, where the value under `:headers` is a list of strings, and the value under `:rows` is a list of lists of anything. 27 | 28 | > Note: Each list in the rows list must be of the same length as the headers list. 
29 | """ 30 | def parse_source(source, _opts) do 31 | {:ok, source} 32 | end 33 | 34 | @impl DataQuacker.Adapter 35 | def get_headers(%{headers: headers}), do: {:ok, headers} 36 | 37 | @impl DataQuacker.Adapter 38 | def get_rows(%{rows: rows}), do: {:ok, rows} 39 | 40 | @impl DataQuacker.Adapter 41 | def get_row(row), do: {:ok, row} 42 | end 43 | -------------------------------------------------------------------------------- /lib/data_quacker.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker do 2 | @moduledoc """ 3 | DataQuacker is a library which aims to help validate, transform, and parse non-sandboxed data. 4 | 5 | The most common example for such data, and the original idea behind this project, is CSV files. 6 | The scope of this library is not, however, in any way limited to CSV files. 7 | This library ships by default with two adapters: `DataQuacker.Adapters.CSV` for CSV files, 8 | and `DataQuacker.Adapters.Identity` for "in-memory data". 9 | Any other data source may be used with the help of third-party adapters; see: `DataQuacker.Adapter`. 10 | 11 | This library is comprised of three main components: 12 | 13 | - `DataQuacker`, which provides the `parse/4` function to parse data using a schema 14 | - `DataQuacker.Schema`, which provides a DSL for declaratively defining schemas which describe the mapping between the source data and the desired output 15 | - `DataQuacker.Adapters.CSV` and `DataQuacker.Adapters.Identity`, which extract data from sources into a format required by the `parse/4` function 16 | 17 | > Note: If you find anything missing from or unclear in the documentation, please do not hesitate to open an issue on the project's [Github repository](https://github.com/fiodorbaczynski/data_quacker). 18 | 19 | ## Testing 20 | 21 | The tests for parsing data which is external or non-sandboxed are often difficult to implement well, 22 | since that data may need to change over time. 
23 | For example, editing CSV files used for tests, when the requirements change, 24 | can be tedious. 25 | 26 | For this reason, using a different adapter, which takes Elixir data as the input, for tests is recommended. 27 | In integration tests for this library the `DataQuacker.Adapters.Identity` adapter is used. 28 | 29 | The easiest way to switch out adapters in tests is to put the desired adapter in the `test.exs` config. 30 | You can find out how to do this under the "Options" section in the documentation for the `parse/4` function. 31 | 32 | ## Examples 33 | 34 | > Note: Most of the "juice", like transforming, validating, nesting, skipping, etc., is in the `DataQuacker.Schema` module, so the more complex and interesting examples also live there. Please take a look at its documentation for more in-depth examples. 35 | 36 | > Note: A fully working implementation of these examples can be found in the tests inside the "examples" directory. 37 | 38 | Given the following table of ducks in a pond, in the form of a CSV file: 39 | 40 | | Type | Colour | Age | 41 | |:--------:|:--------------:|-----| 42 | | Mallard | green | 3 | 43 | | Domestic | white | 2 | 44 | | Mandarin | multi-coloured | 4 | 45 | 46 | we want to have a list of maps with `:type`, `:colour` and `:age` as the keys. 
47 | 48 | This can be achieved by creating the following schema and parser modules: 49 | 50 | Schema 51 | 52 | ```elixir 53 | defmodule PondSchema do 54 | use DataQuacker.Schema 55 | 56 | schema :pond do 57 | field :type do 58 | source("type") 59 | end 60 | 61 | field :colour do 62 | # make the "u" optional 63 | # in case we get an American data source :) 64 | 65 | source(~r/colou?r/i) 66 | end 67 | 68 | field :age do 69 | source("age") 70 | end 71 | end 72 | end 73 | ``` 74 | 75 | Parser 76 | 77 | ``` 78 | defmodule PondParser do 79 | def parse(file_path) do 80 | DataQuacker.parse( 81 | file_path, 82 | PondSchema.schema_structure(:pond), 83 | nil 84 | ) 85 | end 86 | end 87 | ``` 88 | 89 | ```elixir 90 | iex> PondParser.parse("path/to/file.csv") 91 | iex> {:ok, [ 92 | iex> {:ok, %{type: "Mandarin", colour: "multi-coloured", age: "4"}}, 93 | iex> {:ok, %{type: "Domestic", colour: "white", age: "2"}}, 94 | iex> {:ok, %{type: "Mallard", colour: "green", age: "3"}}, 95 | iex> ]} 96 | ``` 97 | 98 | Using this schema and parser we get a tuple of `:ok` or `:error`, and a list of rows, 99 | each of which is also a tuple of `:ok` or `:error`, but with a map as the second element. 100 | The topmost `:ok` or `:error` indicates whether *all* rows are valid, 101 | and those for individual rows indicate whether that particular row is valid 102 | 103 | > Note: The rows in the result are in the reverse order compared to the source rows. This is because for large lists reversing may be an expensive operation, which is often redundant, for example if the result is supposed to be inserted in a database. 104 | 105 | Now suppose we also want to validate that the type is one in a list of types we know, 106 | and get the age in the form of an integer. 
107 | We need to make some changes to our schema 108 | 109 | ```elixir 110 | defmodule PondSchema do 111 | use DataQuacker.Schema 112 | 113 | schema :pond do 114 | field :type do 115 | validate(fn type -> type in ["Mallard", "Domestic", "Mandarin"] end) 116 | 117 | source("type") 118 | end 119 | 120 | field :colour do 121 | # make the "u" optional 122 | # in case we get an American data source :) 123 | 124 | source(~r/colou?r/i) 125 | end 126 | 127 | field :age do 128 | transform(fn age_str -> 129 | case Integer.parse(age_str) do 130 | {age_int, _} -> {:ok, age_int} 131 | :error -> :error 132 | end 133 | end) 134 | 135 | source("age") 136 | end 137 | end 138 | end 139 | ``` 140 | 141 | Using the same input file the output is now: 142 | 143 | ```elixir 144 | iex> PondParser.parse("path/to/file.csv") 145 | iex> {:ok, [ 146 | iex> {:ok, %{type: "Mandarin", colour: "multi-coloured", age: 4}}, 147 | iex> {:ok, %{type: "Domestic", colour: "white", age: 2}}, 148 | iex> {:ok, %{type: "Mallard", colour: "green", age: 3}}, 149 | iex> ]} 150 | ``` 151 | 152 | (the difference is in the type of "age") 153 | 154 | If we add some invalid fields to the file, however, the result will be quite different: 155 | 156 | | Type | Colour | Age | 157 | |:--------:|:--------------:|----------| 158 | | Mallard | green | 3 | 159 | | Domestic | white | 2 | 160 | | Mandarin | multi-coloured | 4 | 161 | | Mystery | golden | 100 | 162 | | Black | black | Infinity | 163 | 164 | ```elixir 165 | iex> PondParser.parse("path/to/file.csv") 166 | iex> {:error, [ 167 | iex> :error, 168 | iex> :error, 169 | iex> {:ok, %{type: "Mandarin", colour: "multi-coloured", age: 4}}, 170 | iex> {:ok, %{type: "Domestic", colour: "white", age: 2}}, 171 | iex> {:ok, %{type: "Mallard", colour: "green", age: 3}}, 172 | iex> ]} 173 | ``` 174 | 175 | Since the last two rows of the input are invalid, the first two rows in the output are errors. 
176 | 177 | > Note: The errors can be made more descriptive by returning tuples `{:error, any()}` from the validators and parsers. You can see this in action in the examples for the `DataQuacker.Schema` module. 178 | """ 179 | 180 | alias DataQuacker.Builder 181 | 182 | @doc """ 183 | Takes in a source, a schema, support data, and a keyword list of options. 184 | Returns a tuple with `:ok` or `:error` (indicating whether all rows are valid) as the first element, 185 | and a list of tuples `{:ok, map()} | {:error, any()} | :error)`. 186 | In case of `{:ok, map()}` for a given row, the map is the output defined in the schema. 187 | 188 | ## Source 189 | 190 | Any data which will be given to the adapter so that it can retrieve the source data. 191 | In case of the `DataQuacker.Adapter.CSV` this can be a file path or a file url. 192 | 193 | ## Schema 194 | 195 | A schema formed with the DSL from `DataQuacker.Schema`. 196 | 197 | ## Support data 198 | 199 | Any data which is supposed to be accessible inside various schema elements when parsing a source. 200 | 201 | ## Options 202 | 203 | The options can also be specified in the config, for example: 204 | 205 | ```elixir 206 | use Mix.Config 207 | 208 | # ... 209 | 210 | config :data_quacker, 211 | adapter: DataQuacker.Adapters.Identity, 212 | adapter_opts: [] 213 | 214 | # ... 
215 | ``` 216 | 217 | - `:adapter` - the adapter module to be used to retrieve the source data; defaults to `DataQuacker.Adapters.CSV` 218 | - `:adapter_opts` - a keyword list of opts to be passed to the adapter; defaults to `[separator: ?,, local?: true]`; for a list of available adapter options see the documentation for the particular adapter 219 | """ 220 | @spec parse(any(), map(), any(), Keyword.t()) :: 221 | {:ok, list({:ok, map()} | {:error, any()} | :error)} 222 | | {:error, list({:ok, map()} | {:error, any()} | :error)} 223 | def parse(source, schema, support_data, opts \\ []) do 224 | with opts <- apply_default_opts(opts), 225 | adapter <- get_adapter(opts), 226 | {:ok, source} <- adapter.parse_source(source, get_adapter_opts(opts)) do 227 | Builder.call(source, schema, support_data, adapter) 228 | end 229 | end 230 | 231 | defp apply_default_opts(opts) do 232 | default_opts() 233 | |> Keyword.merge(Application.get_all_env(:data_quacker)) 234 | |> Keyword.merge(opts) 235 | end 236 | 237 | defp default_opts do 238 | [ 239 | adapter: DataQuacker.Adapters.CSV, 240 | adapter_opts: [separator: ?,, local?: true] 241 | ] 242 | end 243 | 244 | defp get_adapter(opts) do 245 | Keyword.get(opts, :adapter) 246 | end 247 | 248 | defp get_adapter_opts(opts) do 249 | Keyword.get(opts, :adapter_opts, []) 250 | end 251 | end 252 | -------------------------------------------------------------------------------- /lib/data_quacker/adapter.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Adapter do 2 | @moduledoc ~S""" 3 | Specifies the behaviour to which adapters must conform. 4 | 5 | An adapter must implement these functions: `parse_source/2`, `get_headers/1`, `get_rows/1`, `get_row/1`. 6 | 7 | The first one takes a source (e.g. a file path) and a keyword list of options, 8 | and returns a tuple of `{:ok, any()}` or `{:error, any()}`. 
9 | In case of success the second element of the tuple 10 | will be the value given to the other two function. 11 | 12 | The second one takes the result of `parse_source/2` 13 | and returns `{:ok, list(any())} | {:error, any()}`. 14 | In case of success the second element of the tuple 15 | will be the value used to determine the indexes 16 | of sources described in the schema. 17 | 18 | The third one takes the result of `parse_source/2` 19 | and returns `{:ok, list(any())} | {:error, any()}`. 20 | In case of success each subsequent element of the resulting list 21 | will be passed to the get row function. 22 | 23 | The last one takes an element of the list 24 | which is the result of `get_rows/1` 25 | and returns `{:ok, list(any())} | {:error, any()}`. 26 | In case of success the resulting list will be treated 27 | as the list of columns in a row of the source. 28 | 29 | > Note: The resulting list in the `get_row/1` function must be of the same length as the resulting list in the `get_headers/1` function. 30 | 31 | For an example implementation take a look at the built-in adapters. 32 | 33 | > The rationale behind this API for adapters is that, depending on the source, potential errors may occur at different stages of parsing the source. For example the CSV library included in the default CSV adapter returns a tuple with `:ok` or `:error`as the first element for each row. However, some external APIs, like Google Sheets, return a list of rows without specifying for each whether it's valid or not. Therefore we need for it to be possible to specify that for each row, but not required for an adapter to eagerly iterate over all of the rows and wrap them in a tuple with `:ok`. 
34 | """ 35 | 36 | @callback parse_source(any(), Keyword.t()) :: {:ok, any()} | {:error, any()} 37 | @callback get_headers(any()) :: {:ok, list(any())} | {:error, any()} 38 | @callback get_rows(any()) :: {:ok, list(any())} | {:error, any()} 39 | @callback get_row(any()) :: {:ok, list(any())} | {:error, any()} 40 | end 41 | -------------------------------------------------------------------------------- /lib/data_quacker/builder.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Builder do 2 | @moduledoc false 3 | 4 | alias DataQuacker.Context 5 | alias DataQuacker.Matcher 6 | alias DataQuacker.Skipper 7 | alias DataQuacker.Sourcer 8 | alias DataQuacker.Transformer 9 | alias DataQuacker.Validator 10 | 11 | def call( 12 | source, 13 | %{__name__: schema_name, matchers: matchers, rows: schema_rows} = _schema, 14 | support_data, 15 | adapter 16 | ) do 17 | with {:ok, headers} <- adapter.get_headers(source), 18 | {:ok, source_rows} <- adapter.get_rows(source), 19 | context <- 20 | support_data 21 | |> Context.new() 22 | |> Context.update_metadata(:schema, schema_name), 23 | {:ok, column_mappings} <- Matcher.call(headers, matchers, context) do 24 | build_source_rows(source_rows, schema_rows, column_mappings, context, adapter) 25 | end 26 | end 27 | 28 | defp build_source_rows( 29 | _source_rows, 30 | _schema_rows, 31 | _column_mappings, 32 | _context, 33 | _adapter, 34 | _acc \\ [], 35 | _all_ok? \\ true 36 | ) 37 | 38 | defp build_source_rows( 39 | [source_row | rest], 40 | schema_rows, 41 | column_mappings, 42 | context, 43 | adapter, 44 | acc, 45 | all_ok? 46 | ) do 47 | context = Context.increment_row(context) 48 | source_row = adapter.get_row(source_row) 49 | 50 | {result, context} = do_build_source_row(source_row, schema_rows, column_mappings, context) 51 | 52 | build_source_rows( 53 | rest, 54 | schema_rows, 55 | column_mappings, 56 | context, 57 | adapter, 58 | result ++ acc, 59 | all_ok? 
and 60 | Enum.all?(result, fn 61 | {:ok, _res} -> true 62 | _el -> false 63 | end) 64 | ) 65 | end 66 | 67 | defp build_source_rows([], _schema_rows, _column_mappings, _context, _adapter, acc, true), 68 | do: {:ok, acc} 69 | 70 | defp build_source_rows([], _schema_rows, _column_mappings, _context, _adapter, acc, false), 71 | do: {:error, acc} 72 | 73 | defp do_build_source_row({:ok, source_row}, schema_rows, column_mappings, context) do 74 | values = parse_row_values(source_row, column_mappings) 75 | 76 | build_schema_rows(schema_rows, values, context) 77 | end 78 | 79 | defp do_build_source_row(error, _schema_rows, _column_mappings, _context), do: error 80 | 81 | defp build_schema_rows(_schema_rows, _values, _context, acc \\ []) 82 | 83 | defp build_schema_rows([row | rest], values, context, acc) do 84 | case do_build_schema_row(row, values, context) do 85 | :skip -> build_schema_rows(rest, values, context, acc) 86 | {:ok, fields, context} -> build_schema_rows(rest, values, context, [{:ok, fields} | acc]) 87 | error -> build_schema_rows(rest, values, context, [error | acc]) 88 | end 89 | end 90 | 91 | defp build_schema_rows([], _values, context, acc), do: {acc, context} 92 | 93 | defp do_build_schema_row( 94 | %{ 95 | __index__: row_index, 96 | fields: fields, 97 | validators: validators, 98 | transformers: transformers, 99 | skip_if: skip_if 100 | }, 101 | values, 102 | context 103 | ) do 104 | with context <- Context.update_metadata(context, :row, row_index), 105 | {:ok, fields, context} <- fields |> Enum.into([]) |> build_fields(values, context), 106 | {:ok, fields, context} <- Transformer.call(fields, transformers, context), 107 | :ok <- Validator.call(fields, validators, context), 108 | false <- Skipper.call(fields, skip_if, context) do 109 | {:ok, fields, context} 110 | else 111 | true -> :skip 112 | error -> error 113 | end 114 | end 115 | 116 | defp build_fields(_fields, _values, _context, _acc \\ %{}) 117 | 118 | defp build_fields([{field_name, field} | 
fields], values, context, acc) do 119 | case do_build_field(field, values, context) do 120 | :skip -> 121 | build_fields(fields, values, context, acc) 122 | 123 | {:ok, field, context} -> 124 | build_fields(fields, values, context, Map.put(acc, field_name, field)) 125 | 126 | error -> 127 | error 128 | end 129 | end 130 | 131 | defp build_fields([], _values, context, acc), do: {:ok, acc, context} 132 | 133 | defp do_build_field( 134 | %{ 135 | __name__: field_name, 136 | validators: validators, 137 | transformers: transformers, 138 | skip_if: skip_if 139 | } = field, 140 | values, 141 | context 142 | ) do 143 | with context <- Context.update_metadata(context, :field, field_name), 144 | {:ok, value, context} <- do_build_field_value(field, values, context), 145 | {:ok, value, context} <- Transformer.call(value, transformers, context), 146 | :ok <- Validator.call(value, validators, context), 147 | false <- Skipper.call(value, skip_if, context) do 148 | {:ok, value, context} 149 | else 150 | true -> :skip 151 | error -> error 152 | end 153 | end 154 | 155 | defp do_build_field_value(%{__type__: :sourced, source: source}, values, context) do 156 | {:ok, Sourcer.call(source, values, context), context} 157 | end 158 | 159 | defp do_build_field_value(%{__type__: :wrapper, subfields: subfields}, values, context) do 160 | subfields 161 | |> Enum.into([]) 162 | |> build_fields(values, context) 163 | end 164 | 165 | defp parse_row_values(row, column_mappings) do 166 | column_mappings 167 | |> Enum.map(fn {target, index} -> {target, Enum.at(row, index)} end) 168 | |> Enum.into(%{}) 169 | end 170 | end 171 | -------------------------------------------------------------------------------- /lib/data_quacker/context.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Context do 2 | @moduledoc ~S""" 3 | This module provides a struct 4 | to hold contextual data 5 | for CSV parsing 6 | 7 | ## Metadata 8 | 9 | Metadata is a 
tuple of an atom and another atom or a non-negative integer. The first is the type of the entity currently being processed (`:field`, `:row`, etc.). The second is the name or index of the entity (name in case of a field, index in case of row). 10 | 11 | ## Support data 12 | 13 | Support data can be of any Elixir data type. It is the exact value passed as support_data to the `DataQuacker.parse/4` at runtime. 14 | 15 | ## Source row 16 | 17 | Source row is a non-negative integer. The value is the index of the source row currently being processed. 18 | """ 19 | 20 | @type t :: %__MODULE__{ 21 | metadata: {atom(), atom() | non_neg_integer()}, 22 | support_data: any(), 23 | source_row: non_neg_integer() 24 | } 25 | defstruct [:metadata, :support_data, source_row: 0] 26 | 27 | @doc false 28 | def new(support_data), do: %__MODULE__{support_data: support_data} 29 | 30 | @doc false 31 | def update_metadata(context, type, name_or_index) do 32 | %__MODULE__{context | metadata: {type, name_or_index}} 33 | end 34 | 35 | @doc false 36 | def increment_row(%__MODULE__{source_row: source_row} = context) do 37 | %__MODULE__{context | source_row: source_row + 1} 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/data_quacker/matcher.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Matcher do 2 | @moduledoc false 3 | 4 | alias DataQuacker.Schema.WrappedFun 5 | 6 | def call(headers, rules, context), do: compile_rules(rules, headers, context) 7 | 8 | defp compile_rules(_rules, _headers, _context, acc \\ []) 9 | 10 | defp compile_rules( 11 | [%{rule: matching_function, target: target} | rest], 12 | headers, 13 | context, 14 | acc 15 | ) do 16 | case get_header_index(headers, matching_function, context) do 17 | nil -> 18 | {:error, {:header_not_found, target}} 19 | 20 | index -> 21 | compile_rules( 22 | rest, 23 | headers, 24 | context, 25 | [{target, index} | acc] 26 
| ) 27 | end 28 | end 29 | 30 | defp compile_rules([], _headers, _context, acc), do: {:ok, acc} 31 | 32 | defp get_header_index(headers, matching_function, context) do 33 | Enum.find_index(headers, &apply_function(matching_function, &1, context)) 34 | end 35 | 36 | defp apply_function(%WrappedFun{arity: 1, callable: callable}, column, _context) do 37 | callable.(column) 38 | end 39 | 40 | defp apply_function(%WrappedFun{arity: 2, callable: callable}, column, context) do 41 | callable.(column, context) 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/data_quacker/skipper.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Skipper do 2 | @moduledoc false 3 | 4 | alias DataQuacker.Context 5 | 6 | alias DataQuacker.Schema.WrappedFun 7 | 8 | @type skipper_result :: true | false 9 | 10 | @spec call(any(), nil, any()) :: false 11 | def call(_value, nil, _context), do: false 12 | 13 | @spec call(any(), WrappedFun.t(), Context.t()) :: skipper_result() 14 | def call(value, skipping_rule, context) do 15 | case apply_function(skipping_rule, value, context) do 16 | result when is_boolean(result) -> 17 | result 18 | 19 | el -> 20 | raise """ 21 | 22 | Skipper in #{elem(context.metadata, 0)} #{elem(context.metadata, 1)} 23 | returned an incorrect value #{inspect(el)}. 
24 | 25 | Skippers can only have returns of type: 26 | `true | false` 27 | """ 28 | end 29 | end 30 | 31 | @spec apply_function(any(), WrappedFun.t(1), Context.t()) :: any() 32 | defp apply_function(%WrappedFun{arity: 1, callable: callable}, value, _context) do 33 | callable.(value) 34 | end 35 | 36 | @spec apply_function(any(), WrappedFun.t(2), Context.t()) :: any() 37 | defp apply_function(%WrappedFun{arity: 2, callable: callable}, value, context) do 38 | callable.(value, context) 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /lib/data_quacker/sourcer.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Sourcer do 2 | @moduledoc false 3 | 4 | alias DataQuacker.Schema.WrappedFun 5 | 6 | def call(%WrappedFun{} = getter_function, _values, context) do 7 | apply_function(getter_function, context) 8 | end 9 | 10 | def call(target, values, _context) do 11 | get_value(target, values) 12 | end 13 | 14 | defp apply_function(%WrappedFun{arity: 0, callable: callable}, _context) do 15 | callable.() 16 | end 17 | 18 | defp apply_function(%WrappedFun{arity: 1, callable: callable}, context) do 19 | callable.(context) 20 | end 21 | 22 | defp get_value(target, values) do 23 | Map.get(values, target) 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/data_quacker/transformer.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Transformer do 2 | @moduledoc false 3 | 4 | alias DataQuacker.Context 5 | 6 | alias DataQuacker.Schema.WrappedFun 7 | 8 | @type transformation_result :: {:ok, any()} | {:ok, any(), any()} | {:error, any()} | :error 9 | 10 | @spec call(any(), nonempty_list(WrappedFun.t()), Context.t()) :: transformation_result() 11 | def call(value, [transformer | rest], context) do 12 | case apply_transformer(value, transformer, context) do 13 | 
{:ok, value} -> 14 | call(value, rest, context) 15 | 16 | {:ok, value, support_data} -> 17 | call(value, rest, %{context | support_data: support_data}) 18 | 19 | {:error, _details} = error -> 20 | error 21 | 22 | :error -> 23 | :error 24 | 25 | el -> 26 | raise """ 27 | 28 | Transformer in #{elem(context.metadata, 0)} #{elem(context.metadata, 1)} 29 | returned an incorrect value #{inspect(el)}. 30 | 31 | Transformers can only have returns of type: 32 | `{:ok, any()} | {:ok, any(), any()} | {:error, any()} | :error` 33 | """ 34 | end 35 | end 36 | 37 | @spec call(any(), [], Context.t()) :: {:ok, any(), Context.t()} 38 | def call(value, [], context), do: {:ok, value, context} 39 | 40 | @spec apply_transformer(any(), WrappedFun.t(1), Context.t()) :: any() 41 | defp apply_transformer(value, %WrappedFun{callable: callable, arity: 1}, _context) do 42 | callable.(value) 43 | end 44 | 45 | @spec apply_transformer(any(), WrappedFun.t(2), Context.t()) :: any() 46 | defp apply_transformer(value, %WrappedFun{callable: callable, arity: 2}, context) do 47 | callable.(value, context) 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/data_quacker/validator.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Validator do 2 | @moduledoc false 3 | 4 | alias DataQuacker.Context 5 | 6 | alias DataQuacker.Schema.WrappedFun 7 | 8 | @type validation_result :: :ok | :error | {:error, any()} 9 | 10 | @spec call(any(), nonempty_list(WrappedFun.t()), Context.t()) :: validation_result() 11 | def call(value, [validator | rest], context) do 12 | case apply_validation(value, validator, context) do 13 | :ok -> 14 | call(value, rest, context) 15 | 16 | true -> 17 | call(value, rest, context) 18 | 19 | false -> 20 | :error 21 | 22 | {:error, _details} = error -> 23 | error 24 | 25 | :error -> 26 | :error 27 | 28 | el -> 29 | raise """ 30 | 31 | Validator in #{elem(context.metadata, 
0)} #{elem(context.metadata, 1)} 32 | returned an incorrect value #{inspect(el)}. 33 | 34 | Validators can only have returns of type: 35 | `:ok | :error | {:error, any()} | true | false` 36 | """ 37 | end 38 | end 39 | 40 | @spec call(any(), [], Context.t()) :: :ok 41 | def call(_value, [], _context), do: :ok 42 | 43 | @spec apply_validation(any(), WrappedFun.t(1), Context.t()) :: any() 44 | defp apply_validation(value, %WrappedFun{callable: callable, arity: 1}, _context) do 45 | callable.(value) 46 | end 47 | 48 | @spec apply_validation(any(), WrappedFun.t(2), Context.t()) :: any() 49 | defp apply_validation(value, %WrappedFun{callable: callable, arity: 2}, context) do 50 | callable.(value, context) 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /lib/schema/error.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.SchemaError do 2 | defexception [:message] 3 | end 4 | -------------------------------------------------------------------------------- /lib/schema/helpers/fun_wrapper.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Schema.FunWrapper do 2 | @moduledoc false 3 | 4 | alias DataQuacker.Schema.WrappedFun 5 | alias DataQuacker.SchemaError 6 | 7 | defmacro wrap_fun(fun, expected_arity \\ nil) do 8 | arity = fun_arity(fun) 9 | args = fun_args(arity) 10 | name = random_name() 11 | 12 | maybe_assert_arity!(arity, expected_arity) 13 | 14 | quote do 15 | def unquote(name)(unquote_splicing(args)) do 16 | unquote(fun).(unquote_splicing(args)) 17 | end 18 | 19 | %WrappedFun{callable: &(__MODULE__.unquote(name) / unquote(arity)), arity: unquote(arity)} 20 | end 21 | end 22 | 23 | defp fun_arity(quoted_fun) do 24 | with {fun, _} <- Code.eval_quoted(quoted_fun), 25 | fun_info <- :erlang.fun_info(fun), 26 | arity when not is_nil(arity) <- Keyword.get(fun_info, :arity) do 27 | arity 28 | else 29 | 
_error -> raise SchemaError, "Invalid function given" 30 | end 31 | end 32 | 33 | defp fun_args(0), do: [] 34 | 35 | defp fun_args(arity) do 36 | Enum.map(1..arity, fn i -> 37 | # credo:disable-for-next-line Credo.Check.Warning.UnsafeToAtom 38 | arg_name = String.to_atom("arg#{i}") 39 | 40 | # AST for a variable 41 | {arg_name, [], __MODULE__} 42 | end) 43 | end 44 | 45 | defp maybe_assert_arity!(arity, expected_arity) do 46 | {unquoted_expected_arity, _} = Code.eval_quoted(expected_arity) 47 | 48 | case unquoted_expected_arity do 49 | nil -> 50 | :ok 51 | 52 | %Range{first: first, last: last} when arity >= first and arity <= last -> 53 | :ok 54 | 55 | i when is_integer(i) and arity == i -> 56 | :ok 57 | 58 | el -> 59 | raise SchemaError, """ 60 | 61 | A function of unexpected arity #{arity} given. 62 | Should be #{inspect(el)} 63 | """ 64 | end 65 | end 66 | 67 | defp random_name() do 68 | 64 69 | |> :crypto.strong_rand_bytes() 70 | |> Base.url_encode64() 71 | |> binary_part(0, 64) 72 | |> String.to_atom() 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /lib/schema/helpers/wrapped_fun.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Schema.WrappedFun do 2 | @moduledoc false 3 | 4 | @type t :: %__MODULE__{ 5 | callable: (... -> any()), 6 | arity: non_neg_integer() 7 | } 8 | 9 | @type t(arity) :: %__MODULE__{ 10 | callable: (... -> any()), 11 | arity: arity 12 | } 13 | defstruct [:callable, :arity] 14 | end 15 | -------------------------------------------------------------------------------- /lib/schema/schema.ex: -------------------------------------------------------------------------------- 1 | # credo:disable-for-this-file Credo.Check.Refactor.AppendSingleItem 2 | defmodule DataQuacker.Schema do 3 | @moduledoc ~S""" 4 | Defines macros for creating data schemas 5 | which represents a mapping from the source to the desired output. 
6 | 7 | > Note: To use the macros you have to put `use DataQuacker.Schema` in the desired module. 8 | 9 | A schema can be defined to represent the structure of an arbitrarily nested map or list of maps. 10 | This is done with the `schema/2`, `row/2` and `field/3` macros. 11 | Additionally, there are two special macros: `validate/1` and `transform/1`. 12 | Lastly, the `source/1` and `virtual_source/1` macros are used 13 | to define the data which should be inserted in a particular field. 14 | These allow for validation and transformation to be performed 15 | on a specific subset of the output data. 16 | 17 | > Note: the `row/2` and `field/3` macros represent the *output* structure, 18 | while the `source/1` and `virtual_source/1` macros reference the input data. 19 | Since both the input and the output can be said to have rows, 20 | the term "source row" is used in the documentation to denote a row in the input data. 21 | The term "row" is used to denote a row in the output. 22 | 23 | All of the structure-defining macros take a block as their last argument 24 | which can be thought of as their "body". The `schema/2` and `field/2` macros 25 | also take a name as their first argument, and `row/2` and `field/3` 26 | take a keyword list of options as their first and second argument respectively. 27 | 28 | More information can be found in the documentation for the specific macros. 29 | 30 | ## Examples 31 | 32 | > Note: A fully working implementation of these examples can be found in the tests inside the "examples" directory. 
33 | 34 | Suppose we have a table of students in the form of a CSV file, which looks like this: 35 | 36 | | First name | Last name | Age | Favourite subject | 37 | |:----------:|:---------:|:---:|:-----------------:| 38 | | John | Smith | 19 | Maths | 39 | | Adam | Johnson | 18 | Physics | 40 | | Quackers | the Duck | 1 | Programming | 41 | 42 | Also suppose our desired output is a list of tuples with maps with the following structure: 43 | 44 | ```elixir 45 | {:ok, %{ 46 | first_name: "...", 47 | last_name: "...", 48 | age: "...", 49 | favourite_subject: "..." 50 | }} 51 | ``` 52 | 53 | The mapping from the table to the list of maps can be represented as follows: 54 | 55 | ```elixir 56 | defmodule StudentsSchema do 57 | use DataQuacker.Schema 58 | 59 | schema :students do 60 | field :first_name do 61 | source("first name") 62 | end 63 | 64 | field :last_name do 65 | source("last name") 66 | end 67 | 68 | field :age do 69 | source("age") 70 | end 71 | 72 | field :favourite_subject do 73 | source("favourite subject") 74 | end 75 | end 76 | end 77 | ``` 78 | 79 | This looks great (I hope!), but realistically we would like age to be an Integer, 80 | and favourite subject to be somehow validated. 
This can be achieved by modifying the previous schema, like this: 81 | 82 | ```elixir 83 | defmodule StudentsSchema do 84 | use DataQuacker.Schema 85 | 86 | schema :students do 87 | field :first_name do 88 | source("first name") 89 | end 90 | 91 | field :last_name do 92 | source("last name") 93 | end 94 | 95 | field :age do 96 | transform(fn age -> 97 | case Integer.parse(age) do 98 | {age_int, _} -> {:ok, age_int} 99 | :error -> {:error, "Invalid value #{age} given"} 100 | end 101 | end) 102 | 103 | source("age") 104 | end 105 | 106 | field :favourite_subject do 107 | validate(fn subj -> subj in ["Maths", "Physics", "Programming"] end) 108 | 109 | source("favourite subject") 110 | end 111 | end 112 | end 113 | ``` 114 | 115 | Now our result will be a list of maps, like: 116 | ```elixir 117 | [ 118 | # ... 119 | {:ok, %{ 120 | age: 123, 121 | # ... 122 | }} 123 | # ... 124 | ] 125 | ``` 126 | 127 | > Note: To see how to use such schema to parse a CSV file, please see the example in the documentation for the `DataQuacker` module. 128 | 129 | However if, for example, an invalid age is given, 130 | the entire row where the error occurred will result in the following tuple: 131 | `{:error, "Invalid value blabla given"}` 132 | 133 | Great, but what if we have the "First name" and "Last name" columns in our CSV files, 134 | but only a `:full_name` field in our database? No problem, Fields can be arbitrarily nested. 135 | 136 | It's just a small tweak: 137 | 138 | ```elixir 139 | defmodule StudentsSchema do 140 | use DataQuacker.Schema 141 | 142 | schema :students do 143 | field :full_name do 144 | transform(fn %{first_name: first_name, last_name: last_name} -> 145 | {:ok, "#{first_name} #{last_name}"} 146 | end) 147 | 148 | field :first_name do 149 | source("first name") 150 | end 151 | 152 | field :last_name do 153 | source("last name") 154 | end 155 | end 156 | 157 | # ... 
158 | end 159 | end 160 | ``` 161 | 162 | Now our output is: 163 | 164 | ```elixir 165 | {:ok, [ 166 | #... 167 | {:ok, %{ 168 | full_name: "John Smith", 169 | # ... 170 | }} 171 | #... 172 | ]} 173 | ``` 174 | 175 | To illustrate some more functionality, let's take a look at another example. 176 | We will start with a very simple CSV source file 177 | which will gradually become more and more complex, 178 | and so will our rules for parsing it. 179 | 180 | | Apartment/flat size (in m^2) | Price per 1 month | 181 | |:----------------------------:|:-----------------:| 182 | | 40 | 1000 | 183 | | 50 | 1100 | 184 | 185 | ```elixir 186 | defmodule PricingSchema do 187 | use DataQuacker.Schema 188 | 189 | schema :pricing do 190 | field :size do 191 | transform(fn size -> 192 | case Integer.parse(size) do 193 | {size_int, _} -> {:ok, size_int} 194 | :error -> {:error, "Invalid value #{size} given"} 195 | end 196 | end) 197 | 198 | source("Apartment/flat size (in m^2)") 199 | end 200 | 201 | field :price do 202 | transform(fn price -> 203 | case Integer.parse(price) do 204 | {price_int, _} -> {:ok, price_int} 205 | :error -> {:error, "Invalid value #{price} given"} 206 | end 207 | end) 208 | 209 | source("Price per 1 month") 210 | end 211 | end 212 | end 213 | ``` 214 | 215 | The above results in: 216 | ```elixir 217 | [ 218 | {:ok, %{size: 50, price: 1100}}, 219 | {:ok, %{size: 40, price: 1000}} 220 | ] 221 | ``` 222 | 223 | > Note: The rows in the result are in the reverse order compared to the source rows. This is because for large lists reversing may be an expensive operation, which is often redundant, for example if the result is supposed to be inserted in a database. 224 | 225 | This schema could work, but there are some problems with it. 226 | 227 | It's not fun to copy&paste the function for parsing string to int 228 | over and over again. That's why we'll create a regular function 229 | and pass a reference to it in both places. 
230 | 231 | ```elixir 232 | defmodule PricingSchema do 233 | use DataQuacker.Schema 234 | 235 | schema :pricing do 236 | field :size do 237 | transform(&PricingSchema.parse_int/1) 238 | # ... 239 | end 240 | 241 | field :price do 242 | transform(&PricingSchema.parse_int/1) 243 | # ... 244 | end 245 | end 246 | 247 | def parse_int(str) do 248 | case Integer.parse(str) do 249 | {int, _} -> {:ok, int} 250 | :error -> {:error, "Invalid value #{str} given"} 251 | end 252 | end 253 | end 254 | ``` 255 | 256 | > Note: the reference to the function must be written out in full (including the module name), 257 | because it will be executed in a different context. 258 | 259 | This is better, but still not ideal for two reasons. 260 | First of all, we source our data based on simple string matching. While this will still work 261 | if the casing in the headers changes, it will not if "Price per 1 month" changes to "Price *for* 1 month", 262 | or "Apartment/flat size (in m^2)" to "Apartment *or* flat size (in m^2)". 263 | Since most likely we do not have control over the source, these can change unexpectedly. 264 | Second of all, our error messages are quite vague since they do not specify the offending source row and field. 265 | 266 | To tackle the first one we can change our `source/1` macros to be either strings, regexes, 267 | lists of strings or custom functions. The details of each approach are specified 268 | in the docs for the `source/1` macro, but for now we will just use a list of strings. 269 | 270 | `source("Apartment/flat size (in m^2)")` -> `source(["apartment", "size"])` 271 | 272 | `source("Price per 1 month")` -> `source(["price", "1"])` 273 | 274 | The above mean "match a header which contains apartment and size" 275 | and "match a header which contains price and 1". 276 | 277 | > Note: The order of the headers is inconsequential. 278 | 279 | As for the second issue, transform can actually be given a one- or two-argument function. 
280 | If it is given a one-argument function, the argument at execution will be the value of the field 281 | or row. If it is given a two-argument function, the second argument will be a `%Context{}` struct. 282 | Which contains the following fields: `:metadata`, `:support_data`, `:source_row`. 283 | Support data is an arbitrary value of any type that can be passed in at parse time. 284 | It can be used to, for example, validate something against a database without having to fetch the data 285 | for each row. More on that in the documentation of the `DataQuacker` module. For now, however, we only need `metadata` and `source_row`. The first one is a tuple 286 | of an atom and an atom or a tuple, where the first element is the type (`:field` or `:row`) 287 | and the second one is the name or index in the case of a row. 288 | The second one is just the index of the source row which is being processed. 289 | 290 | > Note: the term "source row" is used here to denote a row in the input file. The term row 291 | is used to denote a row of output. 292 | 293 | We can therefore change our `parse_int/1` function into 294 | 295 | ```elixir 296 | def parse_int(str, %{metadata: metadata, source_row: source_row}) do 297 | case Integer.parse(str) do 298 | {int, _} -> {:ok, int} 299 | :error -> {:error, "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 300 | end 301 | end 302 | ``` 303 | 304 | An example error will look like this: `{:error, "Error processing field price in row 2; 'oops' given"}` 305 | 306 | The next case we will be dealing with here is again a "small change" to the source file. 307 | 308 | | Apartment/flat size (in m^2) | Price per 1 month | Price per 3 months | 309 | |:----------------------------:|:-----------------:|--------------------| 310 | | 40 | 1000 | 2800 | 311 | | 50 | 1100 | 3000 | 312 | | 60 | | 3600 | 313 | 314 | Now each source row contains two different prices for different lease period. 
315 | Additionally, for the bigger apartments there may only be an option 316 | to rent for three months. 317 | 318 | We could create a schema to parse the data int rows like: 319 | `%{size: 40, price_1: 1000, price_3: 2800}`, 320 | but this is not ideal since we would have to deal with `nil` at `:price_1`, 321 | and we probably want separate rows in the database for each lease duration, 322 | as this will allow us to easily pull out the price for a specific size and lease duration 323 | using SQL indexes. 324 | 325 | A better structure therefore would look like this 326 | ```elixir 327 | [ 328 | # ... 329 | {:ok, %{size: 40, duration: 3, price: 2800}}, 330 | {:ok, %{size: 40, duration: 1, price: 1000}} 331 | ] 332 | ``` 333 | 334 | This is where the `row/2` macro comes in. It allows us to specify any number of output rows 335 | for a single input row. Previously we did not use this macro at all, 336 | since the lack of it implies there is exactly one output row per input row. 337 | 338 | This is our new schema: 339 | 340 | ```elixir 341 | defmodule PricingSchema do 342 | use DataQuacker.Schema 343 | 344 | schema :pricing do 345 | row skip_if: (fn %{price: price} -> is_nil(price) end) do 346 | field :size do 347 | transform(&PricingSchema.parse_int/2) 348 | 349 | source(["apartment", "size"]) 350 | end 351 | 352 | field :duration do 353 | virtual_source(1) 354 | end 355 | 356 | field :price do 357 | transform(&PricingSchema.parse_int/2) 358 | 359 | source(["price", "1"]) 360 | end 361 | end 362 | 363 | row do 364 | field :size do 365 | transform(&PricingSchema.parse_int/2) 366 | 367 | source(["apartment", "size"]) 368 | end 369 | 370 | field :duration do 371 | virtual_source(3) 372 | end 373 | 374 | field :price do 375 | transform(&PricingSchema.parse_int/2) 376 | 377 | source(["price", "3"]) 378 | end 379 | end 380 | end 381 | 382 | def parse_int("", _), do: {:ok, nil} 383 | 384 | def parse_int(str, %{metadata: metadata, source_row: source_row}) do 385 | case 
Integer.parse(str) do 386 | {int, _} -> {:ok, int} 387 | :error -> {:error, "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 388 | end 389 | end 390 | end 391 | ``` 392 | 393 | There are a few new interesting things going on here. 394 | 395 | Firstly, as we can see, any column in the source can be inserted multiple times 396 | within the schema. This is particularly useful if for a single input row 397 | we want to have multiple output rows which share some of the fields. 398 | 399 | Secondly, we added a new field `:duration` which instead of being sourced from the input data 400 | is just a static value. We achieved it with the `virtual_source/1` macro 401 | which either takes a value or a function returning a value to be injected into the field. 402 | This is useful for us to be able to make the output structure as close to our database model as we can. 403 | 404 | > Note: There is a special case in the `parse_int/2` function to return nil on empty input, 405 | because `Integer.parse/2` will return an error given an empty string. 406 | 407 | Lastly, we added a special option to the first output row, called `skip_if`. 408 | The function we provided will be evaluated for each output row representing a one-month lease price, 409 | and if it returns `true` the row will not appear in the actual result. 410 | 411 | Using our latest schema and the CSV presented above, we get this result: 412 | ```elixir 413 | {:ok, [ 414 | {:ok, %{duration: 3, price: 3600, size: 60}}, 415 | {:ok, %{duration: 3, price: 3000, size: 50}}, 416 | {:ok, %{duration: 1, price: 1100, size: 50}}, 417 | {:ok, %{duration: 3, price: 2800, size: 40}}, 418 | {:ok, %{duration: 1, price: 1000, size: 40}} 419 | ]} 420 | ``` 421 | 422 | The last case is about multiple transformations on the same field. 423 | 424 | Our source file has changed again, so that it includes some non-integer prices. 
425 | We could just switch our usage of `Integer.parse/2` to `Decimal.parse/1`, 426 | but there is a catch: `Decimal.parse/1` expects `.` (dot) to be the decimal separator, 427 | and our source uses `,` (comma). 428 | For this reason we will need to first replace the commas with periods, and then convert. 429 | 430 | As the transformer we provide for the `:price` field is an arbitrary Elixir function, 431 | we could do both of those operations at once. 432 | That would work, but for schemas which have very complex transformation and validation rules, 433 | the function could get bloated quickly. 434 | 435 | The goal of this library is to avoid that complexity, and allow for easy understanding 436 | of the custom rules. This is why it's recommended to split the transformers into multiple functions. 437 | 438 | Let's create two functions: `parse_decimal/2` and `replace_commas/1`. 439 | 440 | > Note: To follow this example you will have to install the `Decimal` library, which you can find at [hex.pm/packages/decimal](https://hex.pm/packages/decimal). 441 | 442 | ```elixir 443 | def replace_commas(str) do 444 | {:ok, String.replace(str, ",", ".")} 445 | end 446 | 447 | def parse_decimal("", _), do: {:ok, nil} 448 | 449 | def parse_decimal(str, %{metadata: metadata, source_row: source_row}) do 450 | case Decimal.parse(str) do 451 | {decimal, ""} -> {:ok, decimal} 452 | :error -> {:error, "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 453 | end 454 | end 455 | ``` 456 | 457 | We can now change our `:price` fields to use these functions: 458 | 459 | ```elixir 460 | # ... 461 | 462 | field :price do 463 | transform(&PricingSchema.replace_commas/1) 464 | transform(&PricingSchema.parse_decimal/2) 465 | 466 | source(["price", "1"]) 467 | end 468 | 469 | # ... 
470 | 471 | field :price do 472 | transform(&PricingSchema.replace_commas/1) 473 | transform(&PricingSchema.parse_decimal/2) 474 | 475 | source(["price", "3"]) 476 | end 477 | 478 | # ... 479 | ``` 480 | 481 | > Note: Different transformers for the same field or row may take different numbers of arguments, depending on whether the context is needed in the particular function. 482 | 483 | The final schema should look like this: 484 | 485 | ```elixir 486 | defmodule PricingSchema do 487 | use DataQuacker.Schema 488 | 489 | schema :pricing do 490 | row skip_if: (fn %{price: price} -> is_nil(price) end) do 491 | field :size do 492 | transform(&PricingSchema.parse_int/2) 493 | 494 | source(["apartment", "size"]) 495 | end 496 | 497 | field :duration do 498 | virtual_source(1) 499 | end 500 | 501 | field :price do 502 | transform(&PricingSchema.replace_commas/1) 503 | transform(&PricingSchema.parse_decimal/2) 504 | 505 | source(["price", "1"]) 506 | end 507 | end 508 | 509 | row do 510 | field :size do 511 | transform(&PricingSchema.parse_int/2) 512 | 513 | source(["apartment", "size"]) 514 | end 515 | 516 | field :duration do 517 | virtual_source(3) 518 | end 519 | 520 | field :price do 521 | transform(&PricingSchema.replace_commas/1) 522 | transform(&PricingSchema.parse_decimal/2) 523 | 524 | source(["price", "3"]) 525 | end 526 | end 527 | end 528 | 529 | def parse_int("", _), do: {:ok, nil} 530 | 531 | def parse_int(str, %{metadata: metadata, source_row: source_row}) do 532 | case Integer.parse(str) do 533 | {int, _} -> {:ok, int} 534 | :error -> {:error, "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 535 | end 536 | end 537 | 538 | def replace_commas(str) do 539 | {:ok, String.replace(str, ",", ".")} 540 | end 541 | 542 | def parse_decimal("", _), do: {:ok, nil} 543 | 544 | def parse_decimal(str, %{metadata: metadata, source_row: source_row}) do 545 | case Decimal.parse(str) do 546 | {decimal, ""} -> {:ok, decimal} 
547 | :error -> {:error, "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 548 | end 549 | end 550 | end 551 | ``` 552 | """ 553 | 554 | alias DataQuacker.Schema.State 555 | 556 | alias DataQuacker.SchemaError 557 | 558 | import DataQuacker.Schema.FunWrapper 559 | 560 | @doc false 561 | defmacro __using__(_opts) do 562 | quote do 563 | import unquote(__MODULE__) 564 | 565 | @state State.new() 566 | @schema_names [] 567 | end 568 | end 569 | 570 | @doc ~S""" 571 | Defines a schema and a `schema_structure/1` function 572 | which takes the schema name as the argument 573 | and returns the schema in a form that can be passed to a parser. 574 | 575 | Multiple schemas can be defined in a single module. 576 | 577 | The result structure is a map with the following types: 578 | ```elixir 579 | %{ 580 | __name__: atom(), 581 | rows: list(), 582 | matchers: list() 583 | } 584 | ``` 585 | 586 | The macro takes in a name and a block with which the rows, fields, etc. can be defined. 587 | The block must contain at least one row. Note, however, that if no row is explicitly specified, 588 | but at least one field is, the schema is assumed to have exactly one row which contains all of the fields. 589 | 590 | > Note: if one or many fields are present directly inside the schema, the row macro cannot be used explicitly. 591 | The same is true the other way around - if at least one row is specified explicitly, 592 | fields can only appear inside rows, not directly in the schema. 593 | 594 | Unlike `row/2` and `field/3`, the `schema/2` macro cannot have validators or transformers. 595 | If there is only one row, but it needs to define validators or transformers, 596 | the schema must define this row explicitly. 597 | """ 598 | defmacro schema(name, do: block) when is_atom(name) do 599 | quote do 600 | if unquote(name) in @schema_names do 601 | raise SchemaError, """ 602 | 603 | Invalid schema name. 
604 | There already exists a schema #{inspect(unquote(name))} 605 | on this module. 606 | """ 607 | end 608 | 609 | if not Enum.empty?(@state.cursor) do 610 | raise SchemaError, """ 611 | 612 | Invalid schema position. 613 | Schema can only appear as a top-level module macro 614 | (cannot be nested in other schemas). 615 | """ 616 | end 617 | 618 | @state State.new() 619 | @state State.register(@state, :schema, {unquote(name), %{}}) 620 | 621 | unquote(block) 622 | 623 | if Enum.empty?(@state.rows) do 624 | raise SchemaError, """ 625 | 626 | Invalid schema usage. 627 | Schema must have at least 628 | one row or one field. 629 | """ 630 | end 631 | 632 | if State.flagged?(@state, :has_loose_fields?) do 633 | @state State.update(@state, :row, %{fields: @state.fields}) 634 | 635 | @state State.cursor_exit(@state) 636 | end 637 | 638 | @state State.cursor_exit(@state) 639 | 640 | @state State.update(@state, :schema, %{ 641 | matchers: @state.matchers, 642 | rows: @state.rows 643 | }) 644 | 645 | def schema_structure(unquote(name)) do 646 | @state.schema 647 | end 648 | end 649 | end 650 | 651 | defmacro schema(name, do: _block) do 652 | quote do 653 | raise SchemaError, """ 654 | 655 | Invalid schema name. 656 | Must be an atom, #{inspect(unquote(name))} given. 657 | """ 658 | end 659 | end 660 | 661 | @doc ~S""" 662 | Defines an output row. 663 | Can only be used directly inside a schema, and only if the schema has no fields 664 | directly inside it. 665 | 666 | This macro takes in a keyword list of options, and a block within which the fields, 667 | validators and transformers can be specified. 668 | 669 | ## Options 670 | * `:skip_if` - a function of arity 1 or 2, which returns `true` or `false` given the value of the row and optionally the context; `true` means the row should be skipped from the output, `false` is a "noop" 671 | 672 | > Note: The order of execution is always: transformers, then validators, then "skip_if". 
673 | """ 674 | defmacro row(opts \\ [], do: block) do 675 | quote do 676 | if State.flagged?(@state, :has_loose_fields?) do 677 | raise SchemaError, """ 678 | 679 | Invalid row usage. 680 | Rows cannot appear in a schema 681 | if the schema has loose fields 682 | (fields appearing outside of any row). 683 | """ 684 | end 685 | 686 | if not State.cursor_at?(@state, :schema) do 687 | raise SchemaError, """ 688 | 689 | Invalid row position. 690 | Rows can only appear directly 691 | inside a schema. 692 | """ 693 | end 694 | 695 | @state State.clear_fields(@state) 696 | @state State.register( 697 | @state, 698 | :row, 699 | {length(@state.rows), %{skip_if: skip_if_opt(unquote(opts))}} 700 | ) 701 | 702 | unquote(block) 703 | 704 | @state State.update(@state, :row, %{fields: @state.fields}) 705 | 706 | if @state.fields == %{} do 707 | raise SchemaError, """ 708 | 709 | Invalid row usage. 710 | Rows must have at least one subfield. 711 | """ 712 | end 713 | 714 | @state State.cursor_exit(@state) 715 | end 716 | end 717 | 718 | @doc ~S""" 719 | Defines an output field. 720 | Can be used inside a schema, a row or another field. 721 | Can only be used directly inside a schema if the schema has no explicitly defined rows. 722 | Can only be used inside another field if that field has no source. 723 | 724 | This macro takes in a name, a keyword list of options, and a block within which the subfields or source, 725 | and validators and transformers can be specified. 726 | Can either specify exactly one source (virtual or regular) or subfields. 
727 | 728 | ## Options 729 | * `:skip_if` - a function of arity 1 or 2, which returns `true` or `false` given the value of the field and optionally the context; `true` means the field should be skipped from the output, `false` is a "noop" 730 | 731 | > Note: The order of execution is always: transformers, then validators, then "skip_if" 732 | """ 733 | defmacro field(_name, _opts \\ [], _) 734 | 735 | defmacro field(name, opts, do: block) when is_atom(name) do 736 | quote do 737 | if State.cursor_at?(@state, nil) do 738 | raise SchemaError, """ 739 | 740 | Invalid field position. 741 | Fields can only appear inside a schema, 742 | rows or other fields. 743 | """ 744 | end 745 | 746 | if State.cursor_at?(@state, :schema) and not Enum.empty?(@state.rows) do 747 | raise SchemaError, """ 748 | 749 | Invalid field usage. 750 | Fields cannot appear directly inside a schema 751 | if the schema explicitly declares rows. 752 | """ 753 | end 754 | 755 | if State.cursor_at?(@state, :schema) do 756 | @state State.flag(@state, :has_loose_fields?, true) 757 | @state State.register(@state, :row, {length(@state.rows), %{}}) 758 | end 759 | 760 | if State.cursor_at?(@state, :field) and State.get(@state, :field).__type__ == :sourced do 761 | raise SchemaError, """ 762 | 763 | Invalid field usage. 764 | A field can either have subfields or a source, 765 | but not both. 766 | """ 767 | end 768 | 769 | if State.cursor_at?(@state, :row) and Map.has_key?(@state.fields, unquote(name)) do 770 | raise SchemaError, """ 771 | 772 | Invalid field name. 773 | There already exists a field of the same name 774 | in this row. 775 | """ 776 | end 777 | 778 | if State.cursor_at?(@state, :field) and 779 | Map.has_key?(State.get(@state, :field).subfields, unquote(name)) do 780 | raise SchemaError, """ 781 | 782 | Invalid field name. 783 | There already exists a subfield of the same name 784 | in this field. 
785 | """ 786 | end 787 | 788 | if State.cursor_at?(@state, :field) do 789 | @state State.update(@state, :field, %{__type__: :wrapper}) 790 | end 791 | 792 | @state State.register( 793 | @state, 794 | :field, 795 | {unquote(name), %{skip_if: skip_if_opt(unquote(opts))}} 796 | ) 797 | 798 | unquote(block) 799 | 800 | if is_nil(State.get(@state, :field).__type__) do 801 | raise SchemaError, """ 802 | 803 | Invalid field usage. 804 | Fields must either have a source 805 | or at least one subfield. 806 | """ 807 | end 808 | 809 | @state State.cursor_exit(@state) 810 | end 811 | end 812 | 813 | defmacro field(name, _opts, do: _block) do 814 | quote do 815 | raise SchemaError, """ 816 | 817 | Invalid field name. 818 | Must be an atom, #{inspect(unquote(name))} given. 819 | """ 820 | end 821 | end 822 | 823 | @doc ~S""" 824 | Defines a source mapping from the input. 825 | Can only be used inside a field, and only if that field does not define any subfields 826 | or any other source. 827 | 828 | This macro takes in either a "needle" which can be string, a regex, a list of strings, 829 | or a function of arity 1 or 2. 
830 | 831 | ## Needle 832 | * when is a string - the downcased header name for a particular column must contain the downcased string given as the needle for the column to match 833 | * when is a regex - the header name for a particular column must match the needle for the column to match 834 | * when is a list of strings - the downcase header name for a particular column must contain all of the downcased elements given as the needle for the column to match 835 | * when is a function - given the header name for a particular column, and optionally the context, must return `true` for the column to match; the function must always return `true` or `false` 836 | """ 837 | defmacro source(needle) do 838 | {unquoted_needle, _} = Code.eval_quoted(needle) 839 | 840 | case unquoted_needle do 841 | string when is_binary(string) -> 842 | quote do 843 | source(fn column_name -> 844 | String.contains?(String.downcase(column_name), unquote(String.downcase(needle))) 845 | end) 846 | end 847 | 848 | list when is_list(list) -> 849 | quote do 850 | source(fn column_name -> 851 | column_name = String.downcase(column_name) 852 | 853 | Enum.all?( 854 | unquote(Enum.map(needle, &String.downcase(&1))), 855 | &String.contains?(column_name, &1) 856 | ) 857 | end) 858 | end 859 | 860 | %Regex{} -> 861 | quote do 862 | source(fn column_name -> 863 | Regex.match?(unquote(needle), column_name) 864 | end) 865 | end 866 | 867 | fun when is_function(fun) -> 868 | quote do 869 | if not State.cursor_at?(@state, :field) do 870 | raise SchemaError, """ 871 | 872 | Invalid source position. 873 | Sources can only appear inside fields. 874 | """ 875 | end 876 | 877 | if State.get(@state, :field).__type__ == :sourced do 878 | raise SchemaError, """ 879 | 880 | Invalid source usage. 881 | Only one source per field is allowed. 882 | """ 883 | end 884 | 885 | if State.get(@state, :field).__type__ == :wrapper do 886 | raise SchemaError, """ 887 | 888 | Invalid source usage. 
889 | A field can either have subfields or a source, 890 | but not both. 891 | """ 892 | end 893 | 894 | @state State.register(@state, :matcher, wrap_fun(unquote(needle), 1..2)) 895 | @state State.update(@state, :field, %{__type__: :sourced, source: State.target(@state)}) 896 | end 897 | 898 | _el -> 899 | quote do 900 | raise SchemaError, """ 901 | 902 | Invalid column source type. 903 | Must be a string, a regex expression or a function 904 | which can be used to match a column name. 905 | """ 906 | end 907 | end 908 | end 909 | 910 | @doc ~S""" 911 | Defines a value to be injected to a particular field. 912 | Can only be used inside a field, and only if that field does not define any subfields 913 | or any other source. 914 | 915 | This macro takes in either a literal value, or a function of arity 0 or 1. 916 | 917 | ## Value 918 | * when is a function - optionally given the context, can return any value to be injected inside the field 919 | * else - the value is injected inside the field "as is" 920 | """ 921 | defmacro virtual_source(value) do 922 | {unquoted_value, _} = Code.eval_quoted(value) 923 | 924 | case unquoted_value do 925 | fun when is_function(fun) -> 926 | quote do 927 | if not State.cursor_at?(@state, :field) do 928 | raise SchemaError, """ 929 | 930 | Invalid source position. 931 | Sources can only appear inside fields. 932 | """ 933 | end 934 | 935 | if State.get(@state, :field).__type__ == :sourced do 936 | raise SchemaError, """ 937 | 938 | Invalid source usage. 939 | Only one source per field is allowed. 940 | """ 941 | end 942 | 943 | if State.get(@state, :field).__type__ == :wrapper do 944 | raise SchemaError, """ 945 | 946 | Invalid source usage. 947 | A field can either have subfields or a source, 948 | but not both. 
949 | """ 950 | end 951 | 952 | @state State.update(@state, :field, %{ 953 | __type__: :sourced, 954 | source: wrap_fun(unquote(value), 0..1) 955 | }) 956 | end 957 | 958 | _el -> 959 | quote do 960 | virtual_source(fn -> unquote(value) end) 961 | end 962 | end 963 | end 964 | 965 | @doc ~S""" 966 | Defines a validator for a field or row. 967 | Can only be used inside a field or row. 968 | 969 | This macro takes in a function of arity 1 or 2, which will be applied to the value of the row or the field where the validator was defined. Multiple validators are allowed, and will be executed in the order in which they are defined. 970 | 971 | > Note: To use validators on a row, the row must be defined explicitly. Implicit rows cannot have validators. 972 | 973 | ## Fun 974 | * when is a function - given the field's or row's value and optionally the context, must return either `true`, `false`, `:ok`, `:error` or a tuple `{:error, any()}`, where `true` and `ok` are the success typing, and `false`, `:error` and `{:error, any()}` are the error typing; the entire output row will be an error row if any validation inside it or inside its fields fails 975 | """ 976 | defmacro validate(fun) do 977 | quote do 978 | validator = wrap_fun(unquote(fun), 1..2) 979 | 980 | cond do 981 | State.cursor_at?(@state, :row) -> 982 | validators = @state |> State.get(:row) |> Map.get(:validators) 983 | 984 | @state State.update(@state, :row, %{validators: validators ++ [validator]}) 985 | 986 | State.cursor_at?(@state, :field) -> 987 | validators = @state |> State.get(:field) |> Map.get(:validators) 988 | 989 | @state State.update(@state, :field, %{validators: validators ++ [validator]}) 990 | 991 | true -> 992 | raise SchemaError, """ 993 | 994 | Incorrect validator position. 995 | Validators can only appear 996 | inside rows or fields. 997 | """ 998 | end 999 | end 1000 | end 1001 | 1002 | @doc ~S""" 1003 | Defines a data transformer for a field or row. 
1004 | Can only be used inside a field or row. 1005 | 1006 | This macro takes in a function of arity 1 or 2, which will be applied to the value of the row or the field where the transformer was defined. Multiple transformers are allowed, and will be executed in the order in which they are defined. 1007 | 1008 | > Note: To use transformers on a row, the row must be defined explicitly. Implicit rows cannot have transformers. 1009 | 1010 | ## Fun 1011 | * when is a function - given the field's or row's value and optionally the context, must return either `{:ok, any()}`, `{:error, any()}` or `:error`, where `{:ok, any()}` is the success typing and `{:error, any()}`, and `:error` are the error typing; the second element of the success tuple is taken to be the new value of the row or field; the entire output row will be an error row if any validation inside it or inside its fields fails 1012 | """ 1013 | defmacro transform(fun) do 1014 | quote do 1015 | transformer = wrap_fun(unquote(fun), 1..2) 1016 | 1017 | cond do 1018 | State.cursor_at?(@state, :row) -> 1019 | transformers = @state |> State.get(:row) |> Map.get(:transformers) 1020 | 1021 | @state State.update(@state, :row, %{transformers: transformers ++ [transformer]}) 1022 | 1023 | State.cursor_at?(@state, :field) -> 1024 | transformers = @state |> State.get(:field) |> Map.get(:transformers) 1025 | 1026 | @state State.update(@state, :field, %{transformers: transformers ++ [transformer]}) 1027 | 1028 | true -> 1029 | raise SchemaError, """ 1030 | 1031 | Incorrect transformer position. 1032 | Transformers can only appear 1033 | inside rows or fields. 
1034 | """ 1035 | end 1036 | end 1037 | end 1038 | 1039 | @doc false 1040 | defmacro skip_if_opt(opts) do 1041 | {unquoted_opts, _} = Code.eval_quoted(opts) 1042 | 1043 | case Keyword.fetch(unquoted_opts, :skip_if) do 1044 | {:ok, fun} when is_function(fun) -> 1045 | quote do 1046 | wrap_fun(unquote(Keyword.get(opts, :skip_if)), 1..2) 1047 | end 1048 | 1049 | :error -> 1050 | quote do 1051 | nil 1052 | end 1053 | 1054 | _el -> 1055 | quote do 1056 | raise SchemaError, """ 1057 | 1058 | Invalid skip_if type 1059 | must be a function 1060 | with arity 1 or 2. 1061 | """ 1062 | end 1063 | end 1064 | end 1065 | end 1066 | -------------------------------------------------------------------------------- /lib/schema/state.ex: -------------------------------------------------------------------------------- 1 | # credo:disable-for-this-file Credo.Check.Refactor.AppendSingleItem 2 | defmodule DataQuacker.Schema.State do 3 | @moduledoc false 4 | 5 | alias DataQuacker.Schema.State 6 | 7 | defstruct cursor: [], flags: %{}, schema: %{}, matchers: [], rows: [], fields: %{} 8 | 9 | def new(), do: %State{} 10 | 11 | def clear_fields(state) do 12 | %State{state | fields: %{}} 13 | end 14 | 15 | def flag(%State{flags: flags} = state, flag, value) do 16 | flags = Map.put(flags, flag, value) 17 | 18 | %State{state | flags: flags} 19 | end 20 | 21 | def flagged?(%State{flags: flags}, flag) do 22 | Map.get(flags, flag, false) 23 | end 24 | 25 | def cursor_at?(%State{cursor: []}, type), do: is_nil(type) 26 | 27 | def cursor_at?(%State{cursor: cursor}, type) do 28 | elem(hd(cursor), 0) == type 29 | end 30 | 31 | def target(%State{cursor: cursor}) do 32 | target_from_cursor(cursor) 33 | end 34 | 35 | def cursor_exit(%State{cursor: cursor} = state, levels \\ 1) do 36 | %State{state | cursor: Enum.drop(cursor, levels)} 37 | end 38 | 39 | def register(%State{cursor: cursor} = state, :schema, {schema_name, schema}) do 40 | cursor = [{:schema, schema_name} | cursor] 41 | 42 | schema = 
Map.merge(new_schema(schema_name), schema) 43 | 44 | %State{state | schema: schema, cursor: cursor} 45 | end 46 | 47 | def register(%State{cursor: cursor, rows: rows} = state, :row, {row_index, row}) do 48 | cursor = [{:row, row_index} | cursor] 49 | 50 | row = Map.merge(new_row(row_index), row) 51 | rows = rows ++ [row] 52 | 53 | %State{state | rows: rows, cursor: cursor} 54 | end 55 | 56 | def register(%State{cursor: cursor, fields: fields} = state, :field, {field_name, field}) do 57 | cursor = [{:field, field_name} | cursor] 58 | needle = field_needle(cursor) 59 | 60 | field = Map.merge(new_field(field_name), field) 61 | fields = put_in(fields, Enum.reverse(needle), field) 62 | 63 | %State{state | fields: fields, cursor: cursor} 64 | end 65 | 66 | def register(%State{matchers: matchers, cursor: cursor} = state, :matcher, rule) do 67 | matcher = %{rule: rule, target: target_from_cursor(cursor)} 68 | matchers = [matcher | matchers] 69 | 70 | %State{state | matchers: matchers} 71 | end 72 | 73 | def update(%State{schema: existing_schema} = state, :schema, schema) do 74 | schema = Map.merge(existing_schema, schema) 75 | 76 | %State{state | schema: schema} 77 | end 78 | 79 | def update(%State{cursor: cursor, rows: rows} = state, :row, row) do 80 | index = elem(hd(cursor), 1) 81 | 82 | rows = List.update_at(rows, index, &Map.merge(&1, row)) 83 | 84 | %State{state | rows: rows} 85 | end 86 | 87 | def update(%State{cursor: cursor, fields: fields} = state, :field, field) do 88 | needle = field_needle(cursor) 89 | 90 | fields = update_in(fields, Enum.reverse(needle), &Map.merge(&1, field)) 91 | 92 | %State{state | fields: fields} 93 | end 94 | 95 | def get(%State{cursor: cursor, rows: rows}, :row) do 96 | Enum.at(rows, elem(hd(cursor), 1)) 97 | end 98 | 99 | def get(%State{cursor: cursor, fields: fields}, :field) do 100 | needle = field_needle(cursor) 101 | 102 | get_in(fields, Enum.reverse(needle)) 103 | end 104 | 105 | defp new_schema(name) do 106 | %{__name__: name, 
matchers: [], rows: []} 107 | end 108 | 109 | defp new_row(index) do 110 | %{__index__: index, fields: %{}, validators: [], transformers: [], skip_if: nil} 111 | end 112 | 113 | defp new_field(name) do 114 | %{ 115 | __name__: name, 116 | __type__: nil, 117 | source: nil, 118 | subfields: %{}, 119 | validators: [], 120 | transformers: [], 121 | skip_if: nil 122 | } 123 | end 124 | 125 | defp fields_cursor(cursor) do 126 | cursor |> Enum.split_while(&(elem(&1, 0) == :field)) |> elem(0) 127 | end 128 | 129 | defp target_from_cursor(cursor) do 130 | Enum.map(cursor, &elem(&1, 1)) 131 | end 132 | 133 | defp field_needle(cursor) do 134 | cursor |> fields_cursor() |> target_from_cursor() |> Enum.intersperse(:subfields) 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :data_quacker, 7 | version: "0.1.1", 8 | elixir: "~> 1.12", 9 | deps: deps(), 10 | elixirc_paths: elixirc_paths(Mix.env()), 11 | build_embedded: Mix.env() == :prod, 12 | start_permanent: Mix.env() == :prod, 13 | package: package(), 14 | name: "DataQuacker", 15 | description: 16 | "A library for validating transforming and parsing non-sandboxed data (e.g. 
CSV files)", 17 | source_url: "https://github.com/fiodorbaczynski/data_quacker", 18 | homepage_url: "https://github.com/fiodorbaczynski/data_quacker", 19 | docs: docs() 20 | ] 21 | end 22 | 23 | defp elixirc_paths(:test), do: ["lib", "test/support"] 24 | 25 | defp elixirc_paths(_), do: ["lib"] 26 | 27 | def application do 28 | [ 29 | extra_applications: [:logger, :crypto] 30 | ] 31 | end 32 | 33 | def package do 34 | [ 35 | name: "data_quacker", 36 | files: ["lib", ".formatter.exs", "mix.exs", "README*", "LICENSE*"], 37 | maintainers: ["Fiodor Baczyński"], 38 | licenses: ["Apache-2.0"], 39 | links: %{"GitHub" => "https://github.com/fiodorbaczynski/data_quacker"} 40 | ] 41 | end 42 | 43 | defp deps do 44 | [ 45 | {:credo, "~> 1.5", only: :dev, runtime: false}, 46 | {:dialyxir, "~> 1.1.0", only: :dev, runtime: false}, 47 | {:ex_doc, "~> 0.25", only: :dev, runtime: false}, 48 | {:csv, "~> 2.4"}, 49 | {:decimal, "~> 2.0", only: :test}, 50 | {:mox, "~> 1.0.0", only: :test} 51 | ] 52 | end 53 | 54 | defp docs() do 55 | [ 56 | main: "DataQuacker", 57 | extras: ["README.md"], 58 | source_url: "https://github.com/elixir-ecto/ecto", 59 | groups_for_modules: [ 60 | Schema: [ 61 | DataQuacker.Schema 62 | ], 63 | Parsing: [ 64 | DataQuacker, 65 | DataQuacker.Context 66 | ], 67 | Adapters: [ 68 | DataQuacker.Adapter, 69 | DataQuacker.Adapters.CSV, 70 | DataQuacker.Adapters.Identity 71 | ] 72 | ] 73 | ] 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], [], "hexpm", "7af5c7e09fe1d40f76c8e4f9dd2be7cebd83909f31fee7cd0e9eadc567da8353"}, 3 | "credo": {:hex, :credo, "1.5.6", "e04cc0fdc236fefbb578e0c04bd01a471081616e741d386909e527ac146016c6", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, 
{:file_system, "~> 0.2.8", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "4b52a3e558bd64e30de62a648518a5ea2b6e3e5d2b164ef5296244753fc7eb17"}, 4 | "csv": {:hex, :csv, "2.4.1", "50e32749953b6bf9818dbfed81cf1190e38cdf24f95891303108087486c5925e", [:mix], [{:parallel_stream, "~> 1.0.4", [hex: :parallel_stream, repo: "hexpm", optional: false]}], "hexpm", "54508938ac67e27966b10ef49606e3ad5995d665d7fc2688efb3eab1307c9079"}, 5 | "decimal": {:hex, :decimal, "2.0.0", "a78296e617b0f5dd4c6caf57c714431347912ffb1d0842e998e9792b5642d697", [:mix], [], "hexpm", "34666e9c55dea81013e77d9d87370fe6cb6291d1ef32f46a1600230b1d44f577"}, 6 | "dialyxir": {:hex, :dialyxir, "1.1.0", "c5aab0d6e71e5522e77beff7ba9e08f8e02bad90dfbeffae60eaf0cb47e29488", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "07ea8e49c45f15264ebe6d5b93799d4dd56a44036cf42d0ad9c960bc266c0b9a"}, 7 | "earmark": {:hex, :earmark, "1.3.5", "0db71c8290b5bc81cb0101a2a507a76dca659513984d683119ee722828b424f6", [:mix], [], "hexpm", "762b999fd414fb41e297944228aa1de2cd4a3876a07f968c8b11d1e9a2190d07"}, 8 | "earmark_parser": {:hex, :earmark_parser, "1.4.15", "b29e8e729f4aa4a00436580dcc2c9c5c51890613457c193cc8525c388ccb2f06", [:mix], [], "hexpm", "044523d6438ea19c1b8ec877ec221b008661d3c27e3b848f4c879f500421ca5c"}, 9 | "erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"}, 10 | "ex_doc": {:hex, :ex_doc, "0.25.1", "4b736fa38dc76488a937e5ef2944f5474f3eff921de771b25371345a8dc810bc", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", 
"3200b0a69ddb2028365281fbef3753ea9e728683863d8cdaa96580925c891f67"}, 11 | "file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"}, 12 | "jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"}, 13 | "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, 14 | "makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"}, 15 | "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, 16 | "mox": {:hex, :mox, "1.0.0", "4b3c7005173f47ff30641ba044eb0fe67287743eec9bd9545e37f3002b0a9f8b", [:mix], [], "hexpm", "201b0a20b7abdaaab083e9cf97884950f8a30a1350a1da403b3145e213c6f4df"}, 17 | "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, 18 | "parallel_stream": {:hex, :parallel_stream, "1.0.6", 
"b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm", "639b2e8749e11b87b9eb42f2ad325d161c170b39b288ac8d04c4f31f8f0823eb"}, 19 | } 20 | -------------------------------------------------------------------------------- /priv/plts/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /test/adapters/csv_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.CSVAdapterTest do 2 | use ExUnit.Case, async: true 3 | 4 | import Mox 5 | 6 | alias DataQuacker.Adapters.CSV 7 | 8 | setup do 9 | {:ok, 10 | sample_source: [ 11 | ["a", "b", "c"], 12 | ["a1", "b1", "c1"], 13 | ["a2", "b2", "c2"], 14 | ["a3", "b3", "c3"] 15 | ]} 16 | end 17 | 18 | describe "parse_source/2" do 19 | test "given a local file path, should parse the source", %{ 20 | sample_source: [headers | rows] = sample_source 21 | } do 22 | expect(DataQuacker.MockFileManager, :stream!, fn "sample_path.csv" -> 23 | Stream.map(sample_source, &Enum.join(&1, ",")) 24 | end) 25 | 26 | assert CSV.parse_source("sample_path.csv", local?: true) == 27 | {:ok, %{headers: {:ok, headers}, rows: Enum.map(rows, &{:ok, &1})}} 28 | end 29 | 30 | test "with semicolon set as the separator given a local file path, should parse the source", 31 | %{sample_source: [headers | rows] = sample_source} do 32 | expect(DataQuacker.MockFileManager, :stream!, fn "sample_path.csv" -> 33 | Stream.map(sample_source, &Enum.join(&1, ";")) 34 | end) 35 | 36 | assert CSV.parse_source("sample_path.csv", local?: true, separator: ?;) == 37 | {:ok, %{headers: {:ok, headers}, rows: Enum.map(rows, &{:ok, &1})}} 38 | end 39 | 40 | test "given a remote file url, should parse the source", %{ 41 | sample_source: [headers | rows] = sample_source 42 | } do 43 | expect(DataQuacker.MockFileManager, :read_link!, fn "file_url.com" -> 
44 | Enum.map(sample_source, &Enum.join(&1, ",")) 45 | end) 46 | 47 | assert CSV.parse_source("file_url.com", local?: false) == 48 | {:ok, %{headers: {:ok, headers}, rows: Enum.map(rows, &{:ok, &1})}} 49 | end 50 | 51 | test "with semicolon set as the separator given a remote file url, should parse the source", 52 | %{sample_source: [headers | rows] = sample_source} do 53 | expect(DataQuacker.MockFileManager, :read_link!, fn "file_url.com" -> 54 | Enum.map(sample_source, &Enum.join(&1, ";")) 55 | end) 56 | 57 | assert CSV.parse_source("file_url.com", local?: false, separator: ?;) == 58 | {:ok, %{headers: {:ok, headers}, rows: Enum.map(rows, &{:ok, &1})}} 59 | end 60 | end 61 | 62 | # describe "get_headers/1" do 63 | # test "returns the value under the headers key, wrapped in an ':ok' tuple", %{ 64 | # sample_source: sample_source 65 | # } do 66 | # assert Identity.get_headers(sample_source) == {:ok, sample_source.headers} 67 | # end 68 | # end 69 | # 70 | # describe "get_rows/1" do 71 | # test "returns the value under the rows key, wrapped in an ':ok' tuple", %{ 72 | # sample_source: sample_source 73 | # } do 74 | # assert Identity.get_rows(sample_source) == {:ok, sample_source.rows} 75 | # end 76 | # end 77 | # 78 | # describe "get_row/1" do 79 | # test "returns the given row wrapped in an ':ok' tuple", %{sample_source: sample_source} do 80 | # row = Enum.random(sample_source.rows) 81 | # 82 | # assert Identity.get_row(row) == {:ok, row} 83 | # end 84 | # end 85 | end 86 | -------------------------------------------------------------------------------- /test/adapters/identity_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.IdentityAdapterTest do 2 | use ExUnit.Case, async: true 3 | 4 | alias DataQuacker.Adapters.Identity 5 | 6 | setup do 7 | {:ok, 8 | sample_source: %{ 9 | headers: ["a", "b", "c"], 10 | rows: [ 11 | ["a1", "b1", "c1"], 12 | ["a2", "b2", "c2"], 13 | ["a3", "b3", "c3"] 14 | ] 15 | 
}} 16 | end 17 | 18 | describe "parse_source/2" do 19 | test "returns the source as-is, wrapped in an ':ok' tuple", %{sample_source: sample_source} do 20 | assert Identity.parse_source(sample_source, []) == {:ok, sample_source} 21 | end 22 | end 23 | 24 | describe "get_headers/1" do 25 | test "returns the value under the headers key, wrapped in an ':ok' tuple", %{ 26 | sample_source: sample_source 27 | } do 28 | assert Identity.get_headers(sample_source) == {:ok, sample_source.headers} 29 | end 30 | end 31 | 32 | describe "get_rows/1" do 33 | test "returns the value under the rows key, wrapped in an ':ok' tuple", %{ 34 | sample_source: sample_source 35 | } do 36 | assert Identity.get_rows(sample_source) == {:ok, sample_source.rows} 37 | end 38 | end 39 | 40 | describe "get_row/1" do 41 | test "returns the given row wrapped in an ':ok' tuple", %{sample_source: sample_source} do 42 | row = Enum.random(sample_source.rows) 43 | 44 | assert Identity.get_row(row) == {:ok, row} 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /test/data_quacker/skipper_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.SkipperTest do 2 | use ExUnit.Case, async: true 3 | 4 | alias DataQuacker.Skipper 5 | 6 | alias DataQuacker.Context 7 | 8 | alias DataQuacker.Schema.WrappedFun 9 | 10 | describe "call/3" do 11 | setup do 12 | skipper_fun_1 = %WrappedFun{arity: 1, callable: fn value -> value == "abc" end} 13 | 14 | skipper_fun_2 = %WrappedFun{ 15 | arity: 2, 16 | callable: fn value, context -> value == context.support_data.expected_value end 17 | } 18 | 19 | incorrect_type_skipper_fun_1 = %WrappedFun{arity: 1, callable: fn _value -> :ok end} 20 | 21 | incorrect_type_skipper_fun_2 = %WrappedFun{ 22 | arity: 2, 23 | callable: fn _value, _context -> :ok end 24 | } 25 | 26 | {:ok, 27 | skipper_fun_1: skipper_fun_1, 28 | skipper_fun_2: skipper_fun_2, 29 | 
incorrect_type_skipper_fun_1: incorrect_type_skipper_fun_1, 30 | incorrect_type_skipper_fun_2: incorrect_type_skipper_fun_2} 31 | end 32 | 33 | test "given a skipper function with arity 1 and a value should apply the function to the value", 34 | %{skipper_fun_1: skipper_fun_1} do 35 | assert Skipper.call("abc", skipper_fun_1, %Context{}) == true 36 | assert Skipper.call("def", skipper_fun_1, %Context{}) == false 37 | end 38 | 39 | test "given a skipper function with arity 2 and a value should apply the function to the value with the context", 40 | %{skipper_fun_2: skipper_fun_2} do 41 | assert Skipper.call("abc", skipper_fun_2, %Context{support_data: %{expected_value: "abc"}}) == 42 | true 43 | 44 | assert Skipper.call("def", skipper_fun_2, %Context{support_data: %{expected_value: "abc"}}) == 45 | false 46 | end 47 | 48 | test "given a skipper function with arity 1 and an incorrect return type should raise", 49 | %{incorrect_type_skipper_fun_1: incorrect_type_skipper_fun_1} do 50 | assert_raise(RuntimeError, ~r/skipper.+field.+abc.+incorrect.+value/si, fn -> 51 | Skipper.call("abc", incorrect_type_skipper_fun_1, %Context{metadata: {:field, :abc}}) 52 | end) 53 | end 54 | 55 | test "given a skipper function with arity 2 and an incorrect return type should raise", 56 | %{incorrect_type_skipper_fun_2: incorrect_type_skipper_fun_2} do 57 | assert_raise(RuntimeError, ~r/skipper.+field.+abc.+incorrect.+value/si, fn -> 58 | Skipper.call("abc", incorrect_type_skipper_fun_2, %Context{metadata: {:field, :abc}}) 59 | end) 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /test/examples/pond_example_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Examples.PondExampleTest do 2 | use DataQuacker.Case, async: true 3 | 4 | alias DataQuacker.Adapters.Identity 5 | 6 | defmodule PondSchema do 7 | use DataQuacker.Schema 8 | 9 | schema :pond_example_1 do 
10 | field :type do 11 | source("type") 12 | end 13 | 14 | field :colour do 15 | source(~r/colou?r/i) 16 | end 17 | 18 | field :age do 19 | source("age") 20 | end 21 | end 22 | 23 | schema :pond_example_2 do 24 | field :type do 25 | validate(fn type -> type in ["Mallard", "Domestic", "Mandarin"] end) 26 | 27 | source("type") 28 | end 29 | 30 | field :colour do 31 | source(~r/colou?r/i) 32 | end 33 | 34 | field :age do 35 | transform(fn age_str -> 36 | case Integer.parse(age_str) do 37 | {age_int, _} -> {:ok, age_int} 38 | :error -> :error 39 | end 40 | end) 41 | 42 | source("age") 43 | end 44 | end 45 | end 46 | 47 | describe "pond example" do 48 | @tag :integration 49 | test "should parse sample data given the pond example 1 schema" do 50 | assert {:ok, [row1, row2, row3]} = 51 | DataQuacker.parse( 52 | %{ 53 | headers: ["Type", "Colour", "Age"], 54 | rows: [ 55 | ["Mallard", "green", "3"], 56 | ["Domestic", "white", "2"], 57 | ["Mandarin", "multi-coloured", "4"] 58 | ] 59 | }, 60 | PondSchema.schema_structure(:pond_example_1), 61 | nil, 62 | adapter: Identity 63 | ) 64 | 65 | assert row1 == {:ok, %{type: "Mandarin", colour: "multi-coloured", age: "4"}} 66 | assert row2 == {:ok, %{type: "Domestic", colour: "white", age: "2"}} 67 | assert row3 == {:ok, %{type: "Mallard", colour: "green", age: "3"}} 68 | end 69 | 70 | @tag :integration 71 | test "should parse sample data given the pond example 2 schema" do 72 | assert {:error, [row1, row2, row3, row4, row5]} = 73 | DataQuacker.parse( 74 | %{ 75 | headers: ["Type", "Colour", "Age"], 76 | rows: [ 77 | ["Mallard", "green", "3"], 78 | ["Domestic", "white", "2"], 79 | ["Mandarin", "multi-coloured", "4"], 80 | ["Mystery", "golden", "100"], 81 | ["Black", "black", "Infinity"] 82 | ] 83 | }, 84 | PondSchema.schema_structure(:pond_example_2), 85 | nil, 86 | adapter: Identity 87 | ) 88 | 89 | assert row1 == :error 90 | assert row2 == :error 91 | assert row3 == {:ok, %{type: "Mandarin", colour: "multi-coloured", age: 4}} 92 | 
assert row4 == {:ok, %{type: "Domestic", colour: "white", age: 2}} 93 | assert row5 == {:ok, %{type: "Mallard", colour: "green", age: 3}} 94 | end 95 | end 96 | end 97 | -------------------------------------------------------------------------------- /test/examples/pricing_example_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Examples.PricingExampleTest do 2 | use DataQuacker.Case, async: true 3 | 4 | alias DataQuacker.Adapters.Identity 5 | 6 | defmodule PricingSchema do 7 | use DataQuacker.Schema 8 | 9 | schema :pricing_example_1 do 10 | field :size do 11 | transform(fn size -> 12 | case Integer.parse(size) do 13 | {size_int, _} -> {:ok, size_int} 14 | :error -> {:error, "Invalid value #{size} given"} 15 | end 16 | end) 17 | 18 | source("Apartment/flat size (in m^2)") 19 | end 20 | 21 | field :price do 22 | transform(fn price -> 23 | case Integer.parse(price) do 24 | {price_int, _} -> {:ok, price_int} 25 | :error -> {:error, "Invalid value #{price} given"} 26 | end 27 | end) 28 | 29 | source("Price per 1 month") 30 | end 31 | end 32 | 33 | schema :pricing_example_2 do 34 | field :size do 35 | transform(&PricingSchema.parse_int_example_2/1) 36 | 37 | source("Apartment/flat size (in m^2)") 38 | end 39 | 40 | field :price do 41 | transform(&PricingSchema.parse_int_example_2/1) 42 | 43 | source("Price per 1 month") 44 | end 45 | end 46 | 47 | schema :pricing_example_3 do 48 | field :size do 49 | transform(&PricingSchema.parse_int_example_3/2) 50 | 51 | source(["apartment", "size"]) 52 | end 53 | 54 | field :price do 55 | transform(&PricingSchema.parse_int_example_3/2) 56 | 57 | source(["price", "1"]) 58 | end 59 | end 60 | 61 | schema :pricing_example_4 do 62 | row skip_if: fn %{price: price} -> is_nil(price) end do 63 | field :size do 64 | transform(&PricingSchema.parse_int_example_4/2) 65 | 66 | source(["apartment", "size"]) 67 | end 68 | 69 | field :duration do 70 | virtual_source(1) 71 | end 72 
| 73 | field :price do 74 | transform(&PricingSchema.parse_int_example_4/2) 75 | 76 | source(["price", "1"]) 77 | end 78 | end 79 | 80 | row do 81 | field :size do 82 | transform(&PricingSchema.parse_int_example_4/2) 83 | 84 | source(["apartment", "size"]) 85 | end 86 | 87 | field :duration do 88 | virtual_source(3) 89 | end 90 | 91 | field :price do 92 | transform(&PricingSchema.parse_int_example_4/2) 93 | 94 | source(["price", "3"]) 95 | end 96 | end 97 | end 98 | 99 | schema :pricing_example_5 do 100 | row skip_if: fn %{price: price} -> is_nil(price) end do 101 | field :size do 102 | transform(&PricingSchema.parse_int_example_5/2) 103 | 104 | source(["apartment", "size"]) 105 | end 106 | 107 | field :duration do 108 | virtual_source(1) 109 | end 110 | 111 | field :price do 112 | transform(&PricingSchema.replace_commas/1) 113 | transform(&PricingSchema.parse_decimal/2) 114 | 115 | source(["price", "1"]) 116 | end 117 | end 118 | 119 | row do 120 | field :size do 121 | transform(&PricingSchema.parse_int_example_5/2) 122 | 123 | source(["apartment", "size"]) 124 | end 125 | 126 | field :duration do 127 | virtual_source(3) 128 | end 129 | 130 | field :price do 131 | transform(&PricingSchema.replace_commas/1) 132 | transform(&PricingSchema.parse_decimal/2) 133 | 134 | source(["price", "3"]) 135 | end 136 | end 137 | end 138 | 139 | def parse_int_example_2(str) do 140 | case Integer.parse(str) do 141 | {int, _} -> {:ok, int} 142 | :error -> {:error, "Invalid value #{str} given"} 143 | end 144 | end 145 | 146 | def parse_int_example_3(str, %{metadata: metadata, source_row: source_row}) do 147 | case Integer.parse(str) do 148 | {int, _} -> 149 | {:ok, int} 150 | 151 | :error -> 152 | {:error, 153 | "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 154 | end 155 | end 156 | 157 | def parse_int_example_4("", _context), do: {:ok, nil} 158 | 159 | def parse_int_example_4(str, %{metadata: metadata, source_row: source_row}) do 
160 | case Integer.parse(str) do 161 | {int, _} -> 162 | {:ok, int} 163 | 164 | :error -> 165 | {:error, 166 | "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 167 | end 168 | end 169 | 170 | def parse_int_example_5("", _context), do: {:ok, nil} 171 | 172 | def parse_int_example_5(str, %{metadata: metadata, source_row: source_row}) do 173 | case Integer.parse(str) do 174 | {int, _} -> 175 | {:ok, int} 176 | 177 | :error -> 178 | {:error, 179 | "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 180 | end 181 | end 182 | 183 | def replace_commas(str) do 184 | {:ok, String.replace(str, ",", ".")} 185 | end 186 | 187 | def parse_decimal("", _context), do: {:ok, nil} 188 | 189 | def parse_decimal(str, %{metadata: metadata, source_row: source_row}) do 190 | case Decimal.parse(str) do 191 | {decimal, ""} -> 192 | {:ok, decimal} 193 | 194 | :error -> 195 | {:error, 196 | "Error processing #{elem(metadata, 0)} #{elem(metadata, 1)} in row #{source_row}; '#{str}' given"} 197 | end 198 | end 199 | end 200 | 201 | describe "pricing example" do 202 | @tag :integration 203 | test "should parse sample data given the pricing example 1 schema" do 204 | assert {:ok, [row1, row2]} = 205 | DataQuacker.parse( 206 | %{ 207 | headers: ["Apartment/flat size (in m^2)", "Price per 1 month"], 208 | rows: [ 209 | ["40", "1000"], 210 | ["50", "1100"] 211 | ] 212 | }, 213 | PricingSchema.schema_structure(:pricing_example_1), 214 | nil, 215 | adapter: Identity 216 | ) 217 | 218 | assert row1 == {:ok, %{size: 50, price: 1100}} 219 | assert row2 == {:ok, %{size: 40, price: 1000}} 220 | end 221 | 222 | @tag :integration 223 | test "should parse sample data given the pricing example 2 schema" do 224 | assert {:ok, [row1, row2]} = 225 | DataQuacker.parse( 226 | %{ 227 | headers: ["Apartment/flat size (in m^2)", "Price per 1 month"], 228 | rows: [ 229 | ["40", "1000"], 230 | ["50", "1100"] 231 | ] 232 | }, 
233 | PricingSchema.schema_structure(:pricing_example_2), 234 | nil, 235 | adapter: Identity 236 | ) 237 | 238 | assert row1 == {:ok, %{size: 50, price: 1100}} 239 | assert row2 == {:ok, %{size: 40, price: 1000}} 240 | end 241 | 242 | @tag :integration 243 | test "should parse sample data given the pricing example 3 schema" do 244 | assert {:error, [row1, row2, row3, row4]} = 245 | DataQuacker.parse( 246 | %{ 247 | headers: ["Apartment or flat size", "Price for 1 month"], 248 | rows: [ 249 | ["40", "1000"], 250 | ["50", "1100"], 251 | ["50", "a lot of $$$"], 252 | ["huge", "1000000"] 253 | ] 254 | }, 255 | PricingSchema.schema_structure(:pricing_example_3), 256 | nil, 257 | adapter: Identity 258 | ) 259 | 260 | assert row1 == {:error, "Error processing field size in row 4; 'huge' given"} 261 | assert row2 == {:error, "Error processing field price in row 3; 'a lot of $$$' given"} 262 | assert row3 == {:ok, %{size: 50, price: 1100}} 263 | assert row4 == {:ok, %{size: 40, price: 1000}} 264 | end 265 | 266 | @tag :integration 267 | test "should parse sample data given the pricing example 4 schema" do 268 | assert {:ok, [row1, row2, row3, row4, row5]} = 269 | DataQuacker.parse( 270 | %{ 271 | headers: ["Apartment or flat size", "Price for 1 month", "Price per 3 months"], 272 | rows: [ 273 | ["40", "1000", "2800"], 274 | ["50", "1100", "3000"], 275 | ["60", "", "3600"] 276 | ] 277 | }, 278 | PricingSchema.schema_structure(:pricing_example_4), 279 | nil, 280 | adapter: Identity 281 | ) 282 | 283 | assert row1 == {:ok, %{duration: 3, price: 3600, size: 60}} 284 | assert row2 == {:ok, %{duration: 3, price: 3000, size: 50}} 285 | assert row3 == {:ok, %{duration: 1, price: 1100, size: 50}} 286 | assert row4 == {:ok, %{duration: 3, price: 2800, size: 40}} 287 | assert row5 == {:ok, %{duration: 1, price: 1000, size: 40}} 288 | end 289 | 290 | @tag :integration 291 | test "should parse sample data given the pricing example 5 schema" do 292 | assert {:ok, [row1, row2, row3, row4, 
row5]} = 293 | DataQuacker.parse( 294 | %{ 295 | headers: ["Apartment or flat size", "Price for 1 month", "Price per 3 months"], 296 | rows: [ 297 | ["40", "999,99", "2799,99"], 298 | ["50", "1099,99", "2999,99"], 299 | ["60", "", "3599,99"] 300 | ] 301 | }, 302 | PricingSchema.schema_structure(:pricing_example_5), 303 | nil, 304 | adapter: Identity 305 | ) 306 | 307 | assert row1 == {:ok, %{duration: 3, price: Decimal.new("3599.99"), size: 60}} 308 | assert row2 == {:ok, %{duration: 3, price: Decimal.new("2999.99"), size: 50}} 309 | assert row3 == {:ok, %{duration: 1, price: Decimal.new("1099.99"), size: 50}} 310 | assert row4 == {:ok, %{duration: 3, price: Decimal.new("2799.99"), size: 40}} 311 | assert row5 == {:ok, %{duration: 1, price: Decimal.new("999.99"), size: 40}} 312 | end 313 | end 314 | end 315 | -------------------------------------------------------------------------------- /test/examples/students_example_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Examples.StudentsExampleTest do 2 | use DataQuacker.Case, async: true 3 | 4 | alias DataQuacker.Adapters.Identity 5 | 6 | defmodule StudentsSchema do 7 | use DataQuacker.Schema 8 | 9 | schema :students_example_1 do 10 | field :first_name do 11 | source("first name") 12 | end 13 | 14 | field :last_name do 15 | source("last name") 16 | end 17 | 18 | field :age do 19 | source("age") 20 | end 21 | 22 | field :favourite_subject do 23 | source("favourite subject") 24 | end 25 | end 26 | 27 | schema :students_example_2 do 28 | field :first_name do 29 | source("first name") 30 | end 31 | 32 | field :last_name do 33 | source("last name") 34 | end 35 | 36 | field :age do 37 | transform(fn age -> 38 | case Integer.parse(age) do 39 | {age_int, _} -> {:ok, age_int} 40 | :error -> {:error, "Invalid value #{age} given"} 41 | end 42 | end) 43 | 44 | source("age") 45 | end 46 | 47 | field :favourite_subject do 48 | validate(fn subj -> subj in 
["Maths", "Physics", "Programming"] end) 49 | 50 | source("favourite subject") 51 | end 52 | end 53 | 54 | schema :students_example_4 do 55 | field :full_name do 56 | transform(fn %{first_name: first_name, last_name: last_name} -> 57 | {:ok, "#{first_name} #{last_name}"} 58 | end) 59 | 60 | field :first_name do 61 | source("first name") 62 | end 63 | 64 | field :last_name do 65 | source("last name") 66 | end 67 | end 68 | 69 | field :age do 70 | transform(fn age -> 71 | case Integer.parse(age) do 72 | {age_int, _} -> {:ok, age_int} 73 | :error -> {:error, "Invalid value #{age} given"} 74 | end 75 | end) 76 | 77 | source("age") 78 | end 79 | 80 | field :favourite_subject do 81 | validate(fn subj, context -> 82 | case subj in context.support_data.valid_subjects do 83 | true -> 84 | :ok 85 | 86 | false -> 87 | {:error, 88 | "Invalid favourite subject in row ##{context.source_row}, must be one of #{inspect(context.support_data.valid_subjects)}"} 89 | end 90 | end) 91 | 92 | source("favourite subject") 93 | end 94 | end 95 | end 96 | 97 | describe "students example" do 98 | @tag :integration 99 | test "should parse sample data given the students example 1 schema" do 100 | assert {:ok, [row1, row2, row3]} = 101 | DataQuacker.parse( 102 | %{ 103 | headers: ["First name", "Last name", "Age", "Favourite subject"], 104 | rows: [ 105 | ["John", "Smith", "19", "Maths"], 106 | ["Adam", "Johnson", "18", "Physics"], 107 | ["Quackers", "the Duck", "1", "Programming"] 108 | ] 109 | }, 110 | StudentsSchema.schema_structure(:students_example_1), 111 | %{valid_subjects: ["Maths", "Physics", "Programming"]}, 112 | adapter: Identity 113 | ) 114 | 115 | assert row1 == 116 | {:ok, 117 | %{ 118 | age: "1", 119 | favourite_subject: "Programming", 120 | first_name: "Quackers", 121 | last_name: "the Duck" 122 | }} 123 | 124 | assert row2 == 125 | {:ok, 126 | %{ 127 | age: "18", 128 | favourite_subject: "Physics", 129 | first_name: "Adam", 130 | last_name: "Johnson" 131 | }} 132 | 133 | assert 
row3 == 134 | {:ok, 135 | %{age: "19", favourite_subject: "Maths", first_name: "John", last_name: "Smith"}} 136 | end 137 | 138 | @tag :integration 139 | test "should parse sample data given the students example 2 schema" do 140 | assert {:ok, [row1, row2, row3]} = 141 | DataQuacker.parse( 142 | %{ 143 | headers: ["First name", "Last name", "Age", "Favourite subject"], 144 | rows: [ 145 | ["John", "Smith", "19", "Maths"], 146 | ["Adam", "Johnson", "18", "Physics"], 147 | ["Quackers", "the Duck", "1", "Programming"] 148 | ] 149 | }, 150 | StudentsSchema.schema_structure(:students_example_2), 151 | %{valid_subjects: ["Maths", "Physics", "Programming"]}, 152 | adapter: Identity 153 | ) 154 | 155 | assert row1 == 156 | {:ok, 157 | %{ 158 | age: 1, 159 | favourite_subject: "Programming", 160 | first_name: "Quackers", 161 | last_name: "the Duck" 162 | }} 163 | 164 | assert row2 == 165 | {:ok, 166 | %{age: 18, favourite_subject: "Physics", first_name: "Adam", last_name: "Johnson"}} 167 | 168 | assert row3 == 169 | {:ok, 170 | %{age: 19, favourite_subject: "Maths", first_name: "John", last_name: "Smith"}} 171 | end 172 | 173 | @tag :integration 174 | test "should parse sample data given the students example 4 schema" do 175 | assert {:error, [row1, row2, row3, row4]} = 176 | DataQuacker.parse( 177 | %{ 178 | headers: ["First name", "Last name", "Age", "Favourite subject"], 179 | rows: [ 180 | ["John", "Smith", "19", "Maths"], 181 | ["Adam", "Johnson", "18", "Physics"], 182 | ["Quackers", "the Duck", "1", "Programming"], 183 | ["Mat", "Savage", "100", "None"] 184 | ] 185 | }, 186 | StudentsSchema.schema_structure(:students_example_4), 187 | %{valid_subjects: ["Maths", "Physics", "Programming"]}, 188 | adapter: Identity 189 | ) 190 | 191 | assert row1 == 192 | {:error, 193 | "Invalid favourite subject in row #4, must be one of [\"Maths\", \"Physics\", \"Programming\"]"} 194 | 195 | assert row2 == 196 | {:ok, %{age: 1, favourite_subject: "Programming", full_name: "Quackers 
the Duck"}} 197 | 198 | assert row3 == {:ok, %{age: 18, favourite_subject: "Physics", full_name: "Adam Johnson"}} 199 | assert row4 == {:ok, %{age: 19, favourite_subject: "Maths", full_name: "John Smith"}} 200 | end 201 | end 202 | end 203 | -------------------------------------------------------------------------------- /test/helpers_tests/fun_wrapper_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.FunWrapperHelperTest do 2 | use DataQuacker.Case, async: true 3 | 4 | alias DataQuacker.Schema.WrappedFun 5 | alias DataQuacker.SchemaError 6 | 7 | defmodule SampleWrappedFunctions do 8 | import DataQuacker.Schema.FunWrapper 9 | 10 | @fun0 wrap_fun(fn -> 11 | "no args" 12 | end) 13 | 14 | @fun1 wrap_fun(fn arg1 -> 15 | arg1 16 | end) 17 | 18 | @fun2 wrap_fun(fn arg1, arg2 -> 19 | {arg1, arg2} 20 | end) 21 | 22 | def wrapped_fun0 do 23 | @fun0 24 | end 25 | 26 | def wrapped_fun1 do 27 | @fun1 28 | end 29 | 30 | def wrapped_fun2 do 31 | @fun2 32 | end 33 | end 34 | 35 | describe "wrap_fun/2" do 36 | test "should wrap a function and return a wrapped function struct" do 37 | assert %WrappedFun{callable: fun0, arity: 0} = SampleWrappedFunctions.wrapped_fun0() 38 | 39 | assert fun0.() == "no args" 40 | 41 | assert %WrappedFun{callable: fun1, arity: 1} = SampleWrappedFunctions.wrapped_fun1() 42 | 43 | assert fun1.("a") == "a" 44 | 45 | assert %WrappedFun{callable: fun2, arity: 2} = SampleWrappedFunctions.wrapped_fun2() 46 | 47 | assert fun2.("a", "b") == {"a", "b"} 48 | end 49 | 50 | test "given a function and expected numeric arity should not compile if the function's arity does not match the assertion" do 51 | assert_raise(SchemaError, ~r/unexpected.+arity/si, fn -> 52 | Code.eval_string( 53 | """ 54 | defmodule TestFunWrapper do 55 | import DataQuacker.Schema.FunWrapper 56 | 57 | @fun wrap_fun(fn _ -> nil end, 2) 58 | end 59 | """, 60 | [], 61 | __ENV__ 62 | ) 63 | end) 64 | 65 | assert_raise(SchemaError, 
~r/unexpected.+arity/si, fn -> 66 | Code.eval_string( 67 | """ 68 | defmodule TestFunWrapper do 69 | import DataQuacker.Schema.FunWrapper 70 | 71 | @fun wrap_fun(fn _, _ -> nil end, 1) 72 | end 73 | """, 74 | [], 75 | __ENV__ 76 | ) 77 | end) 78 | end 79 | 80 | test "given a function and expected range of arity should not compile if the function's arity does not match the assertion" do 81 | assert_raise(SchemaError, ~r/unexpected.+arity/si, fn -> 82 | Code.eval_string( 83 | """ 84 | defmodule TestFunWrapper do 85 | import DataQuacker.Schema.FunWrapper 86 | 87 | @fun wrap_fun(fn -> nil end, 1..2) 88 | end 89 | """, 90 | [], 91 | __ENV__ 92 | ) 93 | end) 94 | 95 | assert_raise(SchemaError, ~r/unexpected.+arity/si, fn -> 96 | Code.eval_string( 97 | """ 98 | defmodule TestFunWrapper do 99 | import DataQuacker.Schema.FunWrapper 100 | 101 | @fun wrap_fun(fn _, _, _ -> nil end, 1..2) 102 | end 103 | """, 104 | [], 105 | __ENV__ 106 | ) 107 | end) 108 | end 109 | end 110 | end 111 | -------------------------------------------------------------------------------- /test/schema/state_test.exs: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Schema.StateTest do 2 | use ExUnit.Case, async: true 3 | 4 | alias DataQuacker.Schema.State 5 | 6 | describe "new/0" do 7 | test "should return an empty State struct" do 8 | assert State.new() == %State{ 9 | cursor: [], 10 | flags: %{}, 11 | schema: %{}, 12 | matchers: [], 13 | rows: [], 14 | fields: %{} 15 | } 16 | end 17 | end 18 | 19 | describe "clear_fields/1" do 20 | setup do 21 | {:ok, state: %State{fields: %{a: 1, b: 2, c: 3}}} 22 | end 23 | 24 | test "should clear the fields", %{state: state} do 25 | state = State.clear_fields(state) 26 | 27 | assert state.fields == %{} 28 | end 29 | end 30 | 31 | describe "flag/3" do 32 | setup do 33 | {:ok, state: %State{flags: %{a: true, b: false}}} 34 | end 35 | 36 | test "should put a flag with a value", %{state: state} do 37 | state = 
State.flag(state, :c, true) 38 | 39 | assert state.flags.c == true 40 | end 41 | 42 | test "should replace the value of an already existing flag", %{state: state} do 43 | state = State.flag(state, :b, true) 44 | 45 | assert state.flags.b == true 46 | end 47 | end 48 | 49 | describe "flagged?/2" do 50 | setup do 51 | {:ok, state: %State{flags: %{a: true, b: false}}} 52 | end 53 | 54 | test "should get the value of a flag", %{state: state} do 55 | assert State.flagged?(state, :a) == true 56 | assert State.flagged?(state, :b) == false 57 | end 58 | 59 | test "should return false if a flag does not exist", %{state: state} do 60 | assert State.flagged?(state, :c) == false 61 | end 62 | end 63 | 64 | describe "cursor_at/2" do 65 | setup do 66 | {:ok, 67 | empty_cursor_state: %State{cursor: []}, 68 | state: %State{cursor: [{:field, :sample_field}, {:row, 0}]}} 69 | end 70 | 71 | test "given a state with an empty cursor and compare the needle with nil", %{ 72 | empty_cursor_state: empty_cursor_state 73 | } do 74 | assert State.cursor_at?(empty_cursor_state, nil) == true 75 | assert State.cursor_at?(empty_cursor_state, 123) == false 76 | assert State.cursor_at?(empty_cursor_state, "abc") == false 77 | end 78 | 79 | test "given a cursor should compare the latest pointer's type to the needle", %{ 80 | state: state 81 | } do 82 | assert State.cursor_at?(state, :field) == true 83 | assert State.cursor_at?(state, :row) == false 84 | end 85 | end 86 | 87 | describe "target/1" do 88 | setup do 89 | {:ok, state: %State{cursor: [{:field, :abc}, {:row, 0}, {:schema, :def}]}} 90 | end 91 | 92 | test "should return a list of values at subsequent cursor entries (without the types)", %{ 93 | state: state 94 | } do 95 | assert State.target(state) == [:abc, 0, :def] 96 | end 97 | end 98 | 99 | describe "cursor_exit/1" do 100 | setup do 101 | {:ok, state: %State{cursor: [{:field, :abc}, {:row, 0}, {:schema, :def}]}} 102 | end 103 | 104 | test "should drop the cursor's head", %{state: state} 
do 105 | assert State.cursor_exit(state) == %State{cursor: [{:row, 0}, {:schema, :def}]} 106 | end 107 | 108 | test "given the exit level should the cursor's first n elements", %{state: state} do 109 | assert State.cursor_exit(state, 2) == %State{cursor: [{:schema, :def}]} 110 | end 111 | end 112 | 113 | describe "register/3" do 114 | setup do 115 | blank_state = %State{} 116 | state_with_schema = State.register(blank_state, :schema, {:abc, %{}}) 117 | state_with_row = State.register(state_with_schema, :row, {0, %{}}) 118 | state_with_field = State.register(state_with_row, :field, {:def, %{}}) 119 | 120 | {:ok, 121 | blank_state: blank_state, 122 | state_with_schema: state_with_schema, 123 | state_with_row: state_with_row, 124 | state_with_field: state_with_field} 125 | end 126 | 127 | test "given a schema should add the schema merged with the default to the state", %{ 128 | blank_state: state 129 | } do 130 | assert %State{cursor: [{:schema, :abc}], schema: %{__name__: :abc, matchers: [], rows: []}} = 131 | State.register(state, :schema, {:abc, %{}}) 132 | end 133 | 134 | test "given a row should add the row merged with the default to the state", %{ 135 | state_with_schema: state 136 | } do 137 | assert %State{ 138 | cursor: [{:row, 0}, {:schema, :abc}], 139 | rows: [ 140 | %{__index__: 0, fields: %{}, transformers: [], skip_if: nil, validators: []} 141 | ] 142 | } = State.register(state, :row, {0, %{}}) 143 | end 144 | 145 | test "given a field should add the field merged with the default to the state", %{ 146 | state_with_row: state 147 | } do 148 | assert %State{ 149 | cursor: [{:field, :def}, {:row, 0}, {:schema, :abc}], 150 | fields: %{ 151 | def: %{ 152 | __name__: :def, 153 | __type__: nil, 154 | transformers: [], 155 | skip_if: nil, 156 | source: nil, 157 | subfields: %{}, 158 | validators: [] 159 | } 160 | } 161 | } = State.register(state, :field, {:def, %{}}) 162 | end 163 | 164 | test "given a field when the cursor is already at a field should add the 
field merged with the default to the state as a subfield", 165 | %{ 166 | state_with_field: state 167 | } do 168 | assert %State{ 169 | cursor: [{:field, :ghi}, {:field, :def}, {:row, 0}, {:schema, :abc}], 170 | fields: %{ 171 | def: %{ 172 | subfields: %{ 173 | ghi: %{ 174 | __name__: :ghi, 175 | __type__: nil, 176 | transformers: [], 177 | skip_if: nil, 178 | source: nil, 179 | subfields: %{}, 180 | validators: [] 181 | } 182 | } 183 | } 184 | } 185 | } = State.register(state, :field, {:ghi, %{}}) 186 | end 187 | 188 | test "given a matcher should add the matcher merged with the default to the state with the current cursor as the target", 189 | %{state_with_field: state} do 190 | assert %State{matchers: [%{rule: "some rule", target: target}]} = 191 | State.register(state, :matcher, "some rule") 192 | 193 | assert target == State.target(state) 194 | end 195 | end 196 | 197 | describe "update/3" do 198 | setup do 199 | state_with_schema = State.register(%State{}, :schema, {:abc, %{}}) 200 | state_with_row = State.register(state_with_schema, :row, {0, %{}}) 201 | state_with_field = State.register(state_with_row, :field, {:def, %{}}) 202 | state_with_nested_field = State.register(state_with_field, :field, {:ghi, %{}}) 203 | 204 | {:ok, 205 | state_with_schema: state_with_schema, 206 | state_with_row: state_with_row, 207 | state_with_field: state_with_field, 208 | state_with_nested_field: state_with_nested_field} 209 | end 210 | 211 | test "given a schema should update the existing schema", %{ 212 | state_with_schema: state 213 | } do 214 | assert %State{schema: %{some_field: 123}} = State.update(state, :schema, %{some_field: 123}) 215 | end 216 | 217 | test "given a row should update the row that the cursor is pointing at", %{ 218 | state_with_row: state 219 | } do 220 | assert %State{rows: [%{some_field: 123}]} = State.update(state, :row, %{some_field: 123}) 221 | end 222 | 223 | test "given a field should update the field the cursor is pointing at", %{ 224 | 
state_with_field: state 225 | } do 226 | assert %State{fields: %{def: %{some_field: 123}}} = 227 | State.update(state, :field, %{some_field: 123}) 228 | end 229 | 230 | test "given a field should update the field the cursor is pointing at (nested)", 231 | %{ 232 | state_with_nested_field: state 233 | } do 234 | assert %State{fields: %{def: %{subfields: %{ghi: %{some_field: 123}}}}} = 235 | State.update(state, :field, %{some_field: 123}) 236 | end 237 | end 238 | 239 | describe "get/2" do 240 | setup do 241 | state_with_schema = State.register(%State{}, :schema, {:abc, %{}}) 242 | state_with_row = State.register(state_with_schema, :row, {0, %{}}) 243 | state_with_field = State.register(state_with_row, :field, {:def, %{}}) 244 | state_with_nested_field = State.register(state_with_field, :field, {:ghi, %{}}) 245 | 246 | {:ok, 247 | state_with_schema: state_with_schema, 248 | state_with_row: state_with_row, 249 | state_with_field: state_with_field, 250 | state_with_nested_field: state_with_nested_field} 251 | end 252 | 253 | test "should return a row if requested", %{state_with_row: state_with_row} do 254 | assert State.get(state_with_row, :row) == Enum.at(state_with_row.rows, 0) 255 | end 256 | 257 | test "should return a field if requested", %{state_with_field: state_with_field} do 258 | assert State.get(state_with_field, :field) == Map.get(state_with_field.fields, :def) 259 | end 260 | 261 | test "should return a nested field if requested", %{ 262 | state_with_nested_field: state_with_nested_field 263 | } do 264 | assert State.get(state_with_nested_field, :field) == 265 | get_in(state_with_nested_field.fields, [:def, :subfields, :ghi]) 266 | end 267 | end 268 | end 269 | -------------------------------------------------------------------------------- /test/support/case.ex: -------------------------------------------------------------------------------- 1 | defmodule DataQuacker.Case do 2 | @moduledoc false 3 | 4 | use ExUnit.CaseTemplate 5 | 6 | using do 7 | quote 
do 8 | import DataQuacker.Case 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /test/support/mock_file_manager.ex: -------------------------------------------------------------------------------- 1 | # defmodule DataQuacker.MockFileManager do 2 | # @behaviour DataQuacker.FileManager 3 | # 4 | # @impl true 5 | # def stream!(path, _modes \\ [], _line_or_bytes \\ :line) do 6 | # %Stream{} 7 | # end 8 | # end 9 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | Mox.defmock(DataQuacker.MockFileManager, for: DataQuacker.FileManager) 2 | 3 | Application.put_env(:data_quacker, :file_manager, DataQuacker.MockFileManager) 4 | 5 | ExUnit.start() 6 | --------------------------------------------------------------------------------