├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ ├── maintenance_task.md │ └── security_issue.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── ci.yml │ └── codeql-analysis.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── SECURITY.md ├── db ├── dot │ ├── .gitkeep │ ├── 1-schema.sql │ ├── 2-upload_static_data.sql │ ├── 3-demo_data.sql │ └── 4-upload_sample_dot_data.sql └── fake_data_generator.py ├── docker ├── .gitignore ├── airflow │ ├── dags │ │ ├── dot_projects.json │ │ └── run_dot_project.py │ └── scripts │ │ ├── .gitkeep │ │ └── entrypoint.sh ├── appsmith │ ├── DOT App V2.json │ └── stacks │ │ └── .gitignore ├── demo │ └── .gitkeep ├── docker-compose-demo.yml ├── docker-compose-with-airflow.yml ├── docker-compose.yml ├── dot │ ├── Dockerfile │ └── dot_config.yml └── run_demo.py ├── dot ├── .gitignore ├── __init__.py ├── config │ ├── README.md │ ├── example │ │ ├── dot_config.yml │ │ ├── project_name │ │ │ └── dbt │ │ │ │ └── dbt_project.yml │ │ └── self_tests │ │ │ ├── dbt │ │ │ ├── dbt_project.yml │ │ │ └── profiles_github.yml │ │ │ ├── dot_config_docker.yml │ │ │ ├── dot_config_github.yml │ │ │ └── dot_config_local.yml │ └── templates │ │ ├── dbt │ │ ├── dbt_project.yml │ │ └── profiles.yml │ │ └── great_expectations │ │ ├── batch_config.json │ │ ├── config_variables.yml │ │ └── great_expectations.yml ├── dbt │ ├── .gitignore │ ├── analysis │ │ └── .gitkeep │ ├── data │ │ └── .gitkeep │ ├── macros │ │ ├── .gitkeep │ │ ├── filter_by_list.sql │ │ ├── filter_by_word.sql │ │ ├── get_column_name.sql │ │ ├── get_column_names.sql │ │ ├── get_relations.sql │ │ ├── test_associated_columns_not_null.sql │ │ ├── test_expression_is_true.sql │ │ ├── test_no_impossible_values.sql │ │ ├── test_not_less_than_or_equal_zero.sql │ │ ├── test_not_negative_string_column.sql │ │ ├── test_possible_duplicate_forms.sql │ │ ├── test_relationships.sql │ │ └── test_valid_date.sql │ ├── packages.yml │ └── snapshots │ │ └── .gitkeep ├── great_expectations │ ├── .gitignore │ ├── checkpoints │ │ └── iop_tool_checkpoint.yml │ ├── expectations │ │ └── .gitignore │ ├── notebooks │ │ └── sql │ │ │ └── validation_playground.ipynb │ ├── plugins │ │ ├── custom_data_docs │ │ │ └── styles │ │ │ │ └── data_docs_custom_styles.css │ │ └── custom_expectations │ │ │ └── custom_dataset.py │ └── readme.md ├── install_dot.sh ├── logs │ └── .gitignore ├── requirements_dot.txt ├── run_everything.py ├── self_tests │ ├── __init__.py │ ├── data │ │ ├── base_self_test │ │ │ └── .gitkeep │ │ ├── dot_input_files │ │ │ └── dbt │ │ │ │ ├── core │ │ │ │ ├── dot_model__airlines_data.sql │ │ │ │ ├── dot_model__all_airports_data.sql │ │ │ │ ├── dot_model__all_airports_data.yml │ │ │ │ ├── dot_model__all_flight_data.sql │ │ │ │ ├── dot_model__all_flight_data.yml │ │ │ │ ├── dot_model__ethiopia_airlines_data.sql │ │ │ │ └── dot_model__zagreb_flight_data.sql │ │ │ │ └── test │ │ │ │ └── .gitkeep │ │ ├── dot_output_files │ │ │ └── dbt │ │ │ │ ├── manifest_node_ex_non_negative_string_column.json │ │ │ │ └── target │ │ │ │ ├── manifest_archive.json │ │ │ │ ├── manifest_test.json │ │ │ │ ├── run_results_archive.json │ │ │ │ └── run_results_test.json │ │ ├── expected │ │ │ ├── dot_model__all_flight_data.sql │ │ │ ├── extract_df_from_dbt_test_results_json.csv │ │ │ ├── get_test_parameters_non_negative_string_column.json │ │ │ ├── integration │ │ │ │ ├── test_results.csv │ │ │ │ └── test_results_summary.csv │ │ │ └── read_dbt_output_files.json │ │ ├── queries │ 
│ │ └── dbt_core_generated_objects.sql │ │ └── test_configuration_utils │ │ │ └── dot_config.yml │ ├── integration │ │ ├── __init__.py │ │ └── test_run_dot_tests.py │ ├── self_tests_utils │ │ ├── __init__.py │ │ ├── base_self_test_class.py │ │ └── dbt_base_safe_test_class.py │ └── unit │ │ ├── __init__.py │ │ ├── test_configuration_utils.py │ │ ├── test_connection_utils.py │ │ ├── test_core_entities_creation.py │ │ ├── test_dbt.py │ │ ├── test_dbt_logs.py │ │ ├── test_dbt_logs_safe.py │ │ ├── test_dot_utils.py │ │ ├── test_dot_utils_schema_improved.py │ │ └── test_generate_tests_from_db.py └── utils │ ├── __init__.py │ ├── configuration_management.py │ ├── configuration_utils.py │ ├── connection_utils.py │ ├── dbt.py │ ├── dbt_logs.py │ ├── great_expectations.py │ ├── run_management.py │ └── utils.py ├── environment.yml ├── images ├── db_schema.png ├── dot.png └── dot_logo.png ├── lint.py └── setup_hooks.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/maintenance_task.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Maintenance task
3 | about: Suggest a maintenance task
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Describe your maintenance task**
11 | A clear and concise description of what you propose and why it will improve the codebase [...]
12 | 
13 | **Additional context**
14 | Add any other context or screenshots about the maintenance task here.
15 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/security_issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Security issue
3 | about: Report a security issue
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Describe the bug**
11 | 
12 | **Security issue description**
13 | 
14 | *Describe the security issue here*
15 | 
16 | **Impact**
17 | 
18 | *What impact does this security issue have?*
19 | 
20 | **Mitigation**
21 | 
22 | *If you have suggestions to mitigate the issue, please provide them here*
23 | 
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Fixes # 
2 | 
3 | ## Proposed Changes
4 | 
5 | -
6 | -
7 | -
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Lint & self tests
2 | 
3 | 
4 | on: [push, pull_request]
5 | 
6 | jobs:
7 |   lint:
8 |     runs-on: ubuntu-latest
9 | 
10 |     services:
11 |       postgres:
12 |         image: postgres:latest
13 |         env:
14 |           POSTGRES_DB: dot_db
15 |           POSTGRES_PASSWORD: postgres
16 |           POSTGRES_USER: postgres
17 |         ports:
18 |           - 5432:5432
19 |         # Set health checks to wait until postgres has started
20 |         options: >-
21 |           --health-cmd pg_isready
22 |           --health-interval 10s
23 |           --health-timeout 5s
24 |           --health-retries 5
25 | 
26 |     steps:
27 |       - uses: actions/checkout@v2
28 |       - uses: actions/setup-python@v2
29 |         with:
30 |           python-version: '3.8'
31 |       - name: black, lint & self_tests (pre-commit actions)
32 |         run: |
33 |           python -m pip install --upgrade pip
34 |           pip install -r dot/requirements_dot.txt
35 |           cp dot/config/example/self_tests/dot_config_github.yml dot/self_tests/data/base_self_test/dot_config.yml
36 |           mkdir -p $HOME/.dbt
37 |           cp dot/config/example/self_tests/dbt/profiles_github.yml $HOME/.dbt/profiles.yml
38 |           ./setup_hooks.sh
39 |           .git/hooks/pre-commit
40 | 
41 | 
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '17 12 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dot/config/dot_config.yml 2 | dot/dbt/dbt_project.yml 3 | dot/dbt/models/ 4 | dot/dbt/tests/ 5 | dot/generated_files/ 6 | dot/great_expectations/batch_config.json 7 | dot/great_expectations/great_expectations.yml 8 | 9 | dot/self_tests/data/base_self_test/dot_config.yml 10 | dot/self_tests/output/ 11 | dot/dbt/models_self_tests 12 | docker/db/ 13 | docker/demo/appsmith 14 | docker/demo/db 15 | docker/cookie 16 | .DS_Store 17 | 18 | .idea* 19 | __pycache__ 20 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # DataKind Code of Conduct 2 | 3 | In order to ensure that all participants experience a positive and respectful DataKind community, we reinforce and protect our respectful and safe environment by requiring that all community members commit to a Code of Conduct. 
This Code of Conduct demonstrates [DataKind’s values](https://www.datakind.org/our-story), as well as how we’re accountable to each other, in a way that promotes shared understanding. We also provide transparency on how to identify and address situations if there are concerns that this code has been breached. DataKind commits to enforcing our Code of Conduct and to periodically reviewing our Code of Conduct to ensure DataKind remains a safe environment. Our Code of Conduct applies to any interactions within the DataKind community including all events (in-person or virtual) hosted by DataKind, shared online spaces (email, Slack, etc.), social media, and external events where volunteers or staff are representing DataKind as a speaker. 4 | 5 | ## Expected Behaviors 6 | One of our favorite mottos at DataKind is “be tough on ideas, gentle on people.” People are at the center of all that we do, so it’s critical that we have mutual respect for one another and aim to exemplify our core values. Whether in an in-person or virtual environment, we want to provide a space where people are able to express their skills, curiosity, perspectives, and personality. 7 | 8 | When you volunteer at DataKind, you aim to demonstrate our core values: 9 | 10 | ### Humanity 11 | We believe that the top priority of data science and AI should be to improve quality of life for people. 12 | * Data science should be used where it improves access to information and services. We prioritize projects where responsible data science or AI provide the most impactful interventions. 13 | * We work together in ways that recognize our shared humanity, making space for connection, joy, trust, and rest. 14 | * We believe that being clear is being kind. We are tough on ideas, but gentle on people. 15 | 16 | ### Equity 17 | We aspire to close gaps in information and access by tailoring our work to the needs and perspectives of the people who will use it. 18 | * We co-develop the vision for how solutions will address community-identified needs, prioritizing groups that have historically been excluded from data science and AI. 19 | * We build partnerships for reciprocal learning and seek to amplify our complementary strengths. 20 | * We prioritize ethics, privacy, anti-racism, and risk management from the start of any data science and AI work. 21 | 22 | ### Innovation 23 | We bring curiosity and creativity to the world’s toughest challenges, building responsive tools and solutions with shared expertise. 24 | * We are nimble and adaptive, testing new approaches and generating insights. We aspire to continuous learning and free and responsible sharing. 25 | * We design our work in collaboration with others, building shared buy-in and knowledge for maximum impact. We approach new topics and projects with humility, ready to admit when we're wrong and eager to thoughtfully change course. 26 | * We are rigorous and feedback-driven. We continually raise the bar for technical and ethical excellence. 27 | 28 | ## Unacceptable Behaviors 29 | Discrimination and harassment based on race, gender identities, gender expressions, sexual orientations, abilities, physical appearances, socioeconomic backgrounds, nationalities, ages, religions, or beliefs are expressly prohibited in our community. As a member of the DataKind community, you agree **not** to: 30 | 31 | * Make assumptions about a person’s level of expertise. 32 | * Insult other people. 33 | * Harass DataKind staff, volunteers, community members, or participants. 
Harassment includes things like: 34 | * Violent threats, actions, or language directed against another person 35 | * Intimidation; stalking; unwanted photography; inappropriate physical contacts 36 | * Discriminatory or derogatory language or actions 37 | * Posting or use of sexual, discriminatory, or violent imagery, comments, jokes, or language 38 | * Unwelcome sexual attention 39 | * Bullying 40 | * The use of inflammatory or off-topic language to provoke hostility or conflict instead of civil discourse (i.e. trolling) 41 | * Sustained disruption of an event 42 | * Question or challenge someone’s self-identity or chosen labels 43 | * Purposely share any aspect of a person’s identity without their consent, except as necessary to protect people from intentional abuse 44 | * Advocate for, or encourage, any of the above behavior 45 | 46 | Any reported violation of the Code of Conduct will be reviewed and disciplinary action could include actions up to DataKind terminating collaboration and/or banning any future engagement activities. 47 | 48 | ## Contact Us 49 | If someone has made you feel uncomfortable for any reason or if you believe someone is violating the Code of Conduct, we want to hear from you. You can submit an incident form [here](https://forms.gle/158HYrYNf95NQBo69). 50 | 51 | ## DataKind’s Commitment to Privacy 52 | Our overall commitment to a safe environment through this Code of Conduct includes our privacy statement. If you have questions or concerns, please contact us at compliance@datakind.org. 53 | 54 | Acknowledgement 55 | We’d like to acknowledge our volunteer community, staff, and the other organizations that inspired updates to this Code of Conduct: [R4DS Online Learning Community](https://www.rfordatasci.com/conduct/), [Carpentries](https://docs.google.com/forms/d/e/1FAIpQLSdi0wbplgdydl_6rkVtBIVWbb9YNOHQP_XaANDClmVNu0zs-w/viewform), [Buffer](https://buffer.com/resources/code-of-conduct/), [Slack](https://api.slack.com/community/code-of-conduct), and [LGBTQ in Technology](https://lgbtq.technology/coc.html). 56 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DOT 2 | 3 | Hi! Thanks for your interest in contributing to DOT, we're really excited to see you! In this document we'll try to 4 | summarize everything that you need to know to do a good job. 5 | 6 | ## New contributor guide 7 | 8 | To get an overview of the project, please read the [README](README.md) and our [Code of Conduct](./CODE_OF_CONDUCT.md) to keep our community approachable and respectable. 9 | 10 | 11 | ## Getting started 12 | ### Creating Issues 13 | 14 | If you spot a problem, [search if an issue already exists](https://github.com/datakind/Data-Observation-Toolkit/issues). If a related issue doesn't exist, 15 | you can open a new issue using a relevant [issue form](https://github.com/datakind/Data-Observation-Toolkit/issues/new). 16 | 17 | As a general rule, we don’t assign issues to anyone. If you find an issue to work on, you are welcome to open a PR with a fix. 18 | 19 | ### More complex configuration options 20 | 21 | All the configuration files must be located under the [config](dot/config) folder of the DOT. 22 | 23 | ### Main config file 24 | 25 | The main config file must be called `dot_config.yml` and located at the top [config](dot/config) folder. Note that 26 | this file will be ignored for version control. 
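For illustration only, a sketch of what a minimal `dot_config.yml` could look like is shown below. The key names follow the descriptions later in this section, but the values are placeholders and the exact layout (including where the optional keys live) should be taken from the example file referenced next, not from this sketch:

```yaml
# Illustrative sketch only -- see dot/config/example/dot_config.yml for the authoritative layout.
# Connection details for one project, as described under "Connection parameters" below;
# "my_project" and all values here are placeholders, not a working configuration.
my_project_db:
  type: postgres
  host: localhost
  user: postgres
  pass: changeme
  port: 5432
  dbname: dot_db
  schema: public
  threads: 4

# Optional keys described below: write test outputs to <schema>_tests and keep passing results too.
output_schema_suffix: tests
save_passed_tests: true
```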
You may use the [example dot_config yaml](dot/config/example/dot_config.yml)
27 | as a template.
28 | 
29 | Besides the DOT DB connection in the paragraph above, see below for additional config options.
30 | 
31 | #### Connection parameters for each of the projects to run
32 | 
33 | For each of the projects you would like to run, add a key to the DOT config yaml with the following structure:
34 | ```
35 | <project_name>_db:
36 |   type: connection type e.g. postgres
37 |   host: host
38 |   user: username
39 |   pass: password
40 |   port: port number e.g. 5432
41 |   dbname: database name
42 |   schema: schema name, e.g. public
43 |   threads: number of threads for DBT, e.g. 4
44 | ```
45 | 
46 | #### Output schema suffix
47 | 
48 | The DOT generates two kinds of database objects:
49 | - Entities of the models that are being tested, e.g. assessments, follow ups, patients
50 | - Results of the failing tests
51 | 
52 | If nothing is done, these objects would be created in the same schema as the original data for the project
53 | (thus polluting the DB). If the key `output_schema_suffix` is added, its value will be added as a suffix; i.e. if the
54 | project data is stored in a certain schema, the output objects will go to `<schema>_<suffix>`
55 | (e.g. to `public_tests` if the project schema is `public` and the suffix is set to `tests` in the lines above).
56 | 
57 | Note that this mechanism uses a DBT feature, and that the same applies to the GE tests.
58 | 
59 | #### Save passed tests
60 | 
61 | The key `save_passed_tests` accepts boolean values. If set to true, the results of the passing tests will also be stored
62 | in the DOT DB. If not, only the results of failing tests will be stored.
63 | 
64 | ### Other config file locations
65 | Optional configuration for DBT and Great Expectations can be added, per project, in a structure as follows.
66 | 
67 | ```bash
68 | |____config
69 | | |____<project_name>
70 | | | |____dbt
71 | | | | |____profiles.yml
72 | | | | |____dbt_project.yml
73 | | | |____ge
74 | | | | |____great_expectations.yml
75 | | | | |____config_variables.yml
76 | | | | |____batch_config.json
77 | ```
78 | In general these customizations will not be needed, except in some scenarios with particular requirements; these
79 | require a deeper knowledge of the DOT and of DBT and/or Great Expectations.
80 | 
81 | There are examples for all the files above under [this folder](dot/config/example/project_name). For each of the
82 | files you want to customize, you may copy and adapt the examples provided following the directory structure above.
83 | 
84 | More details in the [config README](dot/config/README.md).
85 | 
86 | ## Making Code changes
87 | 
88 | ## Setting up a Development Environment
89 | 
90 | To set up your local development environment for contributing, follow the steps
91 | in the paragraphs below.
92 | 
93 | The easiest way to develop DOT is to use the provided Docker environment; see [README](./README.md) for more details.
94 | This comes with the user interface and Postgres database included. Self tests will also work there, so we encourage
95 | using this environment if you can. The Docker image will mount your filesystem, so changes to files
96 | will be reflected in the running instance of DOT and its user interface.
97 | 
98 | #### Running DOT without using Docker
99 | 
100 | If you wish to build locally, then:
101 | 
102 | 1. Install [miniconda](https://docs.conda.io/en/latest/miniconda.html) by selecting the installer that fits your OS version.
Once it is installed you may have to restart your terminal (closing your terminal and opening again)
103 | 2. In this directory, open terminal
104 | 3. `conda env create -f environment.yml`
105 | 4. `conda activate dot_conda_env`
106 | 5. You will need a postgres database called 'dot_db'. To populate objects, run the scripts in [./db/dot](./db/dot) sequentially.
107 | 6. Update your [./dot/config/dot_config.yml](./dot/config/dot_config.yml) to point at your local database
108 | 7. Create a config file for the database connection details, located at the directory `dot/self_tests/data/base_self_test`.
109 | 
110 | #### Running unit tests
111 | 
112 | Run the following and check that all tests pass:
113 | ```
114 | pytest dot/self_tests/unit
115 | ```
116 | 
117 | You can also run
118 | ```
119 | git commit
120 | ```
121 | since you have added the `Code Quality` tools referenced in the main README as a pre-commit hook,
122 | together with the self-tests.
123 | 
124 | ### GitHub Workflow
125 | 
126 | Like many other open source projects, we use the famous
127 | [gitflow](https://nvie.com/posts/a-successful-git-branching-model/) to manage our
128 | branches.
129 | 
130 | Summary of our git branching model (a concrete example is sketched after the Tips below):
131 | - Get all the latest work from the upstream `datakind/Data-Observation-Toolkit` repository
132 | (`git checkout main`)
133 | - Create a new branch with a descriptive name (for example:
134 | `feature/new-test-macro`, `bugfix/bug-when-uploading-results`). You can
135 | do it with (`git checkout -b <branch-name>`)
136 | - Make your changes and commit them locally (`git add <files changed>`,
137 | `git commit -m "Add some change"`). Whenever you commit, the self-tests
138 | and code quality checks will kick in; fix anything that gets broken
139 | - Push to your branch on GitHub (with the same name as your local branch:
140 | `git push origin <branch-name>`). This will output a URL for creating a Pull Request (PR)
141 | - Create a pull request by opening the URL in a browser. You can also create PRs in the GitHub
142 | interface, choosing your branch to merge into main
143 | - Wait for comments and respond as-needed
144 | - Once PR review is complete, your code will be merged. Thanks!!
145 | 
146 | 
147 | ### Tips
148 | 
149 | - Write [helpful commit
150 | messages](https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message)
151 | - Anything in your branch must have no failing tests. You can check by looking at your PR
152 | online in GitHub
153 | - Never use `git add .`: it can add unwanted files;
154 | - Avoid using `git commit -a` unless you know what you're doing;
155 | - Check every change with `git diff` before adding it to the index (stage
156 | area) and with `git diff --cached` before committing;
157 | - If you have push access to the main repository, please do not commit directly
158 | to `dev`: your access should be used only to accept pull requests; if you
159 | want to make a new feature, you should use the same process as other
160 | developers so your code will be reviewed.
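To make the branching workflow above concrete, a typical contribution cycle might look like the sketch below. The branch name, file path and commit message are purely illustrative:

```bash
# get the latest work from the upstream repository
git checkout main
git pull origin main

# create a descriptively named branch for your change
git checkout -b feature/new-test-macro

# edit files, then stage and commit only what you changed
# (the pre-commit hook runs the code quality checks and the self tests)
git add dot/dbt/macros/test_valid_date.sql   # illustrative path
git commit -m "Improve valid date test macro"

# push the branch and open a Pull Request from the URL that git prints
git push origin feature/new-test-macro
```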
161 | 
162 | 
163 | ## Code Guidelines
164 | 
165 | - Use [PEP8](https://www.python.org/dev/peps/pep-0008/);
166 | - Write tests for your new features (please see "Tests" topic below);
167 | - Always remember that [commented code is dead
168 | code](https://blog.codinghorror.com/coding-without-comments/);
169 | - Name identifiers (variables, classes, functions, module names) with readable
170 | names (`x` is always wrong);
171 | - When manipulating strings, we prefer either [f-string
172 | formatting](https://docs.python.org/3/tutorial/inputoutput.html#formatted-string-literals)
173 | (`f'{a} = {b}'`) or [new-style
174 | formatting](https://docs.python.org/library/string.html#format-string-syntax)
175 | (`'{} = {}'.format(a, b)`), instead of the old-style formatting (`'%s = %s' % (a, b)`);
176 | - You will know if any test breaks when you commit, and the tests will be run
177 | again in the continuous integration pipeline (see below);
178 | 
179 | ## Tests
180 | 
181 | You should write tests for every feature you add or bug you solve in the code.
182 | Having automated tests for every line of our code lets us make big changes
183 | without worries: there will always be tests to verify if the changes introduced
184 | bugs or lack of features. If we don't have tests we will be blind and every
185 | change will come with some fear of possibly breaking something.
186 | 
187 | For a better design of your code, we recommend using a technique called
188 | [test-driven development](https://en.wikipedia.org/wiki/Test-driven_development),
189 | where you write your tests **before** writing the actual code that implements
190 | the desired feature.
191 | 
192 | You can use `pytest` to run your tests, no matter which type of test it is.
193 | 
194 | 
195 | ## Continuous Integration
196 | 
197 | We use [GitHub Actions](https://github.com/datakind/Data-Observation-Toolkit/actions)
198 | for continuous integration.
199 | See [here](https://docs.github.com/en/actions) for GitHub's documentation.
200 | 
201 | The [`.github/workflows/ci.yml`](.github/workflows/ci.yml) file configures the CI.
202 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2022 DataKind
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
6 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation
7 | the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
8 | to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11 | 
12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
13 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
15 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | If we verify a reported security vulnerability, our policy is: 4 | 5 | - We will patch the current release branch, as well as the immediate prior minor release branch. 6 | 7 | - After patching the release branches, we will immediately issue new security fix releases for each patched release branch. 8 | 9 | ## Reporting a Security Issue 10 | 11 | To report any security issues, please [raise an issue](https://github.com/datakind/Data-Observation-Toolkit/issues/new/choose) and select **Security issue* 12 | -------------------------------------------------------------------------------- /db/dot/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/db/dot/.gitkeep -------------------------------------------------------------------------------- /db/dot/2-upload_static_data.sql: -------------------------------------------------------------------------------- 1 | 2 | -- dot.scenarios 3 | INSERT INTO dot.scenarios VALUES('MISSING-1', 'Missing fields', 'Data entry error', 'Form data entry error', 'Null fields', 'Blank fields'); 4 | INSERT INTO dot.scenarios VALUES('INCONSISTENT-1', 'Inconsistent data', 'Data entry error', 'Form data entry error', 'Outliers', 'Jaundice alert=No when fever+jaundice; Incorrect LMP, wrong visit dates'); 5 | INSERT INTO dot.scenarios VALUES('INCONSISTENT-2', 'Inconsistent data', 'Data entry error', 'Time/Date incorrect on phone', 'Date logic issues, outliers', ''); 6 | INSERT INTO dot.scenarios VALUES('FAKE-1', 'Fake data', 'Data entry error', 'Fake data entered into forms', 'Outliers', ''); 7 | INSERT INTO dot.scenarios VALUES('DUPLICATE-1', 'Duplicate data', 'Data entry error', 'Duplicate data entered', 'Duplicate records', 'Multiple person records for the same person'); 8 | INSERT INTO dot.scenarios VALUES('BIAS-1', 'Miscalibrated instruments', 'Data entry error', 'Measurement bias', 'Outliers', 'Thermometer bias'); 9 | INSERT INTO dot.scenarios VALUES('BIAS-2', 'CHW Training issues', 'Data entry error', 'Measurement bias', 'Outliers', 'Breath counts not measured correctly'); 10 | INSERT INTO dot.scenarios VALUES('BUGS-1', 'Foreign keys errors', 'Software bugs', 'Database bugs', 'Foreign key errors', ''); 11 | INSERT INTO dot.scenarios VALUES('BUGS-2', 'Inconsistent field formats ', 'Software bugs', 'Database bugs', 'Inconsistent field formats ', 'During form updates/modifications the app developer changes the response type of a field'); 12 | INSERT INTO dot.scenarios VALUES('BUGS-3', 'Data category errors ', 'Software bugs', 'Database bugs', 'Category distribution changes over time', '‘Male’ mixed with ‘man’ instead of all ‘Male'''); 13 | INSERT INTO dot.scenarios VALUES('BUGS-4', 'Field name changes', 'Software bugs', 'Application bugs', 'Field data changes over time', 'During form updates/modifications the app developer renames a form field name or changes the response type of a field'); 14 | INSERT INTO dot.scenarios VALUES('BUGS-5', 'Incorrect metrics/indicators calculation/aggregation', 'Software bugs', 'Reporting bugs', 'Errors in calculated metrics', 'Technical debt complexity in pregnancy national metrics'); 15 | INSERT INTO dot.scenarios VALUES('MISSED-1', 'Missed person or patient', 'Process errors', 'Missed 
task', 'Houses in district not included', 'Unvisited household'); 16 | INSERT INTO dot.scenarios VALUES('MISSED-2', 'Missed assessment/report', 'Process errors', 'Missed task', 'Missed follow-up forms', 'Missed pregnancy; Missed delivery report'); 17 | INSERT INTO dot.scenarios VALUES('MISSED-3', 'Missed followup', 'Process errors', 'Missed task', 'Inconsistent patterns in follow-up data', ''); 18 | INSERT INTO dot.scenarios VALUES('MISSED-4', 'Missed referral visit', 'Process errors', 'Missed task', 'Missing referral visits', 'Patient referred but doesn''t attend'); 19 | INSERT INTO dot.scenarios VALUES('MISSED-5', 'Missed treatment', 'Process errors', 'Missed task', 'Inconsistencies in treatment data, outliers', 'No Malaria treatment after diagnosis; Underreporting immunization'); 20 | INSERT INTO dot.scenarios VALUES('MISSED-6', 'Missed CHW supervision', 'Process errors', 'Missed task', 'Missed supervision forms', ''); 21 | INSERT INTO dot.scenarios VALUES('MISSED-7', 'Missed family planning', 'Process errors', 'Missed task', 'No FP for relevant househould, outliers', ''); 22 | INSERT INTO dot.scenarios VALUES('FOLLOWUP-1', 'Unrealistically fast followup', 'Process errors', 'Incorrect followup', 'Unrealistically fast followups', ''); 23 | INSERT INTO dot.scenarios VALUES('MULTIEVENTS-1', 'Mutiple same day events', 'Process errors', 'Multiple events', 'Mutiple same day events', ''); 24 | INSERT INTO dot.scenarios VALUES('ASSESS-1', 'Inconsistent data', 'Process errors', 'Incorrect assessment', 'Outliers', 'Jaundice alert=No when fever+jaundice; '); 25 | INSERT INTO dot.scenarios VALUES('TREAT-1', 'Incorrect treatment', 'Process errors', 'Incorrect treatment', 'Outliers', 'Drug protocol not followed for Malaria treatment; FP for people on tubal ligation, pregnant or had vasectomy'); 26 | 27 | -- dot.test_types 28 | INSERT INTO dot.test_types VALUES('relationships', 'dbt', 'Test missing relationships between records', 'multi_table', true, true); 29 | INSERT INTO dot.test_types VALUES('unique', 'dbt', 'Test to confirm uniqueness ', 'column', false,true); 30 | INSERT INTO dot.test_types VALUES('not_negative_string_column', 'dbt', 'Test to confirm all positive', 'column', false, true); 31 | INSERT INTO dot.test_types VALUES('not_null', 'dbt', 'Test to confirm if null', 'column', false, true); 32 | INSERT INTO dot.test_types VALUES('accepted_values', 'dbt', 'Test to confirm values adhere to specified list', 'column', true, true); 33 | INSERT INTO dot.test_types VALUES('custom_sql', 'dbt', 'Custom SQL, if rows returned test failed', 'any', true, false); 34 | INSERT INTO dot.test_types VALUES('possible_duplicate_forms', 'dbt', 'Test to confirm duplicate records', 'single_table', true, false); 35 | INSERT INTO dot.test_types VALUES('associated_columns_not_null', 'dbt', 'Test to confirm related columns not null', 'column', false, true); 36 | INSERT INTO dot.test_types VALUES('expect_similar_means_across_reporters', 'great_expectations', 'Test to compare means across reporters (eg of temperature)', 'column', true, false); 37 | INSERT INTO dot.test_types VALUES('expression_is_true', 'dbt', 'Test to confirm a value of an expression given a condition', 'any', true, false); 38 | 39 | 40 | -- dot.test_parameters_interface 41 | -- INSERT INTO dot.test_parameters_interface VALUES('relationships', 'name', 'function_argument', 'Name of the test'); 42 | INSERT INTO dot.test_parameters_interface VALUES('relationships', 'reference', 'view/table', $$ref('dot_model__ancview_pregnancy')$$, 'Referenced field to be 
checked if missing'); 43 | INSERT INTO dot.test_parameters_interface VALUES('relationships', 'field', 'entity any field', 'uuid', 'Field being checked'); 44 | -- INSERT INTO dot.test_parameters_interface VALUES('not_negative_string_column', 'name', 'function_argument', 'Name of column to be check3ed for non-negative values'); 45 | INSERT INTO dot.test_parameters_interface VALUES('accepted_values', 'values', 'list of values', $$["dog","cat","ostrich"]$$,'List of accepted values for the field being checked'); 46 | INSERT INTO dot.test_parameters_interface VALUES('possible_duplicate_forms', 'table_specific_reported_date', 'entity date field', 'reported', 'Column which indicates when form created'); 47 | INSERT INTO dot.test_parameters_interface VALUES('possible_duplicate_forms', 'table_specific_patient_uuid', 'entity id field', 'patient_id', 'Column which holds to patient uuid'); 48 | INSERT INTO dot.test_parameters_interface VALUES('possible_duplicate_forms', 'table_specific_uuid', 'entity id field', 'uuid', 'UUID for records in the table (form) being checked'); 49 | INSERT INTO dot.test_parameters_interface VALUES('possible_duplicate_forms', 'table_specific_period', 'one of (hour, day, week)', 'day','Specified period to check for duplicates (hour, day, week)'); 50 | INSERT INTO dot.test_parameters_interface VALUES('custom_sql', 'query', 'sql statement', $$SELECT field1, field2, 'table1' as \"primary_table\", 'field1' as \"primary_table_id_field\" WHERE COLOR='green'$$,'Custom SQL to use to determine test fails, SQL is defined in columns test_parameter'); 51 | -- INSERT INTO dot.test_parameters_interface VALUES('expression_is_true', 'name', 'function_argument', 'Name of the test'); 52 | INSERT INTO dot.test_parameters_interface VALUES('expression_is_true', 'condition', 'entity columns boolean logic', '(patient_age_in_months<24) and (malaria_give_act is not null)','Where clause of rows that are going to be checked'); 53 | INSERT INTO dot.test_parameters_interface VALUES('expression_is_true', 'expression', 'entity columns boolean logic', 'malaria_act_dosage is not null', 'If not true, the row fails the test'); 54 | INSERT INTO dot.test_parameters_interface VALUES('expect_similar_means_across_reporters', 'key', 'entity id field', 'reported_by', 'The grouping field to check means by, ie a person-specific id'); 55 | INSERT INTO dot.test_parameters_interface VALUES('expect_similar_means_across_reporters', 'quantity', 'entity numeric field', 'temperature', 'The name of the numeric field to analyze for variation'); 56 | INSERT INTO dot.test_parameters_interface VALUES('expect_similar_means_across_reporters', 'data_table', 'view/table', 'dot_model__iccmview_assessment', 'The name of entity view where data is'); 57 | INSERT INTO dot.test_parameters_interface VALUES('expect_similar_means_across_reporters', 'id_column', 'entity id field', 'reported_by', 'The id column to use to get failed test records'); 58 | 59 | -- dot.scenario_test_types 60 | INSERT INTO dot.scenario_test_types VALUES('MISSING-1', 'associated_columns_not_null'); 61 | INSERT INTO dot.scenario_test_types VALUES('MISSING-1', 'not_null'); 62 | INSERT INTO dot.scenario_test_types VALUES('INCONSISTENT-1', 'custom_sql'); 63 | INSERT INTO dot.scenario_test_types VALUES('INCONSISTENT-1', 'not_negative_string_column'); 64 | INSERT INTO dot.scenario_test_types VALUES('INCONSISTENT-1', 'accepted_values'); 65 | INSERT INTO dot.scenario_test_types VALUES('INCONSISTENT-1', 'expression_is_true'); 66 | INSERT INTO dot.scenario_test_types 
VALUES('INCONSISTENT-2', 'custom_sql'); 67 | INSERT INTO dot.scenario_test_types VALUES('FAKE-1', 'accepted_values'); 68 | INSERT INTO dot.scenario_test_types VALUES('FAKE-1', 'expect_similar_means_across_reporters'); 69 | INSERT INTO dot.scenario_test_types VALUES('DUPLICATE-1', 'unique'); 70 | INSERT INTO dot.scenario_test_types VALUES('DUPLICATE-1', 'possible_duplicate_forms'); 71 | INSERT INTO dot.scenario_test_types VALUES('DUPLICATE-1', 'custom_sql'); 72 | INSERT INTO dot.scenario_test_types VALUES('BIAS-1', 'expect_similar_means_across_reporters'); 73 | INSERT INTO dot.scenario_test_types VALUES('BIAS-2', 'expect_similar_means_across_reporters'); 74 | INSERT INTO dot.scenario_test_types VALUES('BUGS-1', 'relationships'); 75 | --INSERT INTO dot.scenario_test_types VALUES('BUGS-2', ''); 76 | --INSERT INTO dot.scenario_test_types VALUES('BUGS-3', ''); 77 | --INSERT INTO dot.scenario_test_types VALUES('BUGS-4', ''); 78 | --INSERT INTO dot.scenario_test_types VALUES('BUGS-5', ''); 79 | INSERT INTO dot.scenario_test_types VALUES('MISSED-1', 'custom_sql'); 80 | INSERT INTO dot.scenario_test_types VALUES('MISSED-2', 'custom_sql'); 81 | INSERT INTO dot.scenario_test_types VALUES('MISSED-3', 'custom_sql'); 82 | INSERT INTO dot.scenario_test_types VALUES('MISSED-4', 'custom_sql'); 83 | INSERT INTO dot.scenario_test_types VALUES('MISSED-5', 'custom_sql'); 84 | INSERT INTO dot.scenario_test_types VALUES('MISSED-6', 'custom_sql'); 85 | INSERT INTO dot.scenario_test_types VALUES('MISSED-7', 'custom_sql'); 86 | INSERT INTO dot.scenario_test_types VALUES('FOLLOWUP-1', 'custom_sql'); 87 | INSERT INTO dot.scenario_test_types VALUES('MULTIEVENTS-1', 'custom_sql'); 88 | INSERT INTO dot.scenario_test_types VALUES('ASSESS-1', 'custom_sql'); 89 | INSERT INTO dot.scenario_test_types VALUES('TREAT-1', 'custom_sql'); 90 | 91 | -------------------------------------------------------------------------------- /db/dot/4-upload_sample_dot_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO dot.projects SELECT 'ScanProject1', 'Scan 1 project', true, 'public', null, '2021-12-07 00:00:00+00','2021-12-07 00:00:00+00','Matt'; 2 | 3 | -- entity categories 4 | INSERT INTO dot.entity_categories VALUES('ALL', 'All flights'); 5 | INSERT INTO dot.entity_categories VALUES('ZAG', 'Zagreb airport flights'); 6 | INSERT INTO dot.entity_categories VALUES('ETH', 'Ethiopian Airlines'); 7 | 8 | -- configured entities - db views of the data we want to scan 9 | INSERT INTO dot.configured_entities VALUES('ScanProject1', 'all_flight_data', 'ALL', '{{ config(materialized=''view'') }} 10 | {% set schema = %} 11 | select * 12 | from {{ schema }}.flight_data ','2021-12-07 00:00:00+00','2021-12-07 00:00:00+00','Matt'); 13 | 14 | INSERT INTO dot.configured_entities VALUES('ScanProject1', 'zagreb_flight_data', 'ZAG', '{{ config(materialized=''view'') }} 15 | {% set schema = %} 16 | select * 17 | from {{ schema }}.flight_data WHERE origin_airport=''Zagreb airport'' ','2021-12-07 00:00:00+00','2021-12-07 00:00:00+00','Matt'); 18 | 19 | INSERT INTO dot.configured_entities VALUES('ScanProject1', 'ethiopia_airlines_data', 'ETH', '{{ config(materialized=''view'') }} 20 | {% set schema = %} 21 | select * 22 | from {{ schema }}.flight_data WHERE airline=''Ethiopian Airlines'' ','2021-12-07 00:00:00+00','2021-12-07 00:00:00+00','Matt'); 23 | 24 | INSERT INTO dot.configured_entities VALUES('ScanProject1', 'all_airports_data', 'ALL', '{{ config(materialized=''view'') }} 25 | {% set schema = %} 26 | 
select * 27 | from {{ schema }}.airport_data ','2021-12-07 00:00:00+00','2021-12-07 00:00:00+00','Matt'); 28 | 29 | INSERT INTO dot.configured_entities VALUES('ScanProject1', 'airlines_data', 'ALL', '{{ config(materialized=''view'') }} 30 | {% set schema = %} 31 | select DISTINCT airline 32 | from {{ schema }}.flight_data ','2021-12-07 00:00:00+00','2021-12-07 00:00:00+00','Matt'); 33 | 34 | 35 | -- Note these UUIDs get reset by the trigger 36 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '549c0575-e64c-3605-85a9-70356a23c4d2', 'MISSING-1', 3, 37 | 'Origin airport is not null', '', '', 'all_flight_data', 'not_null', 'origin_airport', '', 38 | NULL, '2021-12-23 19:00:00.000 -0500', '2021-12-23 19:00:00.000 -0500', 'Matt'); 39 | 40 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '8aca2bee-9e95-3f8a-90e9-153714e05367', 'INCONSISTENT-1', 41 | 5, 'Price is not negative', '', '', 'all_flight_data', 'not_negative_string_column', 'price', '', 42 | '{"name": "price"}', '2021-12-23 19:00:00.000 -0500', '2021-12-23 19:00:00.000 -0500', 'Matt'); 43 | 44 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '52d7352e-56ee-3084-9c67-e5ab24afc3a3', 'DUPLICATE-1', 45 | 3, 'Airport not unique', '', '', 'all_airports_data', 'unique', 'airport', '', NULL, 46 | '2021-12-23 19:00:00.000 -0500', '2021-12-23 19:00:00.000 -0500', 'Matt'); 47 | 48 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '935e6b61-b664-3eab-9d67-97c2c9c2bec0', 'INCONSISTENT-1', 49 | 3, 'Disallowed FP methods entered in form', '', '', 'all_flight_data', 'accepted_values', 'stops', 50 | '', $${"values": [ "1", "2", "3", "Non-stop"]}$$, '2021-12-23 19:00:00.000 -0500', '2021-12-23 19:00:00.000 -0500', 'Matt'); 51 | 52 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '0cdc9702-91e0-3499-b6f0-4dec12ad0f08', 'ASSESS-1', 3, 53 | 'Flight with no airport record', '', '', 'all_flight_data', 'relationships', 'origin_airport', 54 | '', $${"name": "flight_with_no_airport", "to": "ref('dot_model__all_airports_data')", "field": "airport"}$$, 55 | '2021-12-23 19:00:00.000 -0500', '2021-12-23 19:00:00.000 -0500', 'Matt'); 56 | 57 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '0cdc9702-91e0-3499-b6f0-4dec12ad0f18', 'BIAS-1', 6, 58 | 'Price outlier airlines', '', '', 'all_flight_data', 'expect_similar_means_across_reporters', 59 | 'price', '', $${"key": "airline","quantity": "price","data_table": "dot_model__all_flight_data","id_column": "airline", 60 | "target_table":"dot_model__airlines_data"}$$, '2022-01-19 20:00:00.000 -0500', '2022-01-19 20:00:00.000 -0500', 'Matt'); 61 | 62 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '36d33837-bd92-370a-963a-264a4d5b2bac', 'DUPLICATE-1', 63 | 6, 'Duplicate flight record', '', '', 'all_flight_data', 'possible_duplicate_forms', '', '', 64 | $${"table_specific_reported_date": "departure_time", "table_specific_patient_uuid": "airline", "table_specific_uuid": 65 | "uuid", "table_specific_period": "day"}$$, '2021-12-23 19:00:00.000 -0500', '2022-03-21 19:00:00.000 -0500', 'Matt'); 66 | 67 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', 'c4a3da8f-32f4-4e9b-b135-354de203ca90', 'TREAT-1', 68 | 5, 'Number of stops has a reasonable value', '', '', 'all_flight_data', 'custom_sql', '', '', 69 | format('{%s: %s}', 70 | to_json('query'::text), 71 | to_json($query$ 72 | select 73 | distinct uuid, 74 | 'dot_model__all_flight_data' as primary_table, 75 | 'uuid' as primary_table_id_field 76 | from {{ 
ref('dot_model__all_flight_data') }} 77 | where CAST(REGEXP_REPLACE(COALESCE(stops,'0'), '[^0-9]+', '0', 'g') as INTEGER) > 5 78 | $query$::text) 79 | )::json, 80 | '2021-12-23 19:00:00.000 -0500', '2021-12-23 19:00:00.000 -0500', 'Lorenzo'); 81 | 82 | INSERT INTO dot.configured_tests VALUES(TRUE, 'ScanProject1', '3081f033-e8f4-4f3b-aea8-36f8c5df05dc', 'INCONSISTENT-1', 83 | 8, 'Price is a positive number for direct flights', '', '', 'all_flight_data', 'expression_is_true', 84 | '', '', $${"name": "t_direct_flights_positive_price", "expression": "price is not null and price > 0", 85 | "condition": "stops = 'non-stop'"}$$, '2022-12-10 19:00:00.000 -0500', '2022-12-10 19:00:00.000 -0500', 'Lorenzo'); 86 | 87 | COMMIT; 88 | 89 | 90 | -------------------------------------------------------------------------------- /db/fake_data_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script will generate fake data which can be used to demo DOT. See below 3 | for code to generate each test type scenario. Script saves a sql file in ./dot which 4 | will be included as part of Docker build. 5 | """ 6 | import pandas as pd 7 | from faker import Faker 8 | from faker_airtravel import AirTravelProvider 9 | from flatten_json import flatten 10 | import numpy as np 11 | from datetime import datetime 12 | from datetime import timedelta 13 | import random 14 | import uuid 15 | 16 | NUMBER_OF_FLIGHTS = 1000 17 | 18 | np.random.seed(seed=12345) 19 | Faker.seed(0) 20 | fake = Faker() 21 | # this is to seed the custom provider of airports for deterministic results 22 | random.seed(10) 23 | 24 | start_time = "01/01/2022 11:13:08.230010" 25 | date_format_str = "%d/%m/%Y %H:%M:%S.%f" 26 | flight_time = datetime.strptime(start_time, date_format_str) 27 | 28 | fake.add_provider(AirTravelProvider) 29 | 30 | flight_data = [] 31 | airport_data = [] 32 | 33 | # Generate data 34 | for i in range(NUMBER_OF_FLIGHTS): 35 | flight_time = flight_time + timedelta(seconds=i * 10) 36 | f = flatten(fake.flight()) 37 | f["departure_time"] = flight_time 38 | flight_data.append(f) 39 | flight_data = pd.DataFrame(flight_data) 40 | 41 | # Make SQL friendly 42 | flight_data = flight_data.replace("'", "''", regex=True) 43 | airport_data = flight_data[["origin_airport", "origin_iata"]].drop_duplicates() 44 | 45 | print("Adding test fail scenarios to generated data ...") 46 | print("Adding a broken relationship ...") 47 | # Remove a row from airports so there isn't a relationship to it from flights 48 | airport_data = airport_data.drop(3) 49 | 50 | print("Adding unique value exception ...") 51 | duplicate = airport_data.iloc[4] 52 | airport_data = airport_data.append(duplicate) 53 | 54 | print("Adding not negative exception ...") 55 | flight_data.loc[2, "price"] = -100 56 | airport_data = airport_data.append(duplicate) 57 | 58 | print("Adding null values, and associated values not null, exceptions ...") 59 | nan_mat = np.random.random(flight_data.shape) < 0.05 60 | flight_data = flight_data.mask(nan_mat) 61 | 62 | print("Adding accepted values exceptions ...") 63 | flight_data.loc[6, "stops"] = 97 64 | 65 | print("Adding duplicate forms (records) ...") 66 | duplicate = flight_data.iloc[4] 67 | flight_data = flight_data.append(duplicate) 68 | 69 | print("Expect similar means across reporters (airlines) ...") 70 | duplicate = flight_data.iloc[4] 71 | flight_data.loc[flight_data["airline"] == "British Airways", "price"] = ( 72 | 0.1 * flight_data.loc[flight_data["airline"] == "British 
Airways", "price"] 73 | ) 74 | 75 | flight_data = flight_data.reset_index() 76 | airport_data = airport_data.reset_index() 77 | 78 | flights_sql = """ 79 | CREATE TABLE IF NOT EXISTS public.flight_data( 80 | uuid UUID PRIMARY KEY, 81 | departure_time TIMESTAMP WITH TIME ZONE NULL, 82 | airline VARCHAR(200) NULL, 83 | origin_airport VARCHAR(200) NULL, 84 | origin_iata VARCHAR(200) NULL, 85 | destination_airport VARCHAR(200) NULL, 86 | destination_iata VARCHAR(200) NULL, 87 | stops VARCHAR(30) NULL, 88 | price FLOAT NULL 89 | ); 90 | 91 | """ 92 | for index, r in flight_data.iterrows(): 93 | uuid_str = uuid.uuid3( 94 | uuid.NAMESPACE_OID, 95 | str(r["origin_airport"]) + str(r["departure_time"]) + str(index), 96 | ) 97 | flights_sql += ( 98 | f"INSERT INTO public.flight_data VALUES('{uuid_str}', '{r['departure_time']}', '{r['airline']}', " 99 | f"'{r['origin_airport']}','{r['origin_iata']}', '{r['destination_airport']}', " 100 | f"'{r['destination_iata']}', '{r['stops']}', {r['price']} );\n" 101 | ) 102 | 103 | airports_sql = """ 104 | CREATE TABLE IF NOT EXISTS public.airport_data( 105 | uuid UUID PRIMARY KEY, 106 | airport VARCHAR(200) NULL, 107 | airport_iata VARCHAR(200) NULL 108 | ); 109 | 110 | """ 111 | for index, r in airport_data.iterrows(): 112 | uuid_str = uuid.uuid3(uuid.NAMESPACE_OID, r["origin_airport"] + str(index)) 113 | airports_sql += ( 114 | f"INSERT INTO public.airport_data VALUES('{uuid_str}', '{r['origin_airport']}'," 115 | f"'{r['origin_iata']}');\n" 116 | ) 117 | 118 | airports_sql = airports_sql.replace("'nan'", "NULL").replace("'NaT'", "NULL") 119 | flights_sql = flights_sql.replace("'nan'", "NULL").replace("'NaT'", "NULL") 120 | flights_sql = flights_sql.replace("nan", "NULL") 121 | 122 | with open("./dot/3-demo_data.sql", "w") as f: 123 | f.write(airports_sql) 124 | f.write(flights_sql) 125 | 126 | print("Done") 127 | -------------------------------------------------------------------------------- /docker/.gitignore: -------------------------------------------------------------------------------- 1 | *.tar 2 | *.gz 3 | .env 4 | 5 | # Airflow stuff: 6 | airflow/logs -------------------------------------------------------------------------------- /docker/airflow/dags/dot_projects.json: -------------------------------------------------------------------------------- 1 | { 2 | "target_connid": "dot_db", 3 | "dot_projects":[ 4 | { 5 | "project_id":"ScanProject1", 6 | "source_connid": "dot_data", 7 | "earliest_date_to_sync":"2021-10-02", 8 | "objects":[ 9 | { 10 | "object":"airport_data", 11 | "id_field":"uuid" 12 | }, 13 | { 14 | "object":"flight_data", 15 | "id_field":"uuid", 16 | "date_field":"departure_time" 17 | } 18 | ] 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /docker/airflow/scripts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/docker/airflow/scripts/.gitkeep -------------------------------------------------------------------------------- /docker/airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db init 3 | airflow users create -r Admin -u admin -e admin@example.com -f admin -l user -p admin1234 4 | airflow webserver -------------------------------------------------------------------------------- /docker/appsmith/stacks/.gitignore: 
-------------------------------------------------------------------------------- 1 | configuration 2 | data 3 | letsencrypt 4 | logs 5 | ssl -------------------------------------------------------------------------------- /docker/demo/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/docker/demo/.gitkeep -------------------------------------------------------------------------------- /docker/docker-compose-demo.yml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | 3 | services: 4 | 5 | dot_db: 6 | image: datakind/dot_db_demo:latest 7 | container_name: dot-db 8 | ports: 9 | - "5433:5432" 10 | volumes: 11 | - ./demo/db/data:/var/lib/postgresql/data 12 | 13 | dot: 14 | image: datakind/dot_demo:latest 15 | container_name: dot 16 | 17 | appsmith: 18 | image: datakind/dot_appsmith:latest 19 | container_name: appsmith 20 | ports: 21 | - "82:80" 22 | - "446:443" 23 | volumes: 24 | - ./demo/appsmith/stacks:/appsmith-stacks 25 | -------------------------------------------------------------------------------- /docker/docker-compose-with-airflow.yml: -------------------------------------------------------------------------------- 1 | #version: '3.1' 2 | version: '2.1' 3 | 4 | # 5 | # https://airflow.apache.org/docs/apache-airflow/2.2.1/start/docker.html 6 | # Licensed to the Apache Software Foundation (ASF) under one 7 | # or more contributor license agreements. See the NOTICE file 8 | # distributed with this work for additional information 9 | # regarding copyright ownership. The ASF licenses this file 10 | # to you under the Apache License, Version 2.0 (the 11 | # "License"); you may not use this file except in compliance 12 | # with the License. You may obtain a copy of the License at 13 | # 14 | # http://www.apache.org/licenses/LICENSE-2.0 15 | # 16 | # Unless required by applicable law or agreed to in writing, 17 | # software distributed under the License is distributed on an 18 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | # KIND, either express or implied. See the License for the 20 | # specific language governing permissions and limitations 21 | # under the License. 22 | # 23 | 24 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 25 | # 26 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 27 | # 28 | # This configuration supports basic configuration using environment variables or an .env file 29 | # The following variables are supported: 30 | # 31 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 32 | # Default: apache/airflow:master-python3.8 33 | # AIRFLOW_UID - User ID in Airflow containers 34 | # Default: 50000 35 | # AIRFLOW_GID - Group ID in Airflow containers 36 | # Default: 50000 37 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. 38 | # Default: airflow 39 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. 40 | # Default: airflow 41 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 
42 | # Default: '' 43 | # 44 | 45 | x-airflow-common: 46 | &airflow-common 47 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.4} 48 | environment: 49 | &airflow-common-env 50 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 51 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 52 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 53 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 54 | AIRFLOW__CORE__FERNET_KEY: '' 55 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 56 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true' 57 | AIRFLOW__CORE__ENABLE_XCOM_PICKLING: 'true' 58 | PYTHON_BASE_IMAGE: "python:3:8-slim-buster" 59 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 60 | SSH_AUTH_SOCK: /ssh-agent 61 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 62 | volumes: 63 | - ./airflow/dags:/opt/airflow/dags 64 | - airflow-logs-volume:/opt/airflow/logs 65 | - airflow-plugins-volume:/opt/airflow/plugins 66 | 67 | - ../:/app 68 | - ./dot/dot_config.yml:/app/dot/config/dot_config.yml 69 | - ${SSH_AUTH_SOCK}:/ssh-agent # Forward local machine SSH key to docker so we can use ssh tunnel on host to access DB 70 | #user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" 71 | user: "1000:0" # This will work but change mounts to root 72 | depends_on: 73 | redis: 74 | condition: service_healthy 75 | postgres: 76 | condition: service_healthy 77 | 78 | services: 79 | 80 | # =========================== DOT and DB =============================== 81 | dot_db: 82 | image: postgres:9.6.23-buster 83 | container_name: dot-db 84 | ports: 85 | - "5433:5432" 86 | environment: 87 | POSTGRES_DB: dot_db 88 | POSTGRES_USER: postgres 89 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 90 | volumes: 91 | - .:/db_dumps 92 | - ../db/dot:/docker-entrypoint-initdb.d 93 | 94 | # Not needed here, because we deploy DOT to airflow worker 95 | #dot: 96 | # build: 97 | # context: .. 98 | # dockerfile: ./docker/dot/Dockerfile 99 | # image: dot 100 | # container_name: dot-dot-tool 101 | # environment: 102 | # POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 103 | # volumes: 104 | # - ../dot:/app 105 | # - ./dot/dot_config.yml:/app/dot_config.yml 106 | # - ./dot/dbt_profiles.yml:/root/.dbt/profiles.yml 107 | # - ./dot/ge_config_variables.yml:/app/great_expectations/uncommitted/config_variables.yml 108 | 109 | #superset: 110 | # image: apache/superset 111 | # ports: 112 | # - "8080:8088" 113 | # container_name: superset 114 | 115 | # ================================== Web App =============================== 116 | #dot-webapp-server: 117 | # build: 118 | # context: .. 119 | # dockerfile: ./docker/webapp/server/Dockerfile 120 | # image: dot-tool-webapp-server 121 | # container_name: dot-dot-tool-web-app-server 122 | # environment: 123 | # DB_PASSWORD: ${POSTGRES_PASSWORD} 124 | # ports: 125 | # - "3002:3002" 126 | # volumes: 127 | # - ../webapp/server:/usr/src/app 128 | # - ./webapp/server/env:/usr/src/app/.env 129 | 130 | #dot-webapp-frontend: 131 | # build: 132 | # context: .. 
133 | # dockerfile: ./docker/webapp/frontend/Dockerfile 134 | # image: dot-tool-webapp-frontend 135 | # container_name: dot-dot-tool-web-app-frontend 136 | # ports: 137 | # - "3000:3000" 138 | # volumes: 139 | # - ../webapp/frontend:/usr/src/app 140 | # - ./webapp/frontend/env:/usr/src/app/.env 141 | 142 | # ================================== AIRFLOW =============================== 143 | 144 | # DB used for airflow 145 | postgres: 146 | image: postgres:13 147 | ports: 148 | - "5434:5432" 149 | environment: 150 | POSTGRES_USER: airflow 151 | POSTGRES_PASSWORD: airflow 152 | POSTGRES_DB: airflow 153 | volumes: 154 | - postgres-db-volume:/var/lib/postgresql/data 155 | healthcheck: 156 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 157 | interval: 5s 158 | retries: 5 159 | restart: always 160 | 161 | redis: 162 | image: redis:latest 163 | ports: 164 | - 6379:6379 165 | healthcheck: 166 | test: [ "CMD", "redis-cli", "ping" ] 167 | interval: 5s 168 | timeout: 30s 169 | retries: 50 170 | restart: always 171 | 172 | airflow-webserver: 173 | <<: *airflow-common 174 | command: webserver 175 | ports: 176 | - 8083:8080 177 | healthcheck: 178 | test: [ "CMD", "curl", "--fail", "http://localhost:8083/health" ] 179 | interval: 10s 180 | timeout: 10s 181 | retries: 5 182 | restart: always 183 | 184 | airflow-scheduler: 185 | <<: *airflow-common 186 | command: scheduler 187 | restart: always 188 | 189 | airflow-worker: 190 | <<: *airflow-common 191 | command: celery worker 192 | restart: always 193 | 194 | airflow-init: 195 | <<: *airflow-common 196 | command: version 197 | environment: 198 | <<: *airflow-common-env 199 | _AIRFLOW_DB_UPGRADE: 'true' 200 | _AIRFLOW_WWW_USER_CREATE: 'true' 201 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 202 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 203 | 204 | flower: 205 | <<: *airflow-common 206 | command: celery flower 207 | ports: 208 | - 5555:5555 209 | healthcheck: 210 | test: [ "CMD", "curl", "--fail", "http://localhost:5555/" ] 211 | interval: 10s 212 | timeout: 10s 213 | retries: 5 214 | restart: always 215 | 216 | volumes: 217 | postgres-db-volume: 218 | airflow-logs-volume: 219 | airflow-plugins-volume: 220 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | 3 | services: 4 | dot_db: 5 | image: postgres:9.6.23-buster 6 | container_name: dot-db 7 | ports: 8 | - "5433:5432" 9 | environment: 10 | POSTGRES_DB: dot_db 11 | POSTGRES_USER: postgres 12 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 13 | volumes: 14 | - .:/db_dumps 15 | - ./db/data:/var/lib/postgresql/data 16 | - ../db/dot:/docker-entrypoint-initdb.d 17 | 18 | dot: 19 | build: 20 | context: .. 
21 | dockerfile: ./docker/dot/Dockerfile 22 | image: dot 23 | container_name: dot 24 | environment: 25 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 26 | volumes: 27 | - ../:/app 28 | - ./dot/dot_config.yml:/app/dot/config/dot_config.yml 29 | - ../dot/config/example/self_tests/dot_config_docker.yml:/app/dot/self_tests/data/base_self_test/dot_config.yml 30 | 31 | appsmith: 32 | image: index.docker.io/appsmith/appsmith-ce 33 | container_name: appsmith 34 | ports: 35 | - "82:80" 36 | - "446:443" 37 | volumes: 38 | - ./appsmith/stacks:/appsmith-stacks 39 | #restart: unless-stopped 40 | # # Uncomment the lines below to enable auto-update 41 | #labels: 42 | # com.centurylinklabs.watchtower.enable: "true" 43 | 44 | # appsmith auto-update 45 | #auto_update: 46 | # image: containrrr/watchtower:latest-dev 47 | # volumes: 48 | # - /var/run/docker.sock:/var/run/docker.sock 49 | # # Update check interval in seconds. 50 | # command: --schedule "0 0 * ? * *" --label-enable --cleanup 51 | # restart: unless-stopped 52 | 53 | -------------------------------------------------------------------------------- /docker/dot/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | 3 | WORKDIR app/ 4 | 5 | RUN apt-get update 6 | 7 | RUN pip install --upgrade pip 8 | 9 | # This uses the requirements as defined in the dot directory 10 | COPY ./dot/requirements_dot.txt requirements_dot.txt 11 | COPY ./dot/install_dot.sh install_dot.sh 12 | RUN ./install_dot.sh 13 | 14 | # This will be overwritten if using local mounts, but keeping here for the demo build 15 | COPY ../ /app 16 | COPY ./docker/dot/dot_config.yml /app/dot/config/dot_config.yml 17 | COPY ./dot/config/example/self_tests/dot_config_docker.yml /app/dot/self_tests/data/base_self_test/dot_config.yml 18 | 19 | # Install psql, useful 20 | RUN apt-get -y install postgresql-client 21 | 22 | #CMD [ "python3" ] 23 | CMD tail -f /dev/null 24 | -------------------------------------------------------------------------------- /docker/dot/dot_config.yml: -------------------------------------------------------------------------------- 1 | dot: 2 | output_schema_suffix: tests 3 | dot_db: 4 | type: postgres 5 | host: dot_db 6 | user: postgres 7 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 8 | port: 5432 9 | dbname: dot_db 10 | schema: dot 11 | threads: 4 12 | ScanProject1_db: 13 | type: postgres 14 | host: dot_db 15 | user: postgres 16 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 17 | port: 5432 18 | dbname: dot_db 19 | schema: public 20 | threads: 4 21 | -------------------------------------------------------------------------------- /docker/run_demo.py: -------------------------------------------------------------------------------- 1 | """This script will run the DOT demo""" 2 | 3 | import os 4 | import shutil 5 | import time 6 | import tarfile 7 | import webbrowser 8 | import gdown 9 | from python_on_whales import DockerClient 10 | 11 | url_demo_data = "https://drive.google.com/uc?id=157Iad8mHnwbZ_dAeLQy5XfLihhcpD6yc" 12 | filename_demo_data = "dot_demo_data.tar.gz" 13 | url_dot_ui = "http://localhost:82/app/data-observation-toolkit/run-log-634491ea0da61b0e9f38760d?embed=True" # pylint: disable=line-too-long 14 | 15 | # Check if db, appsmith and tar file are there and if so, delete them. 
16 | os.chdir("demo/") 17 | if os.path.exists("db"): 18 | shutil.rmtree("db") 19 | if os.path.exists("appsmith"): 20 | shutil.rmtree("appsmith") 21 | if os.path.exists("dot_demo_data.tar"): 22 | os.remove("dot_demo_data.tar") 23 | 24 | print("\nDownloading demo data file....\n") 25 | 26 | # Download Demo Data from Google Drive 27 | gdown.download(url_demo_data, filename_demo_data, quiet=False) 28 | 29 | print("Demo data has been downloaded\n") 30 | 31 | # Open/Extract tarfile 32 | with tarfile.open(filename_demo_data) as my_tar: 33 | my_tar.extractall("") 34 | my_tar.close() 35 | 36 | with open("./db/.env") as f: 37 | demo_pwd = f.read().split("=")[1] 38 | os.environ["POSTGRES_PASSWORD"] = demo_pwd 39 | 40 | # Composing and running container(s) 41 | print("Starting DOT...\n") 42 | os.chdir("../") 43 | docker = DockerClient(compose_files=[os.getcwd() + os.sep + "docker-compose-demo.yml"]) 44 | docker.compose.down(quiet=True) 45 | docker.compose.up(quiet=True, build=True, detach=True) 46 | 47 | print("Waiting for DOT to start, time to make a nice cup of tea! ☕ ...\n") 48 | time.sleep(20) 49 | 50 | webbrowser.open(url_dot_ui) 51 | 52 | print( 53 | "In case DOT was not opened in your browser, please go to this URL: " 54 | "http://localhost:82/app/data-observation-toolkit/run-log-634491ea0da61b0e9f38760d?embed=True\n" 55 | ) 56 | input("Press return to stop DOT container\n") 57 | print("Container is being stopped - we hope you enjoyed this demo :)") 58 | docker.compose.stop() 59 | -------------------------------------------------------------------------------- /dot/.gitignore: -------------------------------------------------------------------------------- 1 | .dbt 2 | uncommitted 3 | -------------------------------------------------------------------------------- /dot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/__init__.py -------------------------------------------------------------------------------- /dot/config/README.md: -------------------------------------------------------------------------------- 1 | # DOT configuration 2 | 3 | ## How the tool connects to the database 4 | 5 | There is just 1 config file that control how the tool connects to the database; follow the link to see an [example file](dot/config/example/dot_config.yml). 6 | 7 | This file contains connection parameters for the DOT database and also for any of the project databases (e.g. `Muso`) 8 | for which you want to run the DOT tests. See more details in the paragraph below on how to adapt this and other config files to your needs. 9 | 10 | Additionally to the database connections handled in `dot_config.yml`, the different objects generated by the DOT 11 | can be stored in different schemas. Read below about the file `dbt_project.yml` to learn how to define these output schemas. 12 | 13 | ## Running the tool per project 14 | 15 | The DOT can be run per project, where configuration and output files for each project are found in the following directories: 16 | 1. mandatory configuration 17 | ```bash 18 | |____config 19 | | |____dot_config.yml 20 | ``` 21 | 2. optional per project configuration 22 | ```bash 23 | |____config 24 | | |____ 25 | | | |____dbt 26 | | | | |____profiles.yml 27 | | | | |____dbt_project.yml 28 | | | |____ge 29 | | | | |____great_expectations.yml 30 | | | | |____config_variables.yml 31 | | | | |____batch_config.json 32 | ``` 33 | 3. 
generated files per project 34 | ```bash 35 | |____generated_files 36 | | |____ 37 | | | |____all_tests_summary.xlsx 38 | | | |____ge_clean_results.csv 39 | | | |____dbt_test_coverage_report.txt 40 | | | |____all_tests_rows.xlsx 41 | ``` 42 | 43 | ### Configuration per project 44 | 45 | All config files are grouped under the [config dir](dot/config). The DOT DB connection details are propagated through 46 | Jinja templates to other config files that belong to DBT and Great Expectations. Please follow the guidelines below 47 | if you need to customize other configurations. 48 | 49 | #### For the database connection 50 | 51 | A single file controls connections for the DOT and any project for which you want to run DOT tests: 52 | - copy the default [dot_config](dot/config/example/dot_config.yml) into the top config folder (i.e. as `dot/config/dot_config.yml`) 53 | - note that the copied file will be ignored for version control 54 | - change the necessary parameters for the dot_db connection, e.g. `host`, `dbname` 55 | - add connection parameters for each of the projects you would like to run, with the same structure 56 | as the `Muso_db` entry for the [dot_config example](dot/config/example/dot_config.yml), i.e. 57 | ``` 58 | _db: 59 | type: connection type e.g. postgres 60 | host: host 61 | user: username 62 | pass: password 63 | port: port number e.g. 5432 64 | dbname: database name 65 | schema: schema name, e.g. public 66 | threads: number of threads for DBT, e.g. 4 67 | ``` 68 | - note that the DOT and the project connections should at least be in different schemas, but they can also be either 69 | in different databases of the same host, or in different servers 70 | 71 | #### Other project-dependent configurations 72 | 73 | If you need to edit configurations for DBT and Great Expectations, you would need to change the [Jinja templates](dot/config/templates). 74 | In general these customizations are not needed, and are only required in some scenarios with particular requirements; 75 | these require a deeper knowledge of the DOT and of either DBT and/or Great Expectations. 76 | 77 | ##### dbt_project.yml (DBT) 78 | 79 | This file goes into the [dbt main folder](dot/dbt). If you don't need to customise it, DOT uses this [Jinja template](dot/config/templates/dbt/dbt_project.yml), 80 | after a few project-dependent adjustments: 81 | - `model-paths` is set to a subdirectory for the project, i.e. `["models_"]` 82 | - `test-paths` is also set to a subdirectory for the project, i.e. `["tests_"]` 83 | 84 | and the modified version is copied by the DOT into the destination [dbt main folder](dot/dbt). 85 | 86 | The tool also copies the content of the [models folder](dot/dbt/models/core) into the model path for the project, `dot/dbt/models//core`, 87 | and creates the custom SQL tests at `dot/dbt/tests/`. 88 | 89 | A common example of customization is changing the schema in which the objects generated by the DOT are written. 90 | See the paragraph just below. 91 | 92 | ##### Writing the output objects of the DOT to different schemas 93 | 94 | The DOT generates two kinds of database objects: 95 | - entities of the models that are being tested, e.g. assessments, follow ups, patients 96 | - results of the failing tests 97 | 98 | If nothing is done, these objects will be created in the same schema as the original data for the project (thus polluting the DB).
99 | 100 | The following lines added to `dbt_project.yml` will modify where those objects are stored: 101 | ``` 102 | models: 103 | dbt_model_1: 104 | core: 105 | +schema: 106 | test: 107 | +schema: 108 | ``` 109 | This value will be added as a suffix, i.e. if the project data is stored in a certain schema, the output objects will go to 110 | `_` (e.g. to `public_tests` if the project schema is `public` and the suffix is set to 111 | `tests` in the lines above). 112 | 113 | Note that this mechanism uses a DBT feature, and that the same applies to the GE tests. 114 | 115 | Finally, although this is not really recommended, you can send the two different types of outputs to two schemas: 116 | - `core` in the lines above corresponds to the models 117 | - `test` corresponds to the failing test results 118 | 119 | ##### profiles.yml (DBT) 120 | 121 | This DBT configuration file goes into `~/.dbt/profiles.yml`. If you don't need to customise it, the [Jinja template](dot/config/templates/dbt/profiles.yml) 122 | is used by the tool to generate the final config file, using the connection parameters for the DOT DB in the [dot_config](dot/config/example/dot_config.yml) file. 123 | 124 | At first sight there is no good reason to customise this config file. 125 | 126 | ##### great_expectations.yml (GE) 127 | 128 | This file goes into the [great expectations main folder](dot/great_expectations). Starting from this [Jinja template](dot/config/templates/great_expectations/great_expectations.yml) 129 | a config file is generated into the destination [great expectations main folder](dot/great_expectations). 130 | 131 | ##### batch_config.json (GE) 132 | 133 | This file goes into the [great expectations main folder](dot/great_expectations). The [Jinja template](dot/config/templates/great_expectations/batch_config.json) 134 | generates a file copied into the [great expectations main folder](dot/great_expectations). 135 | 136 | There are no obvious reasons why you may want to customize this file. 137 | 138 | ##### config_variables.yml (GE) 139 | 140 | Starting from this [Jinja template](dot/config/templates/great_expectations/config_variables.yml) 141 | the GE configuration file goes into `dot/great_expectations/uncommitted/config_variables.yml` 142 | 143 | At first sight there is no good reason to customise this config file. -------------------------------------------------------------------------------- /dot/config/example/dot_config.yml: -------------------------------------------------------------------------------- 1 | dot: 2 | save_passed_tests: False 3 | output_schema_suffix: tests 4 | dot_db: 5 | type: postgres 6 | host: dot_db 7 | user: postgres 8 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 9 | port: 5432 10 | dbname: dot_db 11 | schema: dot 12 | threads: 4 13 | ScanProject1_db: 14 | type: postgres 15 | host: dot_db 16 | user: postgres 17 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 18 | port: 5432 19 | dbname: dot_db 20 | schema: public 21 | threads: 4 22 | -------------------------------------------------------------------------------- /dot/config/example/project_name/dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'dbt_model_1' 2 | version: '0.0.1' 3 | 4 | # This setting configures which "profile" dbt uses for this project. 5 | profile: 'default' 6 | 7 | # These configurations specify where dbt should look for different types of files.
8 | # The `source-paths` config, for example, states that models in this project can be 9 | # found in the "models/" directory. You probably won't need to change these! 10 | model-paths: ["models"] # here the tool sets the output to a project-dependent folder 11 | analysis-paths: ["analysis"] 12 | test-paths: ["tests"] # here the tool sets the output to a project-dependent folder 13 | seed-paths: ["data"] 14 | macro-paths: ["macros"] 15 | snapshot-paths: ["snapshots"] 16 | 17 | target-path: "target" # directory which will store compiled SQL files 18 | clean-targets: # directories to be removed by `dbt clean` 19 | - "target" 20 | - "dbt_modules" 21 | 22 | config-version: 2 23 | 24 | # Configuring models 25 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 26 | 27 | # In this example config, we tell dbt to build all models in the example/ directory 28 | # as tables. These settings can be overridden in the individual model files 29 | # using the `{{ config(...) }}` macro. 30 | -------------------------------------------------------------------------------- /dot/config/example/self_tests/dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | name: 'dbt_model_1' 4 | version: '0.0.1' 5 | 6 | # This setting configures which "profile" dbt uses for this project. 7 | profile: 'default' 8 | 9 | # These configurations specify where dbt should look for different types of files. 10 | # The `source-paths` config, for example, states that models in this project can be 11 | # found in the "models/" directory. You probably won't need to change these! 12 | model-paths: ["models/ScanProject1"] # here the tool sets the output to a project-dependent folder 13 | analysis-paths: ["analysis"] 14 | test-paths: ["tests/ScanProject1"] # here the tool sets the output to a project-dependent folder 15 | seed-paths: ["data"] 16 | macro-paths: ["macros"] 17 | snapshot-paths: ["snapshots"] 18 | 19 | target-path: "target" # directory which will store compiled SQL files 20 | clean-targets: # directories to be removed by `dbt clean` 21 | - "target" 22 | - "dbt_modules" 23 | 24 | config-version: 2 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ directory 30 | # as tables. 
These settings can be overridden in the individual model files 31 | models: 32 | dbt_model_1: 33 | core: 34 | +schema: 'tests' 35 | test: 36 | +schema: 'tests' -------------------------------------------------------------------------------- /dot/config/example/self_tests/dbt/profiles_github.yml: -------------------------------------------------------------------------------- 1 | config: 2 | partial_parse: False 3 | 4 | default: 5 | target: dev 6 | outputs: 7 | dev: 8 | type: postgres 9 | host: localhost 10 | user: postgres 11 | pass: postgres 12 | port: 5432 13 | dbname: dot_db 14 | schema: self_tests_public 15 | threads: 4 16 | -------------------------------------------------------------------------------- /dot/config/example/self_tests/dot_config_docker.yml: -------------------------------------------------------------------------------- 1 | dot: 2 | save_passed_tests: False 3 | output_schema_suffix: tests 4 | dot_db: 5 | type: postgres 6 | host: dot_db 7 | user: postgres 8 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 9 | port: 5432 10 | dbname: dot_db 11 | schema: self_tests_dot 12 | threads: 4 13 | ScanProject1_db: 14 | type: postgres 15 | host: dot_db 16 | user: postgres 17 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 18 | port: 5432 19 | dbname: dot_db 20 | schema: self_tests_public 21 | threads: 4 -------------------------------------------------------------------------------- /dot/config/example/self_tests/dot_config_github.yml: -------------------------------------------------------------------------------- 1 | dot: 2 | save_passed_tests: False 3 | output_schema_suffix: tests 4 | dot_db: 5 | type: postgres 6 | host: localhost 7 | user: postgres 8 | pass: postgres 9 | port: 5432 10 | dbname: dot_db 11 | schema: self_tests_dot 12 | threads: 4 13 | ScanProject1_db: 14 | type: postgres 15 | host: localhost 16 | user: postgres 17 | pass: postgres 18 | port: 5432 19 | dbname: dot_db 20 | schema: self_tests_public 21 | threads: 4 22 | -------------------------------------------------------------------------------- /dot/config/example/self_tests/dot_config_local.yml: -------------------------------------------------------------------------------- 1 | dot: 2 | save_passed_tests: False 3 | output_schema_suffix: tests 4 | dot_db: 5 | type: postgres 6 | host: localhost 7 | user: postgres 8 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 9 | port: 5433 10 | dbname: dot_db 11 | schema: self_tests_dot 12 | threads: 4 13 | ScanProject1_db: 14 | type: postgres 15 | host: localhost 16 | user: postgres 17 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 18 | port: 5433 19 | dbname: dot_db 20 | schema: self_tests_public 21 | threads: 4 22 | -------------------------------------------------------------------------------- /dot/config/templates/dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | {# templates/dbt/dbt_project.yml #} 2 | 3 | name: 'dbt_model_1' 4 | version: '0.0.1' 5 | 6 | # This setting configures which "profile" dbt uses for this project. 7 | profile: 'default' 8 | 9 | # These configurations specify where dbt should look for different types of files. 10 | # The `source-paths` config, for example, states that models in this project can be 11 | # found in the "models/" directory. You probably won't need to change these! 
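# Note: `{{ project_id }}` below is a Jinja placeholder filled in when DOT renders this template for a given
# project; e.g. for a project named ScanProject1 the paths would render as "models/ScanProject1" and
# "tests/ScanProject1", matching the self-tests example shown earlier (illustrative note, not part of the template).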
12 | model-paths: ["models/{{ project_id }}"] # here the tool sets the output to a project-dependent folder 13 | analysis-paths: ["analysis"] 14 | test-paths: ["tests/{{ project_id }}"] # here the tool sets the output to a project-dependent folder 15 | seed-paths: ["data"] 16 | macro-paths: ["macros"] 17 | snapshot-paths: ["snapshots"] 18 | 19 | target-path: "target" # directory which will store compiled SQL files 20 | clean-targets: # directories to be removed by `dbt clean` 21 | - "target" 22 | - "dbt_modules" 23 | 24 | config-version: 2 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ directory 30 | # as tables. These settings can be overridden in the individual model files 31 | -------------------------------------------------------------------------------- /dot/config/templates/dbt/profiles.yml: -------------------------------------------------------------------------------- 1 | {# templates/dbt/profiles.yml #} 2 | 3 | config: 4 | partial_parse: False 5 | 6 | default: 7 | target: dev 8 | outputs: 9 | dev: 10 | type: postgres 11 | host: {{ host }} 12 | user: {{ user }} 13 | pass: {{ password }} 14 | port: {{ port }} 15 | dbname: {{ dbname }} 16 | schema: {{ schema }} 17 | threads: 4 18 | -------------------------------------------------------------------------------- /dot/config/templates/great_expectations/batch_config.json: -------------------------------------------------------------------------------- 1 | {# templates/great_expectations/batch_config.json #} 2 | 3 | { 4 | "datasource_name": "mm", 5 | "query": "select 1 as c", 6 | "expectation_suite_name": "ge_test" 7 | } 8 | -------------------------------------------------------------------------------- /dot/config/templates/great_expectations/config_variables.yml: -------------------------------------------------------------------------------- 1 | {# templates/great_expectations/config_variables.yml #} 2 | 3 | # This config file supports variable substitution which enables: 1) keeping 4 | # secrets out of source control & 2) environment-based configuration changes 5 | # such as staging vs prod. 6 | # 7 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 8 | # `my_key: $my_value`) in the great_expectations.yml file, it will attempt 9 | # to replace the value of `my_key` with the value from an environment 10 | # variable `my_value` or a corresponding key read from this config file, 11 | # which is defined through the `config_variables_file_path`. 12 | # Environment variables take precedence over variables defined here. 13 | # 14 | # Substitution values defined here can be a simple (non-nested) value, 15 | # nested value such as a dictionary, or an environment variable (i.e. 
${ENV_VAR}) 16 | # 17 | # 18 | # https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html 19 | 20 | instance_id: 67616838-4d81-4db9-9e53-b8fdad70f331 21 | mm: 22 | drivername: postgresql 23 | host: {{ project_db_host }} 24 | port: '{{ project_db_port }}' 25 | username: {{ project_db_username }} 26 | password: {{ project_db_password }} 27 | database: {{ project_db_database }} 28 | -------------------------------------------------------------------------------- /dot/config/templates/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | {# templates/great_expectations/great_expectations.yml #} 2 | 3 | # Welcome to Great Expectations! Always know what to expect from your data. 4 | # 5 | # Here you can define datasources, batch kwargs generators, integrations and 6 | # more. This file is intended to be committed to your repo. For help with 7 | # configuration please: 8 | # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/spare_parts/data_context_reference.html#configuration 9 | # - Join our slack channel: http://greatexpectations.io/slack 10 | 11 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 12 | # It is auto-generated and usually does not need to be changed. 13 | config_version: 2.0 14 | 15 | # Datasources tell Great Expectations where your data lives and how to get it. 16 | # You can use the CLI command `great_expectations datasource new` to help you 17 | # add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource.html 18 | datasources: 19 | mm: 20 | module_name: great_expectations.datasource 21 | data_asset_type: 22 | module_name: custom_expectations.custom_dataset 23 | class_name: CustomSqlAlchemyDataset 24 | class_name: SqlAlchemyDatasource 25 | credentials: ${mm} 26 | 27 | # This config file supports variable substitution which enables: 1) keeping 28 | # secrets out of source control & 2) environment-based configuration changes 29 | # such as staging vs prod. 30 | # 31 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 32 | # `my_key: $my_value`) in the great_expectations.yml file, it will attempt 33 | # to replace the value of `my_key` with the value from an environment 34 | # variable `my_value` or a corresponding key read from this config file, 35 | # which is defined through the `config_variables_file_path`. 36 | # Environment variables take precedence over variables defined here. 37 | # 38 | # Substitution values defined here can be a simple (non-nested) value, 39 | # nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) 40 | # 41 | # 42 | # https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html 43 | 44 | 45 | config_variables_file_path: uncommitted/config_variables.yml # use here different data sources for different projects 46 | 47 | # The plugins_directory will be added to your python path for custom modules 48 | # used to override and extend Great Expectations. 49 | plugins_directory: plugins/ 50 | 51 | stores: 52 | # Stores are configurable places to store things like Expectations, Validations 53 | # Data Docs, and more. 
These are for advanced users only - most users can simply 54 | # leave this section alone. 55 | # 56 | # Three stores are required: expectations, validations, and 57 | # evaluation_parameters, and must exist with a valid store entry. Additional 58 | # stores can be configured for uses such as data_docs, etc. 59 | expectations_store: 60 | class_name: ExpectationsStore 61 | store_backend: 62 | class_name: TupleFilesystemStoreBackend 63 | base_directory: expectations/{{ project_id }} # the tool changes this into a project-dependent directory 64 | 65 | validations_store: 66 | class_name: ValidationsStore 67 | store_backend: 68 | class_name: TupleFilesystemStoreBackend 69 | base_directory: uncommitted/validations/{{ project_id }} # the tool changes this into a project-dependent directory 70 | 71 | evaluation_parameter_store: 72 | # Evaluation Parameters enable dynamic expectations. Read more here: 73 | # https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html 74 | class_name: EvaluationParameterStore 75 | 76 | ## checkpoints are only available in GE 0.13 77 | # checkpoint_store: 78 | # class_name: CheckpointStore 79 | # store_backend: 80 | # class_name: TupleFilesystemStoreBackend 81 | # suppress_store_backend_id: true 82 | # base_directory: checkpoints/ 83 | 84 | expectations_store_name: expectations_store 85 | validations_store_name: validations_store 86 | evaluation_parameter_store_name: evaluation_parameter_store 87 | ## checkpoints are only available in GE 0.13 88 | #checkpoint_store_name: checkpoint_store 89 | 90 | data_docs_sites: 91 | # Data Docs make it simple to visualize data quality in your project. These 92 | # include Expectations, Validations & Profiles. The are built for all 93 | # Datasources from JSON artifacts in the local repo including validations & 94 | # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html 95 | local_site: 96 | class_name: SiteBuilder 97 | # set to false to hide how-to buttons in Data Docs 98 | show_how_to_buttons: true 99 | store_backend: 100 | class_name: TupleFilesystemStoreBackend 101 | base_directory: uncommitted/data_docs/local_site/ 102 | site_index_builder: 103 | class_name: DefaultSiteIndexBuilder 104 | 105 | anonymous_usage_statistics: 106 | data_context_id: dc39ad04-6ad8-4270-8071-60ee1ef81f56 107 | enabled: true 108 | notebooks: 109 | ## concurrency is only valid in GE 0.13 110 | #concurrency: 111 | # enabled: false 112 | 113 | # validation_operators are deprecated in GE version 0.13 114 | # GE produces the following warning message 115 | # You appear to be using a legacy capability with the latest config version (3.0). 116 | # Your data context with this configuration version uses validation_operators, which are being deprecated. 117 | # Please consult the V3 API migration guide https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api 118 | # and update your configuration to be compatible with the version number 3. 119 | # (This message will appear repeatedly until your configuration is updated.) 
120 | # 121 | # remove the following when checkpoints are properly used 122 | # https://legacy.docs.greatexpectations.io/en/0.13.26/guides/how_to_guides/validation/how_to_create_a_new_checkpoint.html 123 | # https://legacy.docs.greatexpectations.io/en/0.13.26/guides/how_to_guides/validation/how_to_run_a_checkpoint_in_python.html 124 | validation_operators: 125 | action_list_operator: 126 | class_name: ActionListValidationOperator 127 | action_list: 128 | - name: store_validation_result 129 | action: 130 | class_name: StoreValidationResultAction 131 | - name: store_evaluation_params 132 | action: 133 | class_name: StoreEvaluationParametersAction 134 | - name: update_data_docs 135 | action: 136 | class_name: UpdateDataDocsAction 137 | -------------------------------------------------------------------------------- /dot/dbt/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | dbt_packages/ 5 | logs/ 6 | .ipynb_checkpoints 7 | test_coverage.txt 8 | generated_reports/ 9 | models/test/*.sql 10 | -------------------------------------------------------------------------------- /dot/dbt/analysis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/dbt/analysis/.gitkeep -------------------------------------------------------------------------------- /dot/dbt/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/dbt/data/.gitkeep -------------------------------------------------------------------------------- /dot/dbt/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/dbt/macros/.gitkeep -------------------------------------------------------------------------------- /dot/dbt/macros/filter_by_list.sql: -------------------------------------------------------------------------------- 1 | -- filter the list so that we only keep the words that 2 | -- are in list_of_words 3 | {% macro filter_by_list(list_to_filter, list_of_words) %} 4 | 5 | {% set new_list = [] %} 6 | 7 | {% for filter_word in list_to_filter %} 8 | {% set filter_word = filter_word|string %} 9 | {% for word in list_of_words %} 10 | {% if word in filter_word %} 11 | {% do new_list.append(filter_word) %} 12 | {% endif %} 13 | {% endfor %} 14 | {% endfor %} 15 | 16 | {{ return(new_list) }} 17 | 18 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/filter_by_word.sql: -------------------------------------------------------------------------------- 1 | -- filter out word_to_filter from list. 
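-- Illustrative example (not part of the original macro): with list_to_filter = ['flight_data', 'airport_data']
-- and word_to_filter = 'airport', the macro below logs and drops 'airport_data' and returns ['flight_data'].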
2 | {% macro filter_by_word(list_to_filter, word_to_filter) %} 3 | 4 | {% set new_list = [] %} 5 | 6 | {% for word in list_to_filter %} 7 | {% set word = word|string %} 8 | {% if word_to_filter in word %} 9 | {{ log("(filter_by_word macro output) Filtered word from list: " ~ word, info=True) }} 10 | {% else %} 11 | {% do new_list.append(word) %} 12 | {% endif %} 13 | {% endfor %} 14 | 15 | {{ return(new_list) }} 16 | 17 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/get_column_name.sql: -------------------------------------------------------------------------------- 1 | -- get column name from a relation. 2 | {% macro get_column_name(relation, schema, relation_prefix) %} 3 | 4 | {% set base_column = relation | string %} 5 | {% set strip_col = base_column.split('.')[-1] %} 6 | 7 | {% set column_name = strip_col | replace(relation_prefix ~ "_", "") | replace("\"","") %} 8 | 9 | -- Debugging purposes. Can Comment out or switch to info=False to not print to stdout. 10 | {{ log("(get_column_name macro output) Column name: " ~ column_name, info=True) }} 11 | {{ return(column_name) }} 12 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/get_column_names.sql: -------------------------------------------------------------------------------- 1 | -- get column names from multiple relations. 2 | {% macro get_column_names(schema, relation_prefix) %} 3 | 4 | {% set column_names = [] %} 5 | {% set relations = get_relations(schema, relation_prefix) %} 6 | 7 | {% for relation in relations %} 8 | {% set column_name = get_column_name(relation=relation, schema=schema, relation_prefix=relation_prefix) %} 9 | {% do column_names.append(column_name) %} 10 | {% endfor %} 11 | 12 | {{ return(column_names) }} 13 | 14 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/get_relations.sql: -------------------------------------------------------------------------------- 1 | -- get relations by prefix string. 2 | {% macro get_relations(schema, relation_prefix) %} 3 | {{ return(dbt_utils.get_relations_by_prefix(schema, relation_prefix)) }} 4 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/test_associated_columns_not_null.sql: -------------------------------------------------------------------------------- 1 | -- test the number of rows that are null in associated columns. 2 | -- For instance if fever = yes then one of the associated 3 | -- column is fever_duration. 4 | {% macro test_associated_columns_not_null(model, column_name, col_value, associated_columns, name, table_specific_uuid='uuid') %} 5 | 6 | select 7 | array_agg({{ table_specific_uuid }}) as uuid_list -- postgres only? 
8 | from 9 | {{model}} 10 | where 11 | {{column_name}} = {{col_value}} 12 | and ( 13 | {% for col in associated_columns %} 14 | {{col}} is null 15 | {% if not loop.last %} 16 | or 17 | {% endif %} 18 | {% endfor %} 19 | ) 20 | having count(*) > 0 21 | 22 | {% endmacro %} 23 | -------------------------------------------------------------------------------- /dot/dbt/macros/test_expression_is_true.sql: -------------------------------------------------------------------------------- 1 | -- wrapper around dbt_utils.expression_is_true including the name 2 | -- Commented out column as the expression determines that 3 | {% test expression_is_true(model, expression, column_name=None, condition='1=1', name='do_set_name') %} 4 | {{ return(adapter.dispatch('test_expression_is_true', 'dbt_utils')(model, expression, '', condition)) }} 5 | {% endtest %} 6 | -------------------------------------------------------------------------------- /dot/dbt/macros/test_no_impossible_values.sql: -------------------------------------------------------------------------------- 1 | -- test to make sure the column values are all valid values. 2 | {% macro test_no_impossible_values(model, column_name, values, name) %} 3 | 4 | select 5 | count(*) 6 | from 7 | {{model}} 8 | where 9 | {{column_name}} in {{values}} 10 | 11 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/test_not_less_than_or_equal_zero.sql: -------------------------------------------------------------------------------- 1 | -- test column values are not less than or equal to 0. 2 | {% macro test_not_less_than_or_equal_zero(model, column_name) %} 3 | 4 | select 5 | count(*) 6 | from 7 | {{model}} 8 | where 9 | {{column_name}} <= 0 10 | 11 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/test_not_negative_string_column.sql: -------------------------------------------------------------------------------- 1 | -- test to make sure that columns of string type that 2 | -- represents an integer is not negative. 3 | {% macro test_not_negative_string_column(model, column_name, name, description, table_specific_uuid='uuid') %} 4 | 5 | select 6 | array_agg({{ table_specific_uuid }}) as uuid_list -- postgres only? 7 | from 8 | {{model}} 9 | where 10 | {{column_name}}::varchar like '-%' 11 | having count(*) > 0 12 | 13 | {% endmacro %} 14 | -------------------------------------------------------------------------------- /dot/dbt/macros/test_possible_duplicate_forms.sql: -------------------------------------------------------------------------------- 1 | -- Test to check if a patient was reported more than once in a specified period. 2 | -- Input period parameter is database specific (https://hub.getdbt.com/dbt-labs/dbt_utils/0.1.13/) 3 | -- Some valid values for PostgreSQL include 'day', 'week', 'hour' 4 | -- (See https://www.postgresql.org/docs/9.1/functions-datetime.html#FUNCTIONS-DATETIME-TRUNC) 5 | -- If so, flag as possible duplicate form. 6 | -- TODO Convert these parameter names to generic. 
Be sure to update DB schema and samples 7 | {% macro test_possible_duplicate_forms(model, table_specific_reported_date='reported', table_specific_patient_uuid='patient_uuid', table_specific_uuid='uuid', table_specific_period='hour', name='possible_duplicate_records') %} 8 | 9 | with records_per_patient_period as ( 10 | select 11 | date_trunc('{{ table_specific_period}}', {{ table_specific_reported_date }}::timestamp) as date_period, 12 | {{ table_specific_patient_uuid }} as patient_uuid_to_flag, 13 | count({{ table_specific_uuid }}) as number_of_records 14 | FROM {{ model }} 15 | group by 1, 2 16 | ), 17 | 18 | possible_duplicate_combinations as ( 19 | select * 20 | from records_per_patient_period 21 | where number_of_records > 1 22 | ) 23 | 24 | select array_agg({{ table_specific_uuid }}) as uuid_list -- postgres only? 25 | from possible_duplicate_combinations pdc 26 | left join {{ model }} m 27 | on date_trunc('{{ table_specific_period}}', m.{{ table_specific_reported_date }}::timestamp) = pdc.date_period 28 | and m.{{ table_specific_patient_uuid }} = pdc.patient_uuid_to_flag 29 | having count(*) > 0 30 | 31 | {% endmacro %} 32 | -------------------------------------------------------------------------------- /dot/dbt/macros/test_relationships.sql: -------------------------------------------------------------------------------- 1 | -- This modification makes it possible to create downstream views 2 | -- of failing test records by avoiding name collisions between 3 | -- columns in the resulting view. 4 | {% macro test_relationships(model, to, field, where, table_specific_uuid='uuid') %} 5 | 6 | {% set column_name = kwargs.get('column_name', kwargs.get('from')) %} 7 | 8 | select array_agg(from_uuid) as uuid_list -- postgres only? 9 | from ( 10 | select {{ table_specific_uuid }} as from_uuid, {{ column_name }} as from_column_id from {{ model }} 11 | {{ where }} 12 | ) as from_model 13 | left join ( 14 | select {{ field }} as to_id from {{ to }} 15 | ) as to_model on to_model.to_id = from_model.from_column_id 16 | where from_model.from_column_id is not null 17 | and to_model.to_id is null 18 | having count(*) > 0 19 | 20 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/macros/test_valid_date.sql: -------------------------------------------------------------------------------- 1 | -- make sure dates are valid. Can specifiy the earliest date a 2 | -- particular column value can be within the schema test. The 3 | -- latest possible date is the current date. 
4 | {% macro test_valid_date(model, column_name, earliest_date) %} 5 | 6 | with validation as ( 7 | select 8 | {{ column_name }} as date_field 9 | from {{ model }} 10 | ), 11 | 12 | validation_errors as ( 13 | select 14 | date_field 15 | from validation 16 | where date_field::date < '{{ earliest_date }}'::date 17 | or date_field::date > NOW()::date 18 | ) 19 | 20 | select count(*) 21 | from validation_errors 22 | 23 | {% endmacro %} -------------------------------------------------------------------------------- /dot/dbt/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 0.8.0 4 | -------------------------------------------------------------------------------- /dot/dbt/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/dbt/snapshots/.gitkeep -------------------------------------------------------------------------------- /dot/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ 2 | ge_clean_results.csv 3 | results 4 | -------------------------------------------------------------------------------- /dot/great_expectations/checkpoints/iop_tool_checkpoint.yml: -------------------------------------------------------------------------------- 1 | name: dot_checkpoint 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: 7 | expectation_suite_name: 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: mm 25 | data_connector_name: DatabaseDataConnector 26 | data_asset_name: custom_expectations.custom_dataset.CustomSqlAlchemyDataset 27 | data_connector_query: 28 | index: -1 29 | expectation_suite_name: ge_test 30 | profilers: [] 31 | ge_cloud_id: 32 | expectation_suite_ge_cloud_id: 33 | -------------------------------------------------------------------------------- /dot/great_expectations/expectations/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | -------------------------------------------------------------------------------- /dot/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love 
it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "from datetime import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. 
Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", 151 | "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", 152 | "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", 153 | "\n", 154 | "results = context.run_validation_operator(\n", 155 | " \"action_list_operator\", \n", 156 | " assets_to_validate=[batch], \n", 157 | " run_id=run_id)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## 5. View the Validation Results in Data Docs\n", 165 | "\n", 166 | "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 167 | "\n", 168 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "context.open_data_docs()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Congratulations! You ran Validations!\n", 185 | "\n", 186 | "## Next steps:\n", 187 | "\n", 188 | "### 1. Read about the typical workflow with Great Expectations:\n", 189 | "\n", 190 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 191 | "\n", 192 | "### 2. Explore the documentation & community\n", 193 | "\n", 194 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.7.0" 222 | }, 223 | "pycharm": { 224 | "stem_cell": { 225 | "cell_type": "raw", 226 | "source": [], 227 | "metadata": { 228 | "collapsed": false 229 | } 230 | } 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 4 235 | } -------------------------------------------------------------------------------- /dot/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /dot/great_expectations/readme.md: -------------------------------------------------------------------------------- 1 | # Validating Community Health Data with Great Expectations 2 | 3 | #### Directory structure 4 | 5 | The [great_expectations folder](dot/great_expectations) is a [Great Expectations](https://greatexpectations.io/) project. 6 | 7 | - Core directories 8 | - __expectations:__ JSON files indicating the tests to be run (similar to yml files containing schema tests in DBT); 9 | these are not in version control, but generated by the tool from `dot.configured_tests` 10 | - __notebooks:__ Jupyter notebooks automatically created by Great Expectations during setup to allow for a more 11 | convenient front-end to edit the JSON files in `expectations` (check out the flow described by the 12 | [Great Expectations documentation](https://docs.greatexpectations.io/en/latest/how_to_guides/creating_and_editing_expectations/how_to_edit_an_expectation_suite_using_a_disposable_notebook.html)) 13 | - __plugins:__ Additional code for customizing this Great Expectations project. Most important in here is `custom_expectations.py`, which is where tests requiring arbitrary python should be added as methods under the `CustomSqlAlchemyDataset` class (somewhat similar to the custom SQL tests in DBT, except written in python). 
14 | 
15 | - Non-version controlled directories (automatically created by Great Expectations)
16 |   - __uncommitted:__ Generic place to capture all Great Expectations files that should not be version controlled,
17 |     including logs and database connection details
18 | 
19 | - Scripts
20 |   - [__great_expectations.py:__](dot/utils/great_expectations.py) Utility script to automatically run GE tests
21 |     and create coverage reports. GE's CLI commands for this are much more verbose and more difficult to remember than DBT's.
22 | 
23 | - Config files - not in version control; these are either managed by the tool or set in the project-dependent config
24 |   - __batch_config.json:__ Defines datasources, test suites, and tables to be included when running Great Expectations
25 |   - __great_expectations.yml:__ Main config file for Great Expectations (similar to `dbt_project.yml`)
26 | 
27 | #### Terminology
28 | 
29 | - An expectation is a particular function accepting one or multiple parameters (defined in Python)
30 | - A test is an instance of an expectation, with a specific set of parameters (defined in a JSON file)
31 | - Out Of The Box (OOTB) expectations are provided by Great Expectations and built into the library's codebase.
32 | 
33 | #### Structuring tests
34 | 
35 | - Tests are defined in JSON, akin to dbt schema tests in yaml
36 |   - These live in a suite JSON file under `great_expectations/expectations/`
37 | 
38 | - Tests can be used OOTB, but most of the time they are written as custom expectations
39 |   - Custom expectations can operate on any table passed as a parameter, but OOTB expectations will only be applied to
40 |     the selected table in batch_config.json (see extra notes below for details)
41 |   - OOTB tests can use views defined in DBT
42 |   - OOTB tests can be defined directly in the JSON file
43 |   - Custom expectations need to be added as decorated methods in `plugins/custom_expectations.py`
44 |   - Custom tests can run arbitrary python/pandas (even though this isn't well-documented in the GE published docs)
45 |   - Once added to custom_expectations.py, tests can be defined in the JSON file similarly to OOTB tests
46 |   - OOTB tests have a variety of outputs and therefore might not conform to the format expected by the
47 |     Data Integrity framework - whenever possible, use DBT tests instead
48 | 
49 | If a mix of OOTB and custom expectations is needed, it is suggested to keep them in two separate test suites to manage
50 | their differences efficiently.
51 | 
52 | #### Extra notes
53 | 
54 | The data integrity tool works with a few assumptions in terms of what an expectation should accept and return.
55 | 
56 | 1. We create views out of the DOT results with PostgreSQL-specific syntax. If you're using any other database engine,
57 |    please adapt the query in [great_expectations.py](dot/utils/great_expectations.py).
58 | 
59 | 2. An expectation accepts both column names and table names as arguments. Great Expectations generally has
60 |    table-agnostic suites running on specific single tables, but we're changing this model a bit because data integrity
61 |    queries often depend on more than one table. Therefore, a default empty dataset is added in the `batch_config.json`
62 |    for all custom expectations, and a relevant table name should be passed to the expectation in the suite definition.
63 |    The default dataset won't be read at all and is used as a placeholder.
64 | 
65 | 3. Custom expectations are found in custom_expectations.py under plugins; it is recommended to follow their format and
66 |    to add your own custom expectations as methods of that same class.
67 | 
68 | 4. The tool's post-processing step expects a few specific fields in the output of the expectations
69 |    (refer to the example custom expectations to see how they're implemented).
70 | 
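As a rough illustration of this pattern (not code from this repo: the method, table and column names below are hypothetical, and it assumes the legacy `DataAsset.expectation` decorator available in the Great Expectations 0.12.x line used here), a custom expectation added as a method of the `CustomSqlAlchemyDataset` class could look something like this:

```python
import sqlalchemy as sa
from great_expectations.data_asset import DataAsset
from great_expectations.dataset import SqlAlchemyDataset


class CustomSqlAlchemyDataset(SqlAlchemyDataset):
    """Dataset class holding custom expectations as methods (sketch only)."""

    _data_asset_type = "CustomSqlAlchemyDataset"

    # The decorator registers the method as an expectation; the listed argument
    # names are the parameters that a suite JSON entry passes in as kwargs.
    @DataAsset.expectation(["form_name", "column_name"])
    def expect_no_negative_values_in_column(self, form_name, column_name):
        # Arbitrary SQL (or pandas) can be used here; this sketch simply counts
        # offending rows in the table passed in via `form_name`.
        query = sa.text(f"SELECT COUNT(*) FROM {form_name} WHERE {column_name} < 0")
        unexpected_count = self.engine.execute(query).scalar()
        return {
            "success": unexpected_count == 0,
            "result": {"unexpected_count": unexpected_count},
        }
```

A test is then just an instance of this expectation in the suite JSON: an entry whose expectation type is `expect_no_negative_values_in_column`, with `form_name` and `column_name` passed as kwargs, mirroring the expectation/test distinction described in the Terminology section above.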
-------------------------------------------------------------------------------- /dot/install_dot.sh: --------------------------------------------------------------------------------
1 | # Simple install script to install Python dependencies, dbt and ge
2 | 
3 | # Python packages
4 | pip install --upgrade pip
5 | # Based on this https://stackoverflow.com/questions/69287269/installing-ruamel-yaml-clib-with-docker.
6 | pip install -U pip setuptools wheel ruamel.yaml ruamel.yaml.clib==0.2.6
7 | pip install -r requirements_dot.txt
-------------------------------------------------------------------------------- /dot/logs/.gitignore: --------------------------------------------------------------------------------
1 | *.log*
-------------------------------------------------------------------------------- /dot/requirements_dot.txt: --------------------------------------------------------------------------------
1 | black==21.12b0
2 | dbt-core==1.0.0
3 | dbt-extractor==0.4.0
4 | dbt-postgres==1.0.0
5 | decorator==5.1.0
6 | Faker==15.1.1
7 | faker_airtravel==0.4
8 | flatten_json==0.1.13
9 | gdown==4.5.3
10 | great-expectations==0.12.10
11 | ipython==8.10.0
12 | ipython-genutils==0.2.0
13 | jinja2==2.11.3
14 | json-rpc==1.13.0
15 | jupyter-client==7.1.0
16 | jupyter-core==4.9.1
17 | jupyterlab-pygments==0.1.2
18 | jupyterlab-widgets==1.0.2
19 | MarkupSafe==2.0.1
20 | minimal-snowplow-tracker==0.0.2
21 | mock==4.0.3
22 | nbformat==5.1.3
23 | nest-asyncio==1.5.4
24 | notebook==6.4.12
25 | numpy==1.22.0
26 | oauthlib==3.2.2
27 | openpyxl==3.0.9
28 | pandas==1.3.4
29 | proto-plus==1.19.8
30 | psycopg2-binary==2.9.2
31 | Pygments==2.10.0
32 | pylint==2.14.1
33 | pytest==7.2.0
34 | pytest-timeout==2.1.0
35 | python_on_whales==0.53.0
36 | python-slugify==5.0.2
37 | PyYAML==6.0
38 | requests==2.23.0
39 | requests-oauthlib==1.3.0
40 | rsa==4.8
41 | ruamel.yaml==0.17.17
42 | ruamel.yaml.clib==0.2.6
43 | setuptools-scm==6.3.2
44 | SQLAlchemy==1.3.24
45 | toolz==0.11.2
46 | typing-extensions==3.10.0.2
47 | 
-------------------------------------------------------------------------------- /dot/run_everything.py: --------------------------------------------------------------------------------
1 | import logging
2 | import argparse
3 | import uuid
4 | from utils.run_management import run_dot_tests
5 | from utils.utils import setup_custom_logger
6 | 
7 | 
8 | class DOTRunException(Exception):
9 |     """Catch exceptions from a DOT run"""
10 | 
11 | 
12 | logger = setup_custom_logger("./logs/run_everything.log", logging.INFO)
13 | 
14 | logger.info("Starting DOT test run")
15 | 
16 | # Set up command-line argument parsing
17 | parser = argparse.ArgumentParser(description="Specify arguments")
18 | parser.add_argument(
19 |     "--project_id",
20 |     action="store",
21 |     required=True,
22 |     help="DOT project name, e.g. Muso or Brac",
23 | )
24 | project_id = parser.parse_args().project_id
25 | 
26 | # Generate the run_id
27 | run_id = uuid.uuid4()
28 | 
29 | # noinspection PyBroadException
30 | try:
31 |     run_dot_tests(project_id, logger, run_id)
32 |     logger.info("Completed DOT run. 
Your results should be in DB table dot.test_results.")
33 | except DOTRunException:
34 |     logging.exception("Fatal Error in Main Loop", exc_info=True)
35 | 
-------------------------------------------------------------------------------- /dot/self_tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/self_tests/__init__.py
-------------------------------------------------------------------------------- /dot/self_tests/data/base_self_test/.gitkeep: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/self_tests/data/base_self_test/.gitkeep
-------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/core/dot_model__airlines_data.sql: --------------------------------------------------------------------------------
1 | {{ config(materialized='view') }}
2 | {% set schema = 'self_tests_public' %}
3 | select DISTINCT airline
4 | from {{ schema }}.flight_data
5 | 
-------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/core/dot_model__all_airports_data.sql: --------------------------------------------------------------------------------
1 | {{ config(materialized='view') }}
2 | {% set schema = 'self_tests_public' %}
3 | select *
4 | from {{ schema }}.airport_data
5 | 
-------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/core/dot_model__all_airports_data.yml: --------------------------------------------------------------------------------
1 | version: 2
2 | models:
3 |   - name: dot_model__all_airports_data
4 |     columns:
5 |       - name: airport
6 |         description: Airport not unique
7 |         tests:
8 |           - unique
9 | 
-------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/core/dot_model__all_flight_data.sql: --------------------------------------------------------------------------------
1 | {{ config(materialized='view') }}
2 | {% set schema = 'self_tests_public' %}
3 | select *
4 | from {{ schema }}.flight_data
5 | 
-------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/core/dot_model__all_flight_data.yml: --------------------------------------------------------------------------------
1 | version: 2
2 | models:
3 |   - name: dot_model__all_flight_data
4 |     columns:
5 |       - name: origin_airport
6 |         description: Flight with no airport record
7 |         tests:
8 |           - not_null
9 |           - relationships:
10 |               to: ref('dot_model__all_airports_data')
11 |               name: flight_with_no_airport
12 |               field: airport
13 |       - name: price
14 |         description: Price is not negative
15 |         tests:
16 |           - not_negative_string_column:
17 |               name: price
18 |       - name: stops
19 |         description: Disallowed FP methods entered in form
20 |         tests:
21 |           - accepted_values:
22 |               values:
23 |                 - '1'
24 |                 - '2'
25 |                 - '3'
26 |                 - Non-stop
27 | 
-------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/core/dot_model__ethiopia_airlines_data.sql: --------------------------------------------------------------------------------
1 | {{ config(materialized='view') }}
2 | {% set schema = 'self_tests_public' %}
3 | 
select * 4 | from {{ schema }}.flight_data WHERE airline='Ethiopian Airlines' 5 | -------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/core/dot_model__zagreb_flight_data.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | {% set schema = 'self_tests_public' %} 3 | select * 4 | from {{ schema }}.flight_data WHERE origin_airport='Zagreb airport' 5 | -------------------------------------------------------------------------------- /dot/self_tests/data/dot_input_files/dbt/test/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/self_tests/data/dot_input_files/dbt/test/.gitkeep -------------------------------------------------------------------------------- /dot/self_tests/data/dot_output_files/dbt/manifest_node_ex_non_negative_string_column.json: -------------------------------------------------------------------------------- 1 | {'raw_sql': '{{ test_not_negative_string_column(**_dbt_generic_test_kwargs) }}{{ config(alias="not_negative_string_column_dot_c94cb403bb77709eee61711f6b11ce44") }}', 2 | 'test_metadata': {'name': 'not_negative_string_column', 3 | 'kwargs': {'name': 'value', 4 | 'column_name': 'value', 5 | 'model': "{{ get_where_subquery(ref('dot_model__fpview_registration')) }}"}, 6 | 'namespace': None}, 7 | 'compiled': True, 8 | 'resource_type': 'test', 9 | 'depends_on': {'macros': ['macro.dbt_model_1.test_not_negative_string_column', 10 | 'macro.dbt.get_where_subquery', 11 | 'macro.dbt.should_store_failures', 12 | 'macro.dbt.statement'], 13 | 'nodes': ['model.dbt_model_1.dot_model__fpview_registration']}, 14 | 'config': {'enabled': True, 15 | 'alias': 'not_negative_string_column_dot_c94cb403bb77709eee61711f6b11ce44', 16 | 'schema': 'dbt_test__audit', 17 | 'database': None, 18 | 'tags': [], 19 | 'meta': {}, 20 | 'materialized': 'test', 21 | 'severity': 'ERROR', 22 | 'store_failures': None, 23 | 'where': None, 24 | 'limit': None, 25 | 'fail_calc': 'count(*)', 26 | 'warn_if': '!= 0', 27 | 'error_if': '!= 0'}, 28 | 'database': 'dot_db', 29 | 'schema': 'self_tests_public_dbt_test__audit', 30 | 'fqn': ['dbt_model_1', 31 | 'Muso', 32 | 'core', 33 | 'not_negative_string_column_dot_model__fpview_registration_value__value'], 34 | 'unique_id': 'test.dbt_model_1.not_negative_string_column_dot_model__fpview_registration_value__value.e15d766b3b', 35 | 'package_name': 'dbt_model_1', 36 | 'root_path': '/Users/lrnzcig/git/Data-Observation-Toolkit/dot/dbt', 37 | 'path': 'not_negative_string_column_dot_c94cb403bb77709eee61711f6b11ce44.sql', 38 | 'original_file_path': 'models/Muso/core/dot_model__fpview_registration.yml', 39 | 'name': 'not_negative_string_column_dot_model__fpview_registration_value__value', 40 | 'alias': 'not_negative_string_column_dot_c94cb403bb77709eee61711f6b11ce44', 41 | 'checksum': {'name': 'none', 'checksum': ''}, 42 | 'tags': [], 43 | 'refs': [['dot_model__fpview_registration']], 44 | 'sources': [], 45 | 'description': '', 46 | 'columns': {}, 47 | 'meta': {}, 48 | 'docs': {'show': True}, 49 | 'patch_path': None, 50 | 'compiled_path': 'target/compiled/dbt_model_1/models/Muso/core/dot_model__fpview_registration.yml/not_negative_string_column_dot_c94cb403bb77709eee61711f6b11ce44.sql', 51 | 'build_path': 
'target/run/dbt_model_1/models/Muso/core/dot_model__fpview_registration.yml/not_negative_string_column_dot_c94cb403bb77709eee61711f6b11ce44.sql', 52 | 'deferred': False, 53 | 'unrendered_config': {'alias': 'not_negative_string_column_dot_c94cb403bb77709eee61711f6b11ce44'}, 54 | 'created_at': 1654030541.08323, 55 | 'compiled_sql': '\n\nselect\n array_agg(uuid) as uuid_list -- postgres only?\nfrom\n "dot_db"."self_tests_public_tests"."dot_model__fpview_registration"\nwhere\n value::varchar like \'-%\'\nhaving count(*) > 0\n\n', 56 | 'extra_ctes_injected': True, 57 | 'extra_ctes': [], 58 | 'relation_name': None, 59 | 'column_name': 'value', 60 | 'file_key_name': 'models.dot_model__fpview_registration'} -------------------------------------------------------------------------------- /dot/self_tests/data/dot_output_files/dbt/target/run_results_archive.json: -------------------------------------------------------------------------------- 1 | {"metadata": {"dbt_schema_version": "https://schemas.getdbt.com/dbt/run-results/v4.json", "dbt_version": "1.0.0", "generated_at": "2022-10-24T19:02:02.886650Z", "invocation_id": "ff0b065e-3919-43e7-be6f-d0ae82e4455b", "env": {}}, "results": [{"status": "fail", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:02.612688Z", "completed_at": "2022-10-24T19:02:02.690684Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:02.724331Z", "completed_at": "2022-10-24T19:02:02.788380Z"}], "thread_id": "Thread-3", "execution_time": 0.18724608421325684, "adapter_response": {}, "message": "Got 53 results, configured to fail if != 0", "failures": 53, "unique_id": "test.dbt_model_1.not_null_dot_model__all_flight_data_origin_airport.2196b664b6"}, {"status": "fail", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:02.612127Z", "completed_at": "2022-10-24T19:02:02.689966Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:02.691191Z", "completed_at": "2022-10-24T19:02:02.790061Z"}], "thread_id": "Thread-1", "execution_time": 0.18971800804138184, "adapter_response": {}, "message": "Got 2 results, configured to fail if != 0", "failures": 2, "unique_id": "test.dbt_model_1.accepted_values_dot_model__all_flight_data_stops__1__2__3__Non_stop.b734743116"}, {"status": "fail", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:02.612507Z", "completed_at": "2022-10-24T19:02:02.690232Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:02.714221Z", "completed_at": "2022-10-24T19:02:02.791497Z"}], "thread_id": "Thread-2", "execution_time": 0.19088268280029297, "adapter_response": {}, "message": "Got 1 result, configured to fail if != 0", "failures": 1, "unique_id": "test.dbt_model_1.not_negative_string_column_dot_model__all_flight_data_price__price.322389c2ba"}, {"status": "fail", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:02.663637Z", "completed_at": "2022-10-24T19:02:02.696902Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:02.727077Z", "completed_at": "2022-10-24T19:02:02.793657Z"}], "thread_id": "Thread-4", "execution_time": 0.19394469261169434, "adapter_response": {}, "message": "Got 1 result, configured to fail if != 0", "failures": 1, "unique_id": "test.dbt_model_1.relationships_dot_model__all_flight_data_origin_airport__airport__flight_with_no_airport__ref_dot_model__all_airports_data_.3a9f7e32d9"}, {"status": "fail", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:02.808912Z", "completed_at": "2022-10-24T19:02:02.818847Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:02.819295Z", 
"completed_at": "2022-10-24T19:02:02.858589Z"}], "thread_id": "Thread-3", "execution_time": 0.054596662521362305, "adapter_response": {}, "message": "Got 2 results, configured to fail if != 0", "failures": 2, "unique_id": "test.dbt_model_1.unique_dot_model__all_airports_data_airport.912f240fa1"}], "elapsed_time": 0.3934199810028076, "args": {"write_json": true, "use_colors": true, "printer_width": 80, "version_check": true, "partial_parse": false, "static_parser": true, "profiles_dir": "/Users/lrnzcig/.dbt", "send_anonymous_usage_stats": true, "event_buffer_size": 100000, "store_failures": false, "indirect_selection": "eager", "select": ["core"], "which": "test", "rpc_method": "test"}} -------------------------------------------------------------------------------- /dot/self_tests/data/dot_output_files/dbt/target/run_results_test.json: -------------------------------------------------------------------------------- 1 | {"metadata": {"dbt_schema_version": "https://schemas.getdbt.com/dbt/run-results/v4.json", "dbt_version": "1.0.0", "generated_at": "2022-10-24T19:02:08.843325Z", "invocation_id": "4b824a62-a9ca-4493-a264-b60a1cd88a69", "env": {}}, "results": [{"status": "success", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:08.495513Z", "completed_at": "2022-10-24T19:02:08.524823Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:08.556902Z", "completed_at": "2022-10-24T19:02:08.724281Z"}], "thread_id": "Thread-2", "execution_time": 0.23859381675720215, "adapter_response": {"_message": "CREATE VIEW", "code": "CREATE VIEW", "rows_affected": -1}, "message": "CREATE VIEW", "failures": null, "unique_id": "model.dbt_model_1.tr_dot_model__all_flight_data_accepted_values_stops"}, {"status": "success", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:08.495752Z", "completed_at": "2022-10-24T19:02:08.525439Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:08.561418Z", "completed_at": "2022-10-24T19:02:08.726332Z"}], "thread_id": "Thread-3", "execution_time": 0.23929595947265625, "adapter_response": {"_message": "CREATE VIEW", "code": "CREATE VIEW", "rows_affected": -1}, "message": "CREATE VIEW", "failures": null, "unique_id": "model.dbt_model_1.tr_dot_model__all_flight_data_flight_with_no_a"}, {"status": "success", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:08.499595Z", "completed_at": "2022-10-24T19:02:08.525885Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:08.564159Z", "completed_at": "2022-10-24T19:02:08.728836Z"}], "thread_id": "Thread-4", "execution_time": 0.2406601905822754, "adapter_response": {"_message": "CREATE VIEW", "code": "CREATE VIEW", "rows_affected": -1}, "message": "CREATE VIEW", "failures": null, "unique_id": "model.dbt_model_1.tr_dot_model__all_flight_data_not_null_origin_a"}, {"status": "success", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:08.495061Z", "completed_at": "2022-10-24T19:02:08.513356Z"}, {"name": "execute", "started_at": "2022-10-24T19:02:08.514007Z", "completed_at": "2022-10-24T19:02:08.730398Z"}], "thread_id": "Thread-1", "execution_time": 0.24455904960632324, "adapter_response": {"_message": "CREATE VIEW", "code": "CREATE VIEW", "rows_affected": -1}, "message": "CREATE VIEW", "failures": null, "unique_id": "model.dbt_model_1.tr_dot_model__all_airports_data_unique_airport"}, {"status": "success", "timing": [{"name": "compile", "started_at": "2022-10-24T19:02:08.746047Z", "completed_at": "2022-10-24T19:02:08.750603Z"}, {"name": "execute", "started_at": 
"2022-10-24T19:02:08.751054Z", "completed_at": "2022-10-24T19:02:08.812377Z"}], "thread_id": "Thread-2", "execution_time": 0.06791210174560547, "adapter_response": {"_message": "CREATE VIEW", "code": "CREATE VIEW", "rows_affected": -1}, "message": "CREATE VIEW", "failures": null, "unique_id": "model.dbt_model_1.tr_dot_model__all_flight_data_price"}], "elapsed_time": 0.5887680053710938, "args": {"write_json": true, "use_colors": true, "printer_width": 80, "version_check": true, "partial_parse": false, "static_parser": true, "profiles_dir": "/Users/lrnzcig/.dbt", "send_anonymous_usage_stats": true, "event_buffer_size": 100000, "select": ["test"], "which": "run", "rpc_method": "run", "indirect_selection": "eager"}} -------------------------------------------------------------------------------- /dot/self_tests/data/expected/dot_model__all_flight_data.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | {% set schema = 'schema_project' %} 3 | select * 4 | from {{ schema }}.flight_data 5 | -------------------------------------------------------------------------------- /dot/self_tests/data/expected/extract_df_from_dbt_test_results_json.csv: -------------------------------------------------------------------------------- 1 | ,run_id,test_id,entity_id,test_type,column_name,id_column_name,test_parameters,test_status,test_status_message,failed_tests_view,failed_tests_view_sql 2 | test.dbt_model_1.accepted_values_dot_model__all_flight_data_stops__1__2__3__Non_stop.b734743116,4541476c-814e-43fe-ab38-786f36beecbc,cad13f73-27b5-3427-be8f-4d213bba3b19,all_flight_data,accepted_values,stops,,"{'values': ['1', '2', '3', 'Non-stop']}",fail,"got 2 results, configured to fail if != 0",tr_dot_model__all_flight_data_accepted_values_stops," WITH all_values AS ( 3 | SELECT dot_model__all_flight_data.stops AS value_field, 4 | count(*) AS n_records 5 | FROM self_tests_public_tests.dot_model__all_flight_data 6 | GROUP BY dot_model__all_flight_data.stops 7 | ) 8 | SELECT all_values.value_field, 9 | all_values.n_records 10 | FROM all_values 11 | WHERE all_values.value_field::text <> ALL (ARRAY['1'::character varying::text, '2'::character varying::text, '3'::character varying::text, 'Non-stop'::character varying::text]);" 12 | test.dbt_model_1.not_negative_string_column_dot_model__all_flight_data_price__price.322389c2ba,4541476c-814e-43fe-ab38-786f36beecbc,ed27037a-4054-3070-9d88-fdf9cd0231c8,all_flight_data,not_negative_string_column,price,,{'name': 'price'},fail,"got 1 result, configured to fail if != 0",tr_dot_model__all_flight_data_price," SELECT array_agg(dot_model__all_flight_data.uuid) AS uuid_list 13 | FROM self_tests_public_tests.dot_model__all_flight_data 14 | WHERE dot_model__all_flight_data.price::character varying::text ~~ '-%'::text 15 | HAVING count(*) > 0;" 16 | 17 | test.dbt_model_1.not_null_dot_model__all_flight_data_origin_airport.2196b664b6,4541476c-814e-43fe-ab38-786f36beecbc,df44c2f4-65f8-3170-a03f-6035aaa45660,all_flight_data,not_null,origin_airport,,{},fail,"got 53 results, configured to fail if != 0",tr_dot_model__all_flight_data_not_null_origin_a," SELECT dot_model__all_flight_data.uuid, 18 | dot_model__all_flight_data.departure_time, 19 | dot_model__all_flight_data.airline, 20 | dot_model__all_flight_data.origin_airport, 21 | dot_model__all_flight_data.origin_iata, 22 | dot_model__all_flight_data.destination_airport, 23 | dot_model__all_flight_data.destination_iata, 24 | dot_model__all_flight_data.stops, 25 | 
dot_model__all_flight_data.price 26 | FROM self_tests_public_tests.dot_model__all_flight_data 27 | WHERE dot_model__all_flight_data.origin_airport IS NULL;" 28 | 29 | test.dbt_model_1.relationships_dot_model__all_flight_data_origin_airport__airport__flight_with_no_airport__ref_dot_model__all_airports_data_.3a9f7e32d9,4541476c-814e-43fe-ab38-786f36beecbc,2ba7f3e8-cd62-37ac-854f-01f704489130,all_flight_data,relationships,origin_airport,,"{'to': ""ref('dot_model__all_airports_data')"", 'name': 'flight_with_no_airport', 'field': 'airport'}",fail,"got 1 result, configured to fail if != 0",tr_dot_model__all_flight_data_flight_with_no_a," SELECT array_agg(from_model.from_uuid) AS uuid_list 30 | FROM ( SELECT dot_model__all_flight_data.uuid AS from_uuid, 31 | dot_model__all_flight_data.origin_airport AS from_column_id 32 | FROM self_tests_public_tests.dot_model__all_flight_data) from_model 33 | LEFT JOIN ( SELECT dot_model__all_airports_data.airport AS to_id 34 | FROM self_tests_public_tests.dot_model__all_airports_data) to_model ON to_model.to_id::text = from_model.from_column_id::text 35 | WHERE from_model.from_column_id IS NOT NULL AND to_model.to_id IS NULL 36 | HAVING count(*) > 0;" 37 | 38 | test.dbt_model_1.unique_dot_model__all_airports_data_airport.912f240fa1,4541476c-814e-43fe-ab38-786f36beecbc,942f4496-1202-3768-9cfe-96128bcd754c,all_airports_data,unique,airport,,{},fail,"got 2 results, configured to fail if != 0",tr_dot_model__all_airports_data_unique_airport," SELECT dot_model__all_airports_data.airport AS unique_field, 39 | count(*) AS n_records 40 | FROM self_tests_public_tests.dot_model__all_airports_data 41 | WHERE dot_model__all_airports_data.airport IS NOT NULL 42 | GROUP BY dot_model__all_airports_data.airport 43 | HAVING count(*) > 1;" 44 | -------------------------------------------------------------------------------- /dot/self_tests/data/expected/get_test_parameters_non_negative_string_column.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/self_tests/data/expected/get_test_parameters_non_negative_string_column.json -------------------------------------------------------------------------------- /dot/self_tests/data/expected/integration/test_results_summary.csv: -------------------------------------------------------------------------------- 1 | ,run_id,test_id,entity_id,test_type,column_name,test_parameters,test_status,test_status_message,failed_tests_view,failed_tests_view_sql,rows_total,rows_failed,rows_passed 2 | 0,51479924-b5ff-4efc-9ba0-97cbd021b39f,cad13f73-27b5-3427-be8f-4d213bba3b19,all_flight_data,accepted_values,stops,"{'values': ['1', '2', '3', 'Non-stop']}",fail,"got 2 results, configured to fail if != 0",tr_dot_model__all_flight_data_accepted_values_stops," WITH all_values AS ( 3 | SELECT dot_model__all_flight_data.stops AS value_field, 4 | count(*) AS n_records 5 | FROM self_tests_public_tests.dot_model__all_flight_data 6 | GROUP BY dot_model__all_flight_data.stops 7 | ) 8 | SELECT all_values.value_field, 9 | all_values.n_records 10 | FROM all_values 11 | WHERE all_values.value_field::text <> ALL (ARRAY['1'::character varying, '2'::character varying, '3'::character varying, 'Non-stop'::character varying]::text[]);",1001,2,999 12 | 1,51479924-b5ff-4efc-9ba0-97cbd021b39f,a87d911d-28cb-3453-a777-b62cec58c7ba,all_flight_data,custom_sql,," 13 | select 14 | distinct uuid, 15 | 'dot_model__all_flight_data' as 
primary_table, 16 | 'uuid' as primary_table_id_field 17 | from {{ ref('dot_model__all_flight_data') }} 18 | where CAST(REGEXP_REPLACE(COALESCE(stops,'0'), '[^0-9]+', '0', 'g') as INTEGER) > 5 19 | ",fail,"got 1 result, configured to fail if != 0",tr_dot_model__all_flight_data_id7," SELECT DISTINCT dot_model__all_flight_data.uuid, 20 | 'dot_model__all_flight_data'::text AS primary_table, 21 | 'uuid'::text AS primary_table_id_field 22 | FROM self_tests_public_tests.dot_model__all_flight_data 23 | WHERE regexp_replace(COALESCE(dot_model__all_flight_data.stops, '0'::character varying)::text, '[^0-9]+'::text, '0'::text, 'g'::text)::integer > 5;",1001,1,1000 24 | 2,51479924-b5ff-4efc-9ba0-97cbd021b39f,368b65ac-b5e4-37a9-902d-5f385f94a9a0,all_flight_data,expression_is_true,,"{'name': 't_direct_flights_positive_price', 'condition': ""stops = 'non-stop'"", 'expression': 'price is not null and price > 0'}",fail,"got 10 results, configured to fail if != 0",tr_dot_model__all_flight_data_t_direct_flights_p_p," WITH meet_condition AS ( 25 | SELECT dot_model__all_flight_data.uuid, 26 | dot_model__all_flight_data.departure_time, 27 | dot_model__all_flight_data.airline, 28 | dot_model__all_flight_data.origin_airport, 29 | dot_model__all_flight_data.origin_iata, 30 | dot_model__all_flight_data.destination_airport, 31 | dot_model__all_flight_data.destination_iata, 32 | dot_model__all_flight_data.stops, 33 | dot_model__all_flight_data.price 34 | FROM self_tests_public_tests.dot_model__all_flight_data 35 | WHERE dot_model__all_flight_data.stops::text = 'non-stop'::text 36 | ) 37 | SELECT meet_condition.uuid, 38 | meet_condition.departure_time, 39 | meet_condition.airline, 40 | meet_condition.origin_airport, 41 | meet_condition.origin_iata, 42 | meet_condition.destination_airport, 43 | meet_condition.destination_iata, 44 | meet_condition.stops, 45 | meet_condition.price 46 | FROM meet_condition 47 | WHERE NOT (meet_condition.price IS NOT NULL AND meet_condition.price > 0::double precision);",1001,10,991 48 | 3,51479924-b5ff-4efc-9ba0-97cbd021b39f,ed27037a-4054-3070-9d88-fdf9cd0231c8,all_flight_data,not_negative_string_column,price,{'name': 'price'},fail,"got 1 result, configured to fail if != 0",tr_dot_model__all_flight_data_price," SELECT array_agg(dot_model__all_flight_data.uuid) AS uuid_list 49 | FROM self_tests_public_tests.dot_model__all_flight_data 50 | WHERE dot_model__all_flight_data.price::character varying::text ~~ '-%'::text 51 | HAVING count(*) > 0;",1001,1,1000 52 | 4,51479924-b5ff-4efc-9ba0-97cbd021b39f,df44c2f4-65f8-3170-a03f-6035aaa45660,all_flight_data,not_null,origin_airport,{},fail,"got 53 results, configured to fail if != 0",tr_dot_model__all_flight_data_not_null_origin_a," SELECT dot_model__all_flight_data.uuid, 53 | dot_model__all_flight_data.departure_time, 54 | dot_model__all_flight_data.airline, 55 | dot_model__all_flight_data.origin_airport, 56 | dot_model__all_flight_data.origin_iata, 57 | dot_model__all_flight_data.destination_airport, 58 | dot_model__all_flight_data.destination_iata, 59 | dot_model__all_flight_data.stops, 60 | dot_model__all_flight_data.price 61 | FROM self_tests_public_tests.dot_model__all_flight_data 62 | WHERE dot_model__all_flight_data.origin_airport IS NULL;",1001,53,948 63 | 5,51479924-b5ff-4efc-9ba0-97cbd021b39f,59b7fabd-acb8-3a38-8cbf-91736a214cab,all_flight_data,possible_duplicate_forms,,"{'table_specific_uuid': 'uuid', 'table_specific_period': 'day', 'table_specific_patient_uuid': 'airline', 'table_specific_reported_date': 'departure_time'}",fail,"got 1 
result, configured to fail if != 0",tr_dot_model__all_flight_data_possible_duplicate_f," WITH records_per_patient_period AS ( 64 | SELECT date_trunc('day'::text, dot_model__all_flight_data.departure_time::timestamp without time zone) AS date_period, 65 | dot_model__all_flight_data.airline AS patient_uuid_to_flag, 66 | count(dot_model__all_flight_data.uuid) AS number_of_records 67 | FROM self_tests_public_tests.dot_model__all_flight_data 68 | GROUP BY (date_trunc('day'::text, dot_model__all_flight_data.departure_time::timestamp without time zone)), dot_model__all_flight_data.airline 69 | ), possible_duplicate_combinations AS ( 70 | SELECT records_per_patient_period.date_period, 71 | records_per_patient_period.patient_uuid_to_flag, 72 | records_per_patient_period.number_of_records 73 | FROM records_per_patient_period 74 | WHERE records_per_patient_period.number_of_records > 1 75 | ) 76 | SELECT array_agg(m.uuid) AS uuid_list 77 | FROM possible_duplicate_combinations pdc 78 | LEFT JOIN self_tests_public_tests.dot_model__all_flight_data m ON date_trunc('day'::text, m.departure_time::timestamp without time zone) = pdc.date_period AND m.airline::text = pdc.patient_uuid_to_flag::text 79 | HAVING count(*) > 0;",1001,274,727 80 | 6,51479924-b5ff-4efc-9ba0-97cbd021b39f,2ba7f3e8-cd62-37ac-854f-01f704489130,all_flight_data,relationships,origin_airport,"{'to': ""ref('dot_model__all_airports_data')"", 'name': 'flight_with_no_airport', 'field': 'airport'}",fail,"got 1 result, configured to fail if != 0",tr_dot_model__all_flight_data_flight_with_no_a," SELECT array_agg(from_model.from_uuid) AS uuid_list 81 | FROM ( SELECT dot_model__all_flight_data.uuid AS from_uuid, 82 | dot_model__all_flight_data.origin_airport AS from_column_id 83 | FROM self_tests_public_tests.dot_model__all_flight_data) from_model 84 | LEFT JOIN ( SELECT dot_model__all_airports_data.airport AS to_id 85 | FROM self_tests_public_tests.dot_model__all_airports_data) to_model ON to_model.to_id::text = from_model.from_column_id::text 86 | WHERE from_model.from_column_id IS NOT NULL AND to_model.to_id IS NULL 87 | HAVING count(*) > 0;",1001,5,996 88 | 7,51479924-b5ff-4efc-9ba0-97cbd021b39f,942f4496-1202-3768-9cfe-96128bcd754c,all_airports_data,unique,airport,{},fail,"got 2 results, configured to fail if != 0",tr_dot_model__all_airports_data_unique_airport," SELECT dot_model__all_airports_data.airport AS unique_field, 89 | count(*) AS n_records 90 | FROM self_tests_public_tests.dot_model__all_airports_data 91 | WHERE dot_model__all_airports_data.airport IS NOT NULL 92 | GROUP BY dot_model__all_airports_data.airport 93 | HAVING count(*) > 1;",365,2,363 94 | 8,51479924-b5ff-4efc-9ba0-97cbd021b39f,8abccc35-874f-3e54-98eb-6eed5c00cf72,all_flight_data,expect_similar_means_across_reporters,price,"{'key': 'airline', 'quantity': 'price', 'id_column': 'airline', 'data_table': 'dot_model__all_flight_data', 'target_table': 'dot_model__airlines_data'}",fail,,chv_tr_different_dot_model__all_flight_data_price_distribution,,1001,1,1000 -------------------------------------------------------------------------------- /dot/self_tests/data/queries/dbt_core_generated_objects.sql: -------------------------------------------------------------------------------- 1 | -- these are generated the db objects generated by `test_run_dot_tests` 2 | 3 | CREATE OR REPLACE VIEW self_tests_public_tests.dot_model__airlines_data 4 | AS SELECT DISTINCT flight_data.airline 5 | FROM self_tests_public.flight_data; 6 | 7 | CREATE OR REPLACE VIEW 
self_tests_public_tests.chv_tr_different_dot_model__all_flight_data_price_distribution 8 | AS SELECT dot_model__airlines_data.airline, 9 | failed.failed 10 | FROM self_tests_public_tests.dot_model__airlines_data 11 | JOIN unnest(ARRAY['British Airways'::text]) failed(failed) ON failed.failed = dot_model__airlines_data.airline::text; 12 | 13 | CREATE OR REPLACE VIEW self_tests_public_tests.dot_model__all_airports_data 14 | AS SELECT airport_data.uuid, 15 | airport_data.airport, 16 | airport_data.airport_iata 17 | FROM self_tests_public.airport_data; 18 | 19 | CREATE OR REPLACE VIEW self_tests_public_tests.dot_model__all_flight_data 20 | AS SELECT flight_data.uuid, 21 | flight_data.departure_time, 22 | flight_data.airline, 23 | flight_data.origin_airport, 24 | flight_data.origin_iata, 25 | flight_data.destination_airport, 26 | flight_data.destination_iata, 27 | flight_data.stops, 28 | flight_data.price 29 | FROM self_tests_public.flight_data; 30 | 31 | CREATE OR REPLACE VIEW self_tests_public_tests.dot_model__ethiopia_airlines_data 32 | AS SELECT flight_data.uuid, 33 | flight_data.departure_time, 34 | flight_data.airline, 35 | flight_data.origin_airport, 36 | flight_data.origin_iata, 37 | flight_data.destination_airport, 38 | flight_data.destination_iata, 39 | flight_data.stops, 40 | flight_data.price 41 | FROM self_tests_public.flight_data 42 | WHERE flight_data.airline::text = 'Ethiopian Airlines'::text; 43 | 44 | CREATE OR REPLACE VIEW self_tests_public_tests.dot_model__zagreb_flight_data 45 | AS SELECT flight_data.uuid, 46 | flight_data.departure_time, 47 | flight_data.airline, 48 | flight_data.origin_airport, 49 | flight_data.origin_iata, 50 | flight_data.destination_airport, 51 | flight_data.destination_iata, 52 | flight_data.stops, 53 | flight_data.price 54 | FROM self_tests_public.flight_data 55 | WHERE flight_data.origin_airport::text = 'Zagreb airport'::text; 56 | 57 | CREATE OR REPLACE VIEW self_tests_public_tests.tr_dot_model__all_airports_data_unique_airport 58 | AS SELECT dot_model__all_airports_data.airport AS unique_field, 59 | count(*) AS n_records 60 | FROM self_tests_public_tests.dot_model__all_airports_data 61 | WHERE dot_model__all_airports_data.airport IS NOT NULL 62 | GROUP BY dot_model__all_airports_data.airport 63 | HAVING count(*) > 1; 64 | 65 | CREATE OR REPLACE VIEW self_tests_public_tests.tr_dot_model__all_flight_data_accepted_values_stops 66 | AS WITH all_values AS ( 67 | SELECT dot_model__all_flight_data.stops AS value_field, 68 | count(*) AS n_records 69 | FROM self_tests_public_tests.dot_model__all_flight_data 70 | GROUP BY dot_model__all_flight_data.stops 71 | ) 72 | SELECT all_values.value_field, 73 | all_values.n_records 74 | FROM all_values 75 | WHERE all_values.value_field::text <> ALL (ARRAY['1'::character varying, '2'::character varying, '3'::character varying, 'Non-stop'::character varying]::text[]); 76 | 77 | CREATE OR REPLACE VIEW self_tests_public_tests.tr_dot_model__all_flight_data_flight_with_no_a 78 | AS SELECT array_agg(from_model.from_uuid) AS uuid_list 79 | FROM ( SELECT dot_model__all_flight_data.uuid AS from_uuid, 80 | dot_model__all_flight_data.origin_airport AS from_column_id 81 | FROM self_tests_public_tests.dot_model__all_flight_data) from_model 82 | LEFT JOIN ( SELECT dot_model__all_airports_data.airport AS to_id 83 | FROM self_tests_public_tests.dot_model__all_airports_data) to_model ON to_model.to_id::text = from_model.from_column_id::text 84 | WHERE from_model.from_column_id IS NOT NULL AND to_model.to_id IS NULL 85 | HAVING 
count(*) > 0; 86 | 87 | CREATE OR REPLACE VIEW self_tests_public_tests.tr_dot_model__all_flight_data_not_null_origin_a 88 | AS SELECT dot_model__all_flight_data.uuid, 89 | dot_model__all_flight_data.departure_time, 90 | dot_model__all_flight_data.airline, 91 | dot_model__all_flight_data.origin_airport, 92 | dot_model__all_flight_data.origin_iata, 93 | dot_model__all_flight_data.destination_airport, 94 | dot_model__all_flight_data.destination_iata, 95 | dot_model__all_flight_data.stops, 96 | dot_model__all_flight_data.price 97 | FROM self_tests_public_tests.dot_model__all_flight_data 98 | WHERE dot_model__all_flight_data.origin_airport IS NULL; 99 | 100 | CREATE OR REPLACE VIEW self_tests_public_tests.tr_dot_model__all_flight_data_price 101 | AS SELECT array_agg(dot_model__all_flight_data.uuid) AS uuid_list 102 | FROM self_tests_public_tests.dot_model__all_flight_data 103 | WHERE dot_model__all_flight_data.price::character varying::text ~~ '-%'::text 104 | HAVING count(*) > 0; 105 | -------------------------------------------------------------------------------- /dot/self_tests/data/test_configuration_utils/dot_config.yml: -------------------------------------------------------------------------------- 1 | dot: 2 | save_passed_tests: False 3 | dot_db: 4 | type: postgres 5 | host: dot_db 6 | user: postgres 7 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 8 | port: 5432 9 | dbname: dot_db 10 | schema: dot 11 | threads: 4 12 | Muso_db: 13 | type: postgres 14 | host: dot_db 15 | user: postgres 16 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 17 | port: 5432 18 | dbname: dot_db 19 | schema: public 20 | threads: 4 21 | -------------------------------------------------------------------------------- /dot/self_tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/self_tests/integration/__init__.py -------------------------------------------------------------------------------- /dot/self_tests/integration/test_run_dot_tests.py: -------------------------------------------------------------------------------- 1 | """ Integration test: runs DOT for the demo dataset and checks the results """ 2 | import uuid 3 | import logging 4 | import math 5 | import pandas as pd 6 | from mock import patch 7 | from ..self_tests_utils.dbt_base_safe_test_class import DbtBaseSelfTestClass 8 | 9 | # UT after base_self_test_class imports 10 | from utils.run_management import run_dot_tests # pylint: disable=wrong-import-order 11 | from utils.utils import setup_custom_logger # pylint: disable=wrong-import-order 12 | from utils.connection_utils import ( # pylint: disable=wrong-import-order 13 | get_db_params_from_config, 14 | ) 15 | from utils.configuration_utils import ( # pylint: disable=wrong-import-order 16 | DbParamsConfigFile, 17 | DbParamsConnection, 18 | ) 19 | 20 | 21 | class RunDotTestsTest(DbtBaseSelfTestClass): 22 | """Test Class""" 23 | 24 | def setUp( 25 | self, 26 | dummy=None, # pylint: disable=unused-argument 27 | ) -> None: 28 | # load the DOT demo dataset 29 | self.create_self_tests_db_schema() 30 | 31 | self.cleanup_dbt_output_dir() 32 | 33 | def tearDown( 34 | self, 35 | debug=False, # pylint: disable=unused-argument 36 | ) -> None: 37 | super().tearDown( 38 | debug=False 39 | ) # if debug=True, do not remove results in the database 40 | 41 | @patch("utils.configuration_utils._get_filename_safely") 42 | def test_run_dot_tests( 43 | self, 
mock_get_filename_safely 44 | ): # pylint: disable=no-value-for-parameter 45 | """run all dot tests""" 46 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 47 | 48 | logger = setup_custom_logger( 49 | "self_tests/output/logs/run_everything.log", logging.INFO 50 | ) 51 | 52 | run_id = uuid.uuid4() 53 | 54 | run_dot_tests("ScanProject1", logger, run_id) 55 | 56 | # check results 57 | schema_dot, _, conn_dot = get_db_params_from_config( 58 | DbParamsConfigFile["dot_config.yml"], 59 | DbParamsConnection["dot"], 60 | "ScanProject1", 61 | ) 62 | 63 | test_results_summary = pd.read_sql( 64 | f"SELECT * FROM {schema_dot}.test_results_summary", conn_dot 65 | ) 66 | expected_test_results_summary = pd.read_csv( 67 | "self_tests/data/expected/integration/test_results_summary.csv", index_col=0 68 | ) 69 | pd.testing.assert_frame_equal( 70 | test_results_summary.drop(columns=["run_id"]), 71 | expected_test_results_summary.drop(columns=["run_id"]), 72 | ) 73 | 74 | test_results = pd.read_sql(f"SELECT * FROM {schema_dot}.test_results", conn_dot) 75 | expected_test_results = pd.read_csv( 76 | "self_tests/data/expected/integration/test_results.csv", index_col=0 77 | ) 78 | pd.testing.assert_frame_equal( 79 | expected_test_results.drop( 80 | columns=["run_id", "test_result_id", "id_column_value"] 81 | ), 82 | test_results.drop(columns=["run_id", "test_result_id", "id_column_value"]), 83 | ) 84 | self.assertListEqual( 85 | sorted( 86 | [ 87 | "" 88 | if v is None 89 | or (isinstance(v, float) and math.isnan(v)) 90 | or (v == "NULL") 91 | else str(v) 92 | for v in expected_test_results["id_column_value"] 93 | ] 94 | ), 95 | sorted( 96 | [ 97 | "" 98 | if v is None 99 | or (isinstance(v, float) and math.isnan(v)) 100 | or (v == "NULL") 101 | else str(v) 102 | for v in test_results["id_column_value"] 103 | ] 104 | ), 105 | ) 106 | -------------------------------------------------------------------------------- /dot/self_tests/self_tests_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/self_tests/self_tests_utils/__init__.py -------------------------------------------------------------------------------- /dot/self_tests/self_tests_utils/base_self_test_class.py: -------------------------------------------------------------------------------- 1 | """ base class for self tests""" 2 | import unittest 3 | import os 4 | import sys 5 | import shutil 6 | from typing import Tuple, Optional, Iterable 7 | from mock import patch 8 | 9 | import psycopg2 as pg 10 | import sqlalchemy as sa 11 | from psycopg2 import sql 12 | 13 | # go to `dot` directory, i.e. 
2 levels up to current test file 14 | os.chdir(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 15 | print(os.getcwd()) 16 | sys.path.append(".") 17 | 18 | # after this, imports from the dot_run can be done 19 | from utils.connection_utils import ( # pylint: disable=wrong-import-position 20 | get_db_params_from_config, 21 | DbParamsConnection, 22 | DbParamsConfigFile, 23 | ) 24 | from utils.configuration_utils import ( # pylint: disable=wrong-import-position 25 | dot_config_FILENAME, 26 | DBT_PROJECT_FINAL_FILENAME, 27 | ) 28 | 29 | 30 | class BaseSelfTestClass(unittest.TestCase): 31 | """ 32 | Base class for all tests, includes some utility functions for self test outputs and 33 | db connection 34 | """ 35 | 36 | @classmethod 37 | def setUpClass(cls): 38 | # prepare dir for output files 39 | test_output_path = "./self_tests/output" 40 | if os.path.isdir(test_output_path): 41 | shutil.rmtree(test_output_path) 42 | os.makedirs(test_output_path) 43 | 44 | @staticmethod 45 | def mock_get_filename_safely(path: str) -> str: 46 | """ 47 | Mock paths of config files 48 | 49 | Parameters 50 | ---------- 51 | path 52 | 53 | Returns 54 | ------- 55 | 56 | """ 57 | if path == dot_config_FILENAME: 58 | return "self_tests/data/base_self_test/dot_config.yml" 59 | if path == "./config/example/project_name/dbt/dbt_project.yml": 60 | return path 61 | if path == DBT_PROJECT_FINAL_FILENAME: 62 | return DBT_PROJECT_FINAL_FILENAME 63 | raise FileNotFoundError(f"file path {path} needs to be mocked") 64 | 65 | def setUp(self) -> None: 66 | """creates DB schema for the demo dataset by default""" 67 | self.create_self_tests_db_schema() 68 | 69 | def tearDown(self, debug=False) -> None: 70 | """drops the DB schema for the demo dataset by default""" 71 | self.drop_self_tests_db_schema(debug=debug) 72 | 73 | @patch("utils.configuration_utils._get_filename_safely") 74 | def get_self_tests_db_conn( 75 | self, 76 | mock_get_filename_safely, 77 | connection: DbParamsConnection = DbParamsConnection["dot"], 78 | ) -> Tuple[ 79 | str, sa.engine.base.Engine, pg.extensions.connection 80 | ]: # pylint: disable=no-value-for-parameter 81 | """ 82 | Obtains the db connection for the self tests db 83 | 84 | Parameters 85 | ---------- 86 | mock_get_filename_safely 87 | connection: DbParamsConnection 88 | enum for the connection to dot 89 | 90 | Returns 91 | ------- 92 | schema: str 93 | engine: sa.engine.base.Engine 94 | conn: pg.extensions.connection 95 | """ 96 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 97 | schema, engine, conn = get_db_params_from_config( 98 | DbParamsConfigFile["dot_config.yml"], 99 | connection, 100 | "ScanProject1", # TODO maybe should be a parameter; at least configurable somehow 101 | ) 102 | 103 | return schema, engine, conn 104 | 105 | def drop_self_tests_db_schema( 106 | self, 107 | schema: str = None, 108 | conn: Optional[pg.extensions.connection] = None, 109 | cursor: Optional[pg.extensions.cursor] = None, 110 | debug: bool = False, 111 | ) -> None: 112 | """ 113 | Drops the self tests' schema 114 | 115 | Parameters 116 | ---------- 117 | schema: str 118 | schema for self tests 119 | conn: Optional[pg.extensions.connection] 120 | connection to the selt tests db; if not provided will 121 | figure out 122 | cursor: Optional[pg.extensions.cursor] 123 | cursor within `conn`, if not provided will figure out 124 | debug: 125 | if True, it does not drop the schemas 126 | 127 | Returns 128 | ------- 129 | 130 | """ 131 | # TODO drop self_tests_public 
and self_test_public_tests 132 | if debug: 133 | return 134 | 135 | if schema is None or conn is None: 136 | ( 137 | schema, 138 | _, 139 | conn, 140 | ) = self.get_self_tests_db_conn() # pylint: disable=no-value-for-parameter 141 | 142 | if cursor is None: 143 | cursor = conn.cursor() 144 | 145 | query_drop = sql.SQL("drop schema if exists {name} cascade").format( 146 | name=sql.Identifier(schema) 147 | ) 148 | cursor.execute(query_drop) 149 | conn.commit() 150 | 151 | @staticmethod 152 | def get_queries_from_file(f, dot_schema, public_schema): 153 | """ 154 | Gets queries from file 155 | 156 | Parameters 157 | ---------- 158 | f: file 159 | file object 160 | schema: str 161 | schema for self tests 162 | 163 | Returns 164 | ------- 165 | transformed query lines 166 | """ 167 | all_query_lines = [] 168 | lines = f.readlines() 169 | for line in lines: 170 | if "create schema" in line.lower(): 171 | continue 172 | line = line.replace("dot.", f"{dot_schema}.") 173 | line = line.replace("public.", f"{public_schema}.") 174 | all_query_lines.append(line) 175 | return all_query_lines 176 | 177 | def create_self_tests_db_schema( 178 | self, 179 | additional_query: str = None, 180 | schema_filepath: str = "../db/dot/1-schema.sql", 181 | additional_filepaths: Iterable[str] = [ 182 | "../db/dot/2-upload_static_data.sql", 183 | "../db/dot/3-demo_data.sql", 184 | "../db/dot/4-upload_sample_dot_data.sql", 185 | ], 186 | do_recreate_schema: bool = True, 187 | ): 188 | """ 189 | Creates the self tests' schema and runs the queries in `additional_query` 190 | if provided 191 | 192 | Parameters 193 | ---------- 194 | additional_query 195 | string with valid queries to run 196 | schema_filepath 197 | path of the file that creates the schema 198 | additional_filepaths 199 | list of paths of the files that e.g. 
uploads the static data, creates project, etc 200 | do_recreate_schema 201 | drops and recreates the schema, True by default 202 | 203 | Returns 204 | ------- 205 | None 206 | """ 207 | schema_list = [] 208 | for member in list(DbParamsConnection.__members__): 209 | (schema, _, _) = self.get_self_tests_db_conn( 210 | connection=DbParamsConnection[member] 211 | ) 212 | schema_list.append(schema) 213 | 214 | ( 215 | schema_dot, 216 | _, 217 | conn, 218 | ) = self.get_self_tests_db_conn() # pylint: disable=no-value-for-parameter 219 | 220 | ( 221 | schema_project, 222 | _, 223 | conn, 224 | ) = self.get_self_tests_db_conn(connection=DbParamsConnection.project) 225 | 226 | cursor = conn.cursor() 227 | 228 | try: 229 | if do_recreate_schema: 230 | for sch in set(schema_list): 231 | self.drop_self_tests_db_schema(sch, conn, cursor) 232 | 233 | query_create = sql.SQL( 234 | """ 235 | CREATE SCHEMA {name}; 236 | """ 237 | ).format(name=sql.Identifier(sch)) 238 | cursor.execute(query_create) 239 | conn.commit() 240 | 241 | if schema_filepath is not None: 242 | with open(schema_filepath, "r") as f: 243 | all_query_lines = self.get_queries_from_file( 244 | f, schema_dot, schema_project 245 | ) 246 | 247 | # execute all queries 248 | cursor.execute("".join(all_query_lines)) 249 | conn.commit() 250 | 251 | if additional_filepaths is not None: 252 | for additional_filepath in additional_filepaths: 253 | with open(additional_filepath, "r") as f: 254 | all_query_lines = self.get_queries_from_file( 255 | f, schema_dot, schema_project 256 | ) 257 | 258 | # execute all queries 259 | cursor.execute("".join(all_query_lines)) 260 | conn.commit() 261 | 262 | if additional_query: 263 | cursor.execute(additional_query) 264 | conn.commit() 265 | 266 | except Exception as e: 267 | conn.rollback() 268 | raise e 269 | -------------------------------------------------------------------------------- /dot/self_tests/self_tests_utils/dbt_base_safe_test_class.py: -------------------------------------------------------------------------------- 1 | """base self tests class for tests checking the output of the DBT process""" 2 | import os 3 | import logging 4 | import shutil 5 | 6 | from mock import patch 7 | from ..self_tests_utils.base_self_test_class import BaseSelfTestClass 8 | 9 | from utils.utils import setup_custom_logger # pylint: disable=wrong-import-order 10 | 11 | from utils.dbt import ( # pylint: disable=wrong-import-order 12 | run_dbt_core, 13 | archive_previous_dbt_results, 14 | create_failed_dbt_test_models, 15 | run_dbt_test, 16 | ) 17 | 18 | 19 | class DbtBaseSelfTestClass(BaseSelfTestClass): 20 | @staticmethod 21 | def cleanup_dbt_output_dir(): 22 | # for safety: remove any previous dbt target directory and model files 23 | if os.path.isdir("dbt/target"): 24 | shutil.rmtree("dbt/target") 25 | for path in os.listdir("dbt/"): 26 | if path.startswith("models") or path.startswith("tests"): 27 | shutil.rmtree(f"dbt/{path}") 28 | 29 | @patch("utils.configuration_utils._get_filename_safely") 30 | def setUp( 31 | self, mock_get_filename_safely 32 | ) -> None: # pylint: disable=no-value-for-parameter 33 | super().setUp() 34 | 35 | self.cleanup_dbt_output_dir() 36 | 37 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 38 | 39 | self.dbt_test_setup() 40 | 41 | def dbt_test_setup(self): 42 | """ 43 | setup for dbt tests 44 | 45 | - dbt_project config file 46 | - entities to be tested 47 | """ 48 | shutil.copy( 49 | "./config/example/self_tests/dbt/dbt_project.yml", "./dbt/dbt_project.yml" 50 | ) 51 
| 52 | # copy the models 53 | # (i.e. in the full DOT pipeline these are generated from the configured_entities) 54 | shutil.rmtree("dbt/models", ignore_errors=True) 55 | shutil.copytree( 56 | "self_tests/data/dot_input_files/dbt", "dbt/models/ScanProject1" 57 | ) 58 | 59 | @staticmethod 60 | def run_dbt_steps(): 61 | """ 62 | Runs all the actions for dbt 63 | """ 64 | project_id = "ScanProject1" 65 | logger = setup_custom_logger("self_tests/output/test.log", logging.INFO) 66 | run_dbt_core(project_id, logger) 67 | archive_previous_dbt_results(logger) 68 | create_failed_dbt_test_models(project_id, logger, "view") 69 | run_dbt_test(project_id, logger) 70 | -------------------------------------------------------------------------------- /dot/self_tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/self_tests/unit/__init__.py -------------------------------------------------------------------------------- /dot/self_tests/unit/test_configuration_utils.py: -------------------------------------------------------------------------------- 1 | """ Tests of configuration utils module """ 2 | 3 | from mock import patch 4 | from ..self_tests_utils.base_self_test_class import BaseSelfTestClass 5 | 6 | # UT after base_self_test_class imports 7 | from utils.configuration_utils import ( # pylint: disable=wrong-import-order 8 | get_dbt_config_custom_schema_output_objects, 9 | ) 10 | 11 | 12 | class ConfigUtilsTest(BaseSelfTestClass): 13 | """Test Class""" 14 | 15 | @patch("utils.configuration_utils._get_filename_safely") 16 | def test_dbt_config_custom_schema_output_objects(self, mock_get_filename_safely): 17 | """test get_dbt_config_custom_schema_output_objects""" 18 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 19 | 20 | assert get_dbt_config_custom_schema_output_objects() == "tests" 21 | assert get_dbt_config_custom_schema_output_objects() == "tests" 22 | assert get_dbt_config_custom_schema_output_objects() == "tests" 23 | -------------------------------------------------------------------------------- /dot/self_tests/unit/test_connection_utils.py: -------------------------------------------------------------------------------- 1 | from ..self_tests_utils.base_self_test_class import BaseSelfTestClass 2 | 3 | # UT after base_self_test_class imports 4 | from utils.connection_utils import ( # pylint: disable=wrong-import-order 5 | remove_ge_schema_parameters, 6 | add_ge_schema_parameters, 7 | ) 8 | 9 | 10 | class ConnUtilsTest(BaseSelfTestClass): 11 | """Test Clase""" 12 | 13 | @staticmethod 14 | def test_remove_ge_schema_parameters(): 15 | """test function remove_ge_schema_parameters""" 16 | assert remove_ge_schema_parameters( 17 | [ 18 | { 19 | "key": "reported_by", 20 | "quantity": "child_temperature_pre_chw", 21 | "form_name": "dot_model__iccmview_assessment", 22 | "id_column": "reported_by", 23 | "schema_core": "public_tests", 24 | "schema_source": "public", 25 | }, 26 | {"another_param": "v", "schema_core": "public_tests"}, 27 | ] 28 | ) == [ 29 | { 30 | "key": "reported_by", 31 | "quantity": "child_temperature_pre_chw", 32 | "form_name": "dot_model__iccmview_assessment", 33 | "id_column": "reported_by", 34 | }, 35 | { 36 | "another_param": "v", 37 | }, 38 | ] 39 | 40 | @staticmethod 41 | def test_add_ge_schema_parameters(): 42 | """test function add_ge_schema_parameters""" 43 | assert add_ge_schema_parameters( 
44 | { 45 | "key": "reported_by", 46 | "quantity": "child_temperature_pre_chw", 47 | "form_name": "dot_model__iccmview_assessment", 48 | "id_column": "reported_by", 49 | }, 50 | project_id=None, 51 | schema_core="whatever", 52 | schema_source="public", 53 | ) == { 54 | "key": "reported_by", 55 | "quantity": "child_temperature_pre_chw", 56 | "form_name": "dot_model__iccmview_assessment", 57 | "id_column": "reported_by", 58 | "schema_core": "whatever", 59 | "schema_source": "public", 60 | } 61 | -------------------------------------------------------------------------------- /dot/self_tests/unit/test_core_entities_creation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from mock import patch 4 | from ..self_tests_utils.base_self_test_class import BaseSelfTestClass 5 | 6 | # UT after base_self_test_class imports 7 | from utils.dbt import create_core_entities # pylint: disable=wrong-import-order 8 | 9 | 10 | class CoreEntitiesCreationTest(BaseSelfTestClass): 11 | """Test Class""" 12 | 13 | @patch("utils.configuration_utils._get_filename_safely") 14 | def test_yaml_creation( 15 | self, mock_get_filename_safely 16 | ): # pylint: disable=no-value-for-parameter 17 | """test yaml file creation for 1 core entity -see file in filename below""" 18 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 19 | 20 | schema, _, conn = self.get_self_tests_db_conn() 21 | output_dir = "self_tests/output/test_yaml_creation" 22 | schema_project = "schema_project" 23 | project_id = "ScanProject1" 24 | create_core_entities( 25 | schema, 26 | conn, 27 | schema_project, 28 | project_id, 29 | output_dir, 30 | logger=logging.getLogger(), 31 | ) 32 | conn.close() 33 | 34 | filename = "dot_model__all_flight_data.sql" 35 | assert os.path.isfile(os.path.join(output_dir, filename)) 36 | with open(f"self_tests/data/expected/{filename}", "r") as expected: 37 | expected_lines = expected.readlines() 38 | with open(os.path.join(output_dir, filename), "r") as result: 39 | result_lines = result.readlines() 40 | self.assertListEqual(expected_lines, result_lines) 41 | -------------------------------------------------------------------------------- /dot/self_tests/unit/test_dbt.py: -------------------------------------------------------------------------------- 1 | """ tests for utils/dbt.py """ 2 | 3 | import uuid 4 | import logging 5 | import pandas as pd 6 | from mock import patch 7 | 8 | from ..self_tests_utils.base_self_test_class import BaseSelfTestClass 9 | 10 | # UT after base_self_test_class imports 11 | from utils.dbt import ( # pylint: disable=wrong-import-order 12 | extract_df_from_dbt_test_results_json, 13 | get_view_definition, 14 | ) 15 | from utils.utils import ( # pylint: disable=wrong-import-order 16 | setup_custom_logger, 17 | format_uuid_list, 18 | ) 19 | 20 | 21 | class DbtUtilsTest(BaseSelfTestClass): 22 | """Test Class""" 23 | 24 | def setUp(self) -> None: 25 | with open("self_tests/data/queries/dbt_core_generated_objects.sql", "r") as f: 26 | self.create_self_tests_db_schema(additional_query=f.read()) 27 | 28 | @patch("utils.configuration_utils._get_filename_safely") 29 | def test_extract_df_from_dbt_test_results_json( 30 | self, mock_get_filename_safely 31 | ): # pylint: disable=no-value-for-parameter 32 | """ 33 | test output df generated from dbt results in json format 34 | (dbt target directory) 35 | """ 36 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 37 | 38 | run_id = 
uuid.UUID("4541476c-814e-43fe-ab38-786f36beecbc") 39 | output = extract_df_from_dbt_test_results_json( 40 | run_id=run_id, 41 | project_id="ScanProject1", 42 | logger=setup_custom_logger("self_tests/output/test.log", logging.INFO), 43 | target_path="self_tests/data/dot_output_files/dbt/target", 44 | ) 45 | 46 | expected = pd.read_csv( 47 | "self_tests/data/expected/extract_df_from_dbt_test_results_json.csv", 48 | index_col=0, 49 | ).fillna("") 50 | skip_columns = [ 51 | "run_id", 52 | "id_column_name", 53 | ] 54 | pd.testing.assert_frame_equal( 55 | output.drop(columns=skip_columns), expected.drop(columns=skip_columns) 56 | ) 57 | 58 | @patch("utils.configuration_utils._get_filename_safely") 59 | def test_get_view_definition( 60 | self, mock_get_filename_safely 61 | ): # pylint: disable=no-value-for-parameter 62 | """ 63 | test for function get_view_definition; needs db connection & the test view 64 | """ 65 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 66 | 67 | self.assertEqual( 68 | get_view_definition( 69 | "ScanProject1", 70 | "chv_tr_different_dot_model__all_flight_data_price_distribution", 71 | ), 72 | " SELECT dot_model__airlines_data.airline,\n" 73 | " failed.failed\n" 74 | " FROM self_tests_public_tests.dot_model__airlines_data\n" 75 | " JOIN unnest(ARRAY['British Airways'::text]) failed(failed)" 76 | " ON failed.failed = dot_model__airlines_data.airline::text;", 77 | ) 78 | 79 | @staticmethod 80 | def test_format_uuid_list(): 81 | """ 82 | Formats `uuid_list` from postgres as actually a list 83 | 84 | Returns 85 | ------- 86 | 87 | """ 88 | assert format_uuid_list("{fc9f60d4-3cbf-3493-918e-a01478aa91db}") == [ 89 | "fc9f60d4-3cbf-3493-918e-a01478aa91db", 90 | ] 91 | assert format_uuid_list( 92 | "{f542d6ed-7fa7-3d86-b054-8dacf1a73406," 93 | "04c739e0-13ea-3c8f-9e65-38eeafcca330," 94 | "fa8a11a6-79ab-307b-bede-81cbff179e46}" 95 | ) == [ 96 | "f542d6ed-7fa7-3d86-b054-8dacf1a73406", 97 | "04c739e0-13ea-3c8f-9e65-38eeafcca330", 98 | "fa8a11a6-79ab-307b-bede-81cbff179e46", 99 | ] 100 | -------------------------------------------------------------------------------- /dot/self_tests/unit/test_dbt_logs.py: -------------------------------------------------------------------------------- 1 | """ tests for utils/dbt.py """ 2 | 3 | import ast 4 | 5 | from mock import patch 6 | from ..self_tests_utils.dbt_base_safe_test_class import DbtBaseSelfTestClass 7 | 8 | # functions under test 9 | from utils.dbt_logs import ( # pylint: disable=wrong-import-order 10 | DbtOutputProcessedRow, 11 | read_dbt_logs, 12 | _get_test_parameters, 13 | _get_test_type, 14 | process_dbt_logs_row, 15 | ) 16 | 17 | 18 | class DbtLogsUtilsTest(DbtBaseSelfTestClass): 19 | """Test Class for dbt log processing""" 20 | 21 | def test_read_dbt_logs(self): 22 | """ 23 | This test is not really so useful; a better test would run dbt on the inputs 24 | and check that the logs have not changed 25 | """ 26 | output = read_dbt_logs( 27 | target_path="self_tests/data/dot_output_files/dbt/target", 28 | ) 29 | with open("self_tests/data/expected/read_dbt_output_files.json", "r") as f: 30 | expected = ast.literal_eval(f.read()) 31 | self.assertEqual(output, expected) 32 | 33 | def test_get_test_parameters_non_neg_string_column(self): 34 | """ 35 | this test should be refactored to use not a full node from a manifest 36 | but a json constructed in the test itself w the right parameters 37 | TODO add test for mode node types, and in particular cutom_sql nodes 38 | """ 39 | with open( 40 | 
"self_tests/data/dot_output_files/dbt/manifest_node_ex_non_negative_string_column.json", 41 | "r", 42 | ) as f: 43 | node = ast.literal_eval(f.read()) 44 | output = _get_test_parameters(node, "not_negative_string_column") 45 | self.assertEqual(output, "{'name': 'value'}") 46 | 47 | def test_get_test_type(self): # pylint: disable=no-value-for-parameter 48 | """ 49 | Gets test type from dbt manifest metadata 50 | """ 51 | node = {"test_metadata": {"name": "test_type_x"}} 52 | self.assertEqual(_get_test_type(node), "test_type_x") 53 | node = { 54 | "test_metadata": {}, 55 | "original_file_path": "tests/ScanProject1/test_x.sql", 56 | } 57 | self.assertEqual(_get_test_type(node), "custom_sql") 58 | node = {"test_metadata": {}} 59 | self.assertEqual(_get_test_type(node), None) 60 | 61 | def test_process_dbt_logs_row(self): 62 | """ 63 | Same as test_read_dbt_output_files, will not detect a problem due to dbt 64 | version changing logs 65 | """ 66 | with open("self_tests/data/expected/read_dbt_output_files.json", "r") as f: 67 | # below contains lines read from logs passed through read_dbt_output_files 68 | expected_lines = ast.literal_eval(f.read()) 69 | res = process_dbt_logs_row(expected_lines[0]) 70 | expected = DbtOutputProcessedRow( 71 | unique_id="test.dbt_model_1.not_null_dot_model__all_flight_data_origin_airport.2196b664b6", 72 | test_type="not_null", 73 | test_status="fail", 74 | test_message="got 53 results, configured to fail if != 0", 75 | column_name="origin_airport", 76 | entity_id="dot_model__all_flight_data", 77 | test_parameters="{}", 78 | short_test_name="tr_dot_model__all_flight_data_not_null_origin_a", 79 | ) 80 | self.assertEqual(res, expected) 81 | -------------------------------------------------------------------------------- /dot/self_tests/unit/test_dbt_logs_safe.py: -------------------------------------------------------------------------------- 1 | """ tests for utils/dbt.py """ 2 | 3 | import ast 4 | 5 | from ..self_tests_utils.dbt_base_safe_test_class import DbtBaseSelfTestClass 6 | 7 | # functions under test 8 | from utils.dbt_logs import ( # pylint: disable=wrong-import-order 9 | DbtOutputProcessedRow, 10 | read_dbt_logs, 11 | process_dbt_logs_row, 12 | ) 13 | 14 | 15 | class DbtLogsUtilsTest(DbtBaseSelfTestClass): 16 | """Test Class for dbt log processing 17 | 18 | safe test -meaning it will detect if a change of version in DBT 19 | changes the output logs in a way that will make DOT fail 20 | 21 | (i.e. because DOT relies on DBT logs, and that's not really safe) 22 | """ 23 | 24 | def setUp(self) -> None: # pylint: disable=arguments-differ 25 | super().setUp() # pylint: disable=no-value-for-parameter 26 | 27 | # i.e. 
DBT is run for each of the tests on this class 28 | self.run_dbt_steps() 29 | 30 | @staticmethod 31 | def _cleanup_schema_name(value): 32 | """ 33 | Cleans up schema from self_tests_dot to dot 34 | """ 35 | return value.replace("self_tests_", "") if isinstance(value, str) else value 36 | 37 | def check_output_recursive( 38 | self, 39 | exp_line: str, 40 | out_line: str, 41 | skip_keys: dict = { 42 | 0: ["timing", "execution_time", "thread_id"], 43 | 1: ["created_at", "root_path"], 44 | }, 45 | recursion_level: int = 0, 46 | ): 47 | """check outputs recursively for dbt logs""" 48 | for exp_k, exp_v in exp_line.items(): 49 | if exp_k in skip_keys.get(recursion_level, []): 50 | continue 51 | out_line_v = out_line.get(exp_k) 52 | if isinstance(exp_v, dict): 53 | self.check_output_recursive( 54 | exp_v, out_line_v, skip_keys, recursion_level + 1 55 | ) 56 | else: 57 | self.assertEqual( 58 | self._cleanup_schema_name(out_line_v), 59 | self._cleanup_schema_name(exp_v), 60 | f"failed key {exp_k}; expected: {exp_v}, output: {out_line.get(exp_k)}", 61 | ) 62 | 63 | def test_read_dbt_logs_safe(self): 64 | """ 65 | Will detect a change in logs due to dbt versions 66 | """ 67 | 68 | # 2. test that the outputs are still ok 69 | output = read_dbt_logs( 70 | target_path="dbt/target", # i.e. the usual execution path 71 | ) 72 | with open("self_tests/data/expected/read_dbt_output_files.json", "r") as f: 73 | expected = ast.literal_eval(f.read()) 74 | self.assertEqual(len(output), len(expected)) 75 | for exp_line in expected: 76 | unique_id = exp_line["unique_id"] 77 | out_lines = [l for l in output if l.get("unique_id") == unique_id] 78 | self.assertEqual( 79 | len(out_lines), 80 | 1, 81 | f"there should be 1 and only 1 output w unique_id {unique_id}", 82 | ) 83 | out_line = out_lines[0] 84 | self.check_output_recursive(exp_line, out_line) 85 | 86 | def test_process_dbt_logs_row_safe(self): 87 | """ 88 | Will detect a change in logs due to dbt versions, processing the raw and 89 | looking only for the required parameters 90 | """ 91 | 92 | # 2. check results 93 | output = read_dbt_logs( 94 | target_path="dbt/target", # i.e. the usual execution path 95 | ) 96 | checked = False 97 | for line in output: 98 | res = process_dbt_logs_row(line) 99 | if res.test_type == "not_negative_string_column": 100 | expected = DbtOutputProcessedRow( 101 | unique_id="test.dbt_model_1." 102 | "not_negative_string_column_dot_model__all_flight_data_price__price." 
103 | "322389c2ba", 104 | test_type="not_negative_string_column", 105 | test_status="fail", 106 | test_message="got 1 result, configured to fail if != 0", 107 | column_name="price", 108 | entity_id="dot_model__all_flight_data", 109 | test_parameters="{'name': 'price'}", 110 | short_test_name="tr_dot_model__all_flight_data_price", 111 | ) 112 | self.assertEqual(res, expected) 113 | checked = True 114 | 115 | self.assertEqual(checked, True) 116 | -------------------------------------------------------------------------------- /dot/self_tests/unit/test_dot_utils_schema_improved.py: -------------------------------------------------------------------------------- 1 | """ 2 | Replicates tests in test_dot_utils.py adding the column id_column_name to the schema 3 | """ 4 | import uuid 5 | import logging 6 | 7 | from mock import patch 8 | from .test_dot_utils import UtilsTest 9 | 10 | # UT after base_self_test_class imports 11 | from utils.utils import ( # pylint: disable=wrong-import-order 12 | get_configured_tests_row, 13 | get_test_rows, 14 | setup_custom_logger, 15 | ) 16 | 17 | 18 | class UtilsTestImproved(UtilsTest): 19 | """Test Class""" 20 | 21 | def setUp(self) -> None: 22 | self.create_self_tests_db_schema( 23 | "\n".join( 24 | [ 25 | "ALTER TABLE self_tests_dot.configured_tests " 26 | "ADD COLUMN id_column_name VARCHAR(300) NULL;", 27 | "UPDATE self_tests_dot.configured_tests " 28 | "SET id_column_name = 'uuid';", 29 | ] 30 | ) 31 | ) 32 | 33 | def tearDown(self) -> None: 34 | self.drop_self_tests_db_schema() 35 | 36 | @patch("utils.configuration_utils._get_filename_safely") 37 | def test_get_configured_tests_row( 38 | self, mock_get_filename_safely 39 | ): # pylint: disable=no-value-for-parameter 40 | """test yaml file creation for 1 core entity -see file in filename below""" 41 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 42 | 43 | configured_tests_row = get_configured_tests_row( 44 | test_type="accepted_values", 45 | entity_id="all_flight_data", 46 | column="stops", 47 | project_id="ScanProject1", 48 | test_parameters='{"values": ["1", "2", "3", "Non-stop"]}', 49 | ) 50 | expected_test_id = "cad13f73-27b5-3427-be8f-4d213bba3b19" 51 | self.assertEqual( 52 | expected_test_id, 53 | configured_tests_row["test_id"], 54 | f"difference in generated_test_id {configured_tests_row['test_id']} " 55 | f"vs {expected_test_id} for possible_duplicate_forms test", 56 | ) 57 | expected_row = { 58 | "test_activated": True, 59 | "project_id": "ScanProject1", 60 | "test_id": expected_test_id, 61 | "scenario_id": "INCONSISTENT-1", 62 | "priority": 3, 63 | "description": "Disallowed FP methods entered in form", 64 | "impact": "", 65 | "proposed_remediation": "", 66 | "entity_id": "all_flight_data", 67 | "test_type": "accepted_values", 68 | "column_name": "stops", 69 | "column_description": "", 70 | "id_column_name": "uuid", 71 | "test_parameters": "{'values': ['1', '2', '3', 'Non-stop']}", 72 | "last_updated_by": "Matt", 73 | } 74 | for k, v in expected_row.items(): # pylint: disable=invalid-name 75 | self.assertEqual( 76 | str(v), 77 | str(configured_tests_row.get(k)), 78 | f"difference in {k}; {v} vs {configured_tests_row[k]}", 79 | ) 80 | 81 | @patch("utils.configuration_utils._get_filename_safely") 82 | def test_get_test_rows( 83 | self, mock_get_filename_safely 84 | ): # pylint: disable=no-value-for-parameter 85 | """test get failing rows for custom test""" 86 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 87 | 88 | # create data for the test view that 
has failing rows 89 | run_id = uuid.UUID("4541476c-814e-43fe-ab38-786f36beecbc") 90 | self.prepare_failing_test_view() 91 | 92 | # create data for the test view of failing rows 93 | test_summary, run_id = self.get_test_summary(run_id) 94 | test_rows = get_test_rows( 95 | test_summary, 96 | run_id, 97 | project_id="ScanProject1", 98 | logger=setup_custom_logger("self_tests/output/test.log", logging.INFO), 99 | ) 100 | self.assertEqual( 101 | len(test_rows.id_column_value.to_list()), 102 | 253, 103 | ) 104 | self.assertEqual( 105 | sorted(test_rows.id_column_value.to_list())[0], 106 | "000ea267-ffb3-3a58-8e71-eaa3c6a0a81f", 107 | ) 108 | -------------------------------------------------------------------------------- /dot/self_tests/unit/test_generate_tests_from_db.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | from mock import patch 4 | from ..self_tests_utils.base_self_test_class import BaseSelfTestClass 5 | 6 | # UT after base_self_test_class imports 7 | from utils.configuration_management import ( 8 | generate_tests_from_db, 9 | ) # pylint: disable=wrong-import-order 10 | 11 | 12 | class GenerateTestsFromDbTest(BaseSelfTestClass): 13 | """Test Class""" 14 | 15 | @patch("utils.configuration_utils._get_filename_safely") 16 | @pytest.mark.skip("intermediate commit - WIP for this test") 17 | def test_generate_tests_from_db( 18 | self, mock_get_filename_safely 19 | ): # pylint: disable=no-value-for-parameter 20 | """test yaml file creation for 1 core entity -see file in filename below""" 21 | mock_get_filename_safely.side_effect = self.mock_get_filename_safely 22 | 23 | dot_tests = generate_tests_from_db( 24 | project_id="Muso", logger=logging.getLogger() 25 | ) 26 | assert dot_tests 27 | assert False 28 | -------------------------------------------------------------------------------- /dot/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/dot/utils/__init__.py -------------------------------------------------------------------------------- /dot/utils/configuration_utils.py: -------------------------------------------------------------------------------- 1 | """ Configuration utils """ 2 | 3 | import os 4 | import re 5 | from enum import Enum 6 | from pathlib import Path 7 | from typing import Iterable, Optional 8 | import yaml 9 | 10 | DbParamsConfigFile = Enum("DbParamsConfigFile", "dot_config.yml") 11 | DbParamsConnection = Enum("DbParamsConnection", "dot project project_test project_core") 12 | 13 | dot_config_FILENAME = "./config/dot_config.yml" 14 | DBT_PROJECT_FINAL_FILENAME = "./dbt/dbt_project.yml" 15 | DBT_PROFILES_FINAL_FILENAME = f"{Path.home()}/.dbt/profiles.yml" 16 | GE_GREAT_EXPECTATIONS_FINAL_FILENAME = "./great_expectations/great_expectations.yml" 17 | GE_BATCH_CONFIG_FINAL_FILENAME = "./great_expectations/batch_config.json" 18 | GE_CONFIG_VARIABLES_FINAL_FILENAME = ( 19 | "./great_expectations/uncommitted/config_variables.yml" 20 | ) 21 | DBT_MODELNAME_PREFIX = "dot_model__" 22 | 23 | DBT_PROJECT_SEPARATOR = "/" 24 | 25 | 26 | def _get_filename_safely(path: str) -> str: 27 | """ 28 | Internal function - checks if the path exists 29 | 30 | Parameters 31 | ---------- 32 | path : str 33 | path of the file 34 | Returns 35 | ------- 36 | path : str 37 | path of the file 38 | """ 39 | if not os.path.isfile(path): 40 | raise 
FileNotFoundError(f"Cannot find file {path}") 41 | return path 42 | 43 | 44 | def _get_credentials( 45 | db_config: dict, project_id: str, connection_params: DbParamsConnection 46 | ) -> dict: 47 | """ 48 | Internal function - gets credentials either for the project or the DOT 49 | 50 | Parameters 51 | ---------- 52 | db_config : dict 53 | dictionary containing all credentials (from dot_config.yml) 54 | project_id : str 55 | Project id as found in dot.project_id, eg 'Muso' 56 | connection_params : DbParamsConnection 57 | Enum for DOT vs project db connection 58 | Returns 59 | ------- 60 | path : str 61 | path of the file 62 | """ 63 | key = connection_params.name 64 | if connection_params in [ 65 | DbParamsConnection["project"], 66 | DbParamsConnection["project_test"], 67 | DbParamsConnection["project_core"], 68 | ]: 69 | key = project_id 70 | if (db_config is None) or (f"{key}_db" not in db_config.keys()): 71 | raise Exception( 72 | f"review malformed config at dot_config.yml; content of file as follows '{db_config}'" 73 | ) 74 | creds = db_config[f"{key}_db"] 75 | 76 | if connection_params in [ 77 | DbParamsConnection["project_test"], 78 | DbParamsConnection["project_core"], 79 | ]: 80 | # add schema suffix, if present 81 | schema_suffix = get_dbt_config_custom_schema_output_objects() 82 | if schema_suffix: 83 | creds["schema"] = "_".join( 84 | [ 85 | creds["schema"], 86 | schema_suffix, 87 | ] 88 | ) 89 | return creds 90 | 91 | 92 | def load_credentials(project_id: str, connection_params: DbParamsConnection) -> dict: 93 | """ 94 | Loads credentials and transforms password for project id and connection 95 | 96 | Parameters 97 | ---------- 98 | project_id : str 99 | Project ID, eg 'Muso'. Must align with project_id in dot.projects 100 | connection_params : DbParamsConnection 101 | enum type 102 | """ 103 | db_config = load_config_file() 104 | 105 | db_credentials = _get_credentials(db_config, project_id, connection_params) 106 | 107 | # Support dbt environment variable format 108 | db_credentials["pass"] = extract_dbt_config_env_variable(db_credentials["pass"]) 109 | return db_credentials 110 | 111 | 112 | def load_config_file(): 113 | """ 114 | Reads config file safely 115 | 116 | Returns 117 | ------- 118 | config: str 119 | content of config file 120 | """ 121 | with open(_get_filename_safely(dot_config_FILENAME)) as f: 122 | return yaml.load(f, Loader=yaml.FullLoader) 123 | 124 | 125 | def extract_dbt_config_env_variable(dbt_setting: dict) -> str: 126 | """Takes a dbt config file and replaces any environment variable syntax with the 127 | environment variable. Syntax looks like this ... 128 | 129 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 130 | 131 | Parameters 132 | ---------- 133 | dbt_setting : dict 134 | credentials dictionary 135 | Returns 136 | ------- 137 | val : str 138 | The environment variable value 139 | 140 | """ 141 | val = dbt_setting 142 | if "env_var" in dbt_setting: 143 | env_variable = re.search(r"env_var\(\'(.*?)\'\)", dbt_setting).group(1) 144 | return os.getenv(env_variable) 145 | return val 146 | 147 | 148 | def get_dbt_config_original_model_paths(dbt_config: Optional[dict] = None) -> str: 149 | """ 150 | Gets original (i.e. project_independent) path of `model-paths` (i.e. where DBT 151 | models are located) from dbt_project.yml config file items 152 | 153 | Parameters 154 | ---------- 155 | dbt_config : Optional[dict] 156 | dbt_project.yml config file 157 | Returns 158 | ------- 159 | path : str 160 | e.g. 
models 161 | """ 162 | return _get_dbt_config_original_path("model-paths", dbt_config) 163 | 164 | 165 | def get_dbt_config_original_test_paths(dbt_config: Optional[dict] = None) -> str: 166 | """ 167 | Gets original (i.e. project_independent) path of `test-paths` (i.e. where DBT 168 | tests are located) from dbt_project.yml config file items 169 | 170 | Parameters 171 | ---------- 172 | dbt_config : Optional[dict] 173 | dbt_project.yml config file 174 | Returns 175 | ------- 176 | path : str 177 | e.g. tests 178 | """ 179 | return _get_dbt_config_original_path("test-paths", dbt_config) 180 | 181 | 182 | def get_dbt_config_model_paths(dbt_config: Optional[dict] = None) -> str: 183 | """ 184 | Gets project-dependent path of `models-paths` (i.e. where DBT models are located) 185 | from dbt_project.yml config file items 186 | 187 | Parameters 188 | ---------- 189 | dbt_config : Optional[dict] 190 | dbt_project.yml config file 191 | Returns 192 | ------- 193 | path : str 194 | e.g. models_Muso 195 | """ 196 | return _get_dbt_config_key("model-paths", dbt_config) 197 | 198 | 199 | def get_dbt_config_test_paths(dbt_config: Optional[dict] = None) -> str: 200 | """ 201 | Gets project-dependent path of `tests-paths` (i.e. where DBT tests are located) 202 | from dbt_project.yml config file items 203 | 204 | Parameters 205 | ---------- 206 | dbt_config : Optional[dict] 207 | dbt_project.yml config file 208 | Returns 209 | ------- 210 | path : str 211 | e.g. tests_Muso 212 | """ 213 | return _get_dbt_config_key("test-paths", dbt_config) 214 | 215 | 216 | def _get_dbt_config_key(key: str, dbt_config: Optional[dict] = None) -> str: 217 | """ 218 | Gets key from dbt_project.yml config file items 219 | Converts the result from list to str, assuming the list has only one element 220 | 221 | Parameters 222 | ---------- 223 | key : str 224 | key of dbt_project.yml config file 225 | dbt_config : Optional[dict] 226 | dbt_project.yml config file 227 | Returns 228 | ------- 229 | path : str 230 | e.g. models_Muso 231 | """ 232 | if dbt_config is None: 233 | filename = _get_filename_safely(DBT_PROJECT_FINAL_FILENAME) 234 | with open(filename) as f: 235 | dbt_config = yaml.load(f, Loader=yaml.FullLoader) 236 | 237 | if len(dbt_config[key]) != 1: 238 | raise NotImplementedError( 239 | f"current implementation expects a list of size exactly 1 for {key} " 240 | f"in dbt_config (list={dbt_config[key]})" 241 | ) 242 | 243 | return dbt_config[key][0] 244 | 245 | 246 | def get_dbt_config_custom_schema_output_objects( 247 | dot_config: Optional[dict] = None, 248 | ) -> str: 249 | """ 250 | Get schema suffix for test objects generated by dbt 251 | 252 | Parameters 253 | ---------- 254 | dot_config : Optional[dict] 255 | dot_project.yml config file 256 | Returns 257 | ------- 258 | path : str 259 | schema suffix for test objects 260 | """ 261 | if dot_config is None: 262 | with open(_get_filename_safely(dot_config_FILENAME)) as f: 263 | dot_config = yaml.load(f, Loader=yaml.FullLoader) 264 | 265 | return dot_config.get("dot", {}).get("output_schema_suffix", None) 266 | 267 | 268 | def _get_dbt_config_original_path(key: str, dbt_config: Optional[dict] = None) -> str: 269 | """ 270 | Gets original (i.e. 
project_independent) path of `key` (can be anything, but refers 271 | to either dbt models or tests) from dbt_project.yml config file items 272 | 273 | Parameters 274 | ---------- 275 | key : str 276 | key of dbt_project.yml config file 277 | dbt_config : Optional[dict] 278 | dbt_project.yml config file 279 | Returns 280 | ------- 281 | path : str 282 | e.g. models 283 | """ 284 | if dbt_config is None: 285 | with open(DBT_PROJECT_FINAL_FILENAME) as f: 286 | dbt_config = yaml.load(f, Loader=yaml.FullLoader) 287 | 288 | return DBT_PROJECT_SEPARATOR.join( 289 | _get_dbt_config_key(key, dbt_config).split(DBT_PROJECT_SEPARATOR)[:-1] 290 | ) 291 | 292 | 293 | def adapt_core_entities(schema_project: str, entity_definition: str) -> Iterable[str]: 294 | """ 295 | Adapts core entities definition to point the schema Jinja statement to the 296 | project schema 297 | 298 | Parameters 299 | ---------- 300 | schema_project: str 301 | project schema 302 | entity_definition: str 303 | text for the entity definition 304 | 305 | Returns 306 | ------- 307 | output_lines : Iterable[str] 308 | transformed lines, can be used to file.writelines() 309 | """ 310 | output_lines = [] 311 | for line in entity_definition.split("\n"): 312 | line = re.sub( 313 | "%\s*set\s*schema\s*=\s*(.+%)", # pylint: disable=anomalous-backslash-in-string 314 | f"% set schema = '{schema_project}' %", 315 | line, 316 | ) 317 | output_lines.append(line + "\n") 318 | return output_lines 319 | -------------------------------------------------------------------------------- /dot/utils/connection_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Tuple, Iterable, Optional 3 | 4 | import psycopg2 as pg 5 | import sqlalchemy as sa 6 | from sqlalchemy import create_engine, MetaData 7 | from utils.configuration_utils import ( 8 | DbParamsConfigFile, 9 | DbParamsConnection, 10 | load_credentials, 11 | ) 12 | 13 | metadata: sa.sql.schema.MetaData = None 14 | 15 | 16 | def get_metadata() -> sa.sql.schema.MetaData: 17 | """ 18 | Gets the metadata local object to this module 19 | 20 | Returns 21 | ------- 22 | MetaData 23 | """ 24 | return metadata 25 | 26 | 27 | def refresh_db_metadata( 28 | engine: sa.engine.base.Engine, schema: str 29 | ) -> sa.sql.schema.MetaData: 30 | """ 31 | Refresh ddbb metadata. Slow operation! 
32 | Can be called from anywhere and refreshes the object local object to this module 33 | 34 | @param engine: engine for connection to database 35 | @return: updated database metadata 36 | """ 37 | global metadata 38 | metadata = MetaData(engine, schema=schema) 39 | metadata.reflect() 40 | return metadata 41 | 42 | 43 | def create_sqlalchemy_engine(db_credentials: dict) -> sa.engine.base.Engine: 44 | """ 45 | Default configs for creating engine 46 | 47 | Parameters 48 | ---------- 49 | db_credentials : dict 50 | db credentials dictiontay 51 | Returns 52 | ------- 53 | engine : 54 | SQL alchemy engine 55 | """ 56 | engine = create_engine( 57 | "postgresql://" 58 | + db_credentials["user"] 59 | + ":" 60 | + db_credentials["pass"] 61 | + "@" 62 | + db_credentials["host"] 63 | + ":" 64 | + str(db_credentials["port"]) 65 | + "/" 66 | + db_credentials["dbname"], 67 | paramstyle="format", 68 | executemany_mode="values", 69 | executemany_values_page_size=1000, 70 | executemany_batch_page_size=200, 71 | ) 72 | refresh_db_metadata(engine, db_credentials["schema"]) 73 | return engine 74 | 75 | 76 | def update_db_config_from_os(db_config): 77 | """Overrides password with environment variable if db_config has environment 78 | variable syntax 79 | 80 | Parameters 81 | ---------- 82 | db_config : json 83 | Raw Great expectations database config 84 | Returns 85 | ------- 86 | db_config : 87 | Great expectations database config file with environment variables inserted 88 | 89 | """ 90 | orig_pw = db_config["password"] 91 | env_var = ( 92 | orig_pw[2:-1] if (orig_pw.startswith("${") and orig_pw.endswith("}")) else None 93 | ) 94 | db_config["password"] = os.environ[env_var] if env_var else orig_pw 95 | return db_config 96 | 97 | 98 | def get_db_params_from_config( 99 | config_file: DbParamsConfigFile, 100 | connection_params: DbParamsConnection, 101 | project_id: str, 102 | ) -> Tuple[str, sa.engine.base.Engine, pg.extensions.connection]: 103 | """Parses dbt yaml file to get db credentials. Also substitutes environment 104 | variables. 105 | 106 | Parameters 107 | ---------- 108 | config_file: DbParamsConfigFile 109 | enum for dbt_project.yml config file 110 | connection_params: DbParamsConnection 111 | enum for connection 112 | project_id: str 113 | Project ID, eg 'Muso'. 
Must align with project_id in dot.projects 114 | 115 | Returns 116 | ------- 117 | schema : str 118 | Name of db schema in yaml file 119 | engine : sqlalchemy db connection 120 | sqlalchemy db connection 121 | connection : 122 | pg.connect 123 | """ 124 | if config_file == DbParamsConfigFile["dot_config.yml"]: 125 | db_credentials = load_credentials(project_id, connection_params) 126 | else: 127 | # since config_file is enum, this could only happen if a new value is added 128 | # but not implemented 129 | raise NotImplementedError(f"{config_file.name} is not implemented yet") 130 | 131 | conn = pg.connect( 132 | host=db_credentials["host"], 133 | user=db_credentials["user"], 134 | password=db_credentials["pass"], 135 | port=db_credentials["port"], 136 | dbname=db_credentials["dbname"], 137 | ) 138 | 139 | # Added to prevent timeout in self-tests due to locked select query transation 140 | conn.set_session(autocommit=True) 141 | 142 | schema = db_credentials["schema"] 143 | engine = create_sqlalchemy_engine(db_credentials) 144 | return schema, engine, conn 145 | 146 | 147 | def add_ge_schema_parameters( 148 | test_parameters: dict, 149 | project_id: str, 150 | schema_core: Optional[str] = None, 151 | schema_source: Optional[str] = None, 152 | ) -> dict: 153 | """ 154 | Regardless of the parameters for any of the GE tests in db, some extra parameters 155 | for config need to be added,e.g. schema in which the core models are stored 156 | 157 | Parameters 158 | ---------- 159 | test_parameters: dict 160 | json for the parameters of 1 test 161 | project_id: str 162 | Project ID, eg 'Muso'. Must align with project_id in dot.projects 163 | schema_core: Optional[str] 164 | schema in which the core models are stored 165 | if not informed, goes to db params to fetch the name 166 | schema_source: Optional[str] 167 | schema of the source data 168 | if not informed, goes to db params to fetch the name 169 | 170 | Returns 171 | ------- 172 | output : dict 173 | test parameters including extra 174 | """ 175 | if schema_core is None: 176 | schema_core, _, _ = get_db_params_from_config( 177 | DbParamsConfigFile["dot_config.yml"], 178 | DbParamsConnection["project_core"], 179 | project_id, 180 | ) 181 | 182 | if schema_source is None: 183 | schema_source, _, _ = get_db_params_from_config( 184 | DbParamsConfigFile["dot_config.yml"], 185 | DbParamsConnection["project"], 186 | project_id, 187 | ) 188 | 189 | return { 190 | **test_parameters, 191 | **{ 192 | "schema_core": schema_core, 193 | "schema_source": schema_source, 194 | }, 195 | } 196 | 197 | 198 | def remove_ge_schema_parameters(test_parameters: Iterable[dict]) -> Iterable[dict]: 199 | """ 200 | Remove extra paramets added by `add_ge_schema_parameters` so that test parameters 201 | correspond to the ones stored in the DOT configuration database 202 | 203 | Parameters 204 | ---------- 205 | test_parameters: Iterable[dict] 206 | list of json for the parameters of all GE tests, including extra parameters 207 | 208 | Returns 209 | ------- 210 | output : Iterable[dict] 211 | list of json for the parameters of all GE tests, excluding extra parameters 212 | """ 213 | return [ 214 | { 215 | k: v 216 | for k, v in tp.items() 217 | if k 218 | not in [ 219 | "schema_core", 220 | "schema_source", 221 | ] 222 | } 223 | for tp in test_parameters 224 | ] 225 | -------------------------------------------------------------------------------- /dot/utils/dbt_logs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility 
functions for dbt that read results from logs (i.e. dbt `target` 3 | directory) thus may have to be changed when upgrading dbt versions 4 | 5 | Could be better solved as a class with methods to read logs and get row but 6 | we have not used classes extensively in DOT for now 7 | """ 8 | import json 9 | from dataclasses import dataclass 10 | from utils.configuration_utils import _get_filename_safely 11 | from utils.configuration_utils import get_dbt_config_test_paths 12 | from utils.utils import get_short_test_name 13 | 14 | 15 | @dataclass 16 | class DbtOutputProcessedRow: 17 | unique_id: str 18 | test_type: str 19 | test_status: str 20 | test_message: str 21 | column_name: str 22 | entity_id: str 23 | test_parameters: str 24 | short_test_name: str 25 | 26 | 27 | def read_dbt_logs(target_path: str, suffix: str = "archive") -> dict: 28 | """ 29 | Read generated json files. Assumes the cleanup has run and most recent 30 | results are in _archive files 31 | TODO maybe dbt output _test.json is not needed 32 | 33 | Parameters 34 | ---------- 35 | target_path: str 36 | path of the dbt output dir (called target) 37 | suffix: str 38 | suffix to the dbt output files ("archive", copy of these files in a prev step) 39 | 40 | Returns 41 | ------- 42 | json 43 | the structure corresponds to dbt logs 44 | """ 45 | filename = _get_filename_safely(f"{target_path}/run_results_{suffix}.json") 46 | with open(filename) as f: 47 | dbt_results = json.load(f) 48 | # Manifest, see https://docs.getdbt.com/reference/artifacts/manifest-json 49 | filename = _get_filename_safely(f"{target_path}/manifest_{suffix}.json") 50 | with open(filename) as f: 51 | manifest = json.load(f) 52 | return [ 53 | {**i, **{"node": manifest["nodes"][i["unique_id"]]}} 54 | for i in dbt_results["results"] 55 | ] 56 | 57 | 58 | def _get_test_parameters(node: dict, test_type: str) -> str: 59 | """ 60 | Figures out test parameters from the dbt logs 61 | INTERNAL function, do not use 62 | 63 | Parameters 64 | ---------- 65 | node: dict 66 | json from dbt manifest corresponding to a test 67 | test_type: str 68 | test type e.g. 
"not_null", "custom_sql" 69 | 70 | Returns 71 | ------- 72 | str 73 | string for the structure of test parameters 74 | """ 75 | if test_type == "custom_sql": 76 | # Custom sql (dbt/tests/*.sql) tests do not have the same structure 77 | # and we have to get SQL from file 78 | with open("dbt/" + node["original_file_path"]) as f: 79 | return f.read() 80 | 81 | test_parameters = node.get("test_metadata", {}).get("kwargs", {}) 82 | 83 | # TODO figure out why and for which test types this is needed 84 | if "model" in test_parameters: 85 | del test_parameters["model"] 86 | if "column_name" in test_parameters: 87 | del test_parameters["column_name"] 88 | 89 | # Where clauses live under the config node 90 | where_clause = node.get("config", {}).get("where", {}) 91 | if where_clause is not None: 92 | test_parameters["where"] = where_clause 93 | 94 | return str(test_parameters) 95 | 96 | 97 | def _get_test_type(node): 98 | """ 99 | Figures out test type from the dbt logs 100 | INTERNAL function, do not use 101 | 102 | Parameters 103 | ---------- 104 | node: dict 105 | json from dbt manifest corresponding to a test 106 | 107 | Returns 108 | ------- 109 | str 110 | string for the test type 111 | """ 112 | test_type = node.get("test_metadata", {}).get("name") 113 | if test_type is None: 114 | # Custom sql (dbt/tests/*.sql) tests do not have the same structure 115 | if f"{get_dbt_config_test_paths()}/" in node.get("original_file_path", ""): 116 | test_type = "custom_sql" 117 | return test_type 118 | 119 | 120 | def process_dbt_logs_row(row: dict) -> dict: 121 | """ 122 | Figures out parameters from each of the tests of the dbt output rows 123 | 124 | Parameters 125 | ---------- 126 | row: dict 127 | json from dbt logs & manifest corresponding to a test 128 | 129 | Returns 130 | ------- 131 | str 132 | string for the test type 133 | """ 134 | unique_id = row["unique_id"] 135 | node = row["node"] 136 | test_type = _get_test_type(node) 137 | test_status = row["status"].lower() 138 | test_message = row["message"].lower() if row["message"] else "" 139 | 140 | column_name = node.get("column_name") 141 | entity_id = node["original_file_path"].split("/")[-1].split(".")[0] 142 | 143 | test_parameters = _get_test_parameters(node, test_type) 144 | 145 | # For custom sql tests the view name has "id_XX" at the end, needs to be stripped 146 | entity_id = entity_id.split("_id")[0] 147 | 148 | _, short_test_name = get_short_test_name(node) 149 | 150 | return DbtOutputProcessedRow( 151 | unique_id, 152 | test_type, 153 | test_status, 154 | test_message, 155 | column_name, 156 | entity_id, 157 | test_parameters, 158 | short_test_name, 159 | ) 160 | -------------------------------------------------------------------------------- /dot/utils/run_management.py: -------------------------------------------------------------------------------- 1 | """Package to contain helper functions for running the DOT Pipeline. Includes logging 2 | and catching exceptions. 3 | mjh. 
4 | """ 5 | import sys 6 | import os 7 | import traceback 8 | import datetime 9 | import pandas as pd 10 | from utils.connection_utils import create_sqlalchemy_engine 11 | from utils.configuration_management import ( 12 | generate_tests_from_db, 13 | generate_master_config_files, 14 | create_project_directories, 15 | ) 16 | from utils.utils import ( 17 | save_tests_to_db, 18 | get_test_rows, 19 | generate_dbt_test_coverage_report, 20 | set_summary_stats, 21 | ) 22 | from utils.dbt import ( 23 | run_dbt_core, 24 | archive_previous_dbt_results, 25 | create_failed_dbt_test_models, 26 | run_dbt_test, 27 | extract_df_from_dbt_test_results_json, 28 | ) 29 | from utils.great_expectations import run_ge_tests, extract_df_from_ge_test_results_csv 30 | from utils.configuration_utils import load_credentials, DbParamsConnection 31 | 32 | 33 | def run_dot_stages(project_id, logger, run_id): 34 | """Runs the full pipeline of DOT: 35 | - dbt tests 36 | - great expectation tests 37 | - report generation and save of results to the database 38 | 39 | Parameters 40 | ---------- 41 | project_id : str 42 | Project ID, eg 'Muso'. Must align with project_id in dot.projects 43 | logger : logger object 44 | The logger 45 | run_id: UUID 46 | Run ID, as provided by run_everything.py, set with ... 47 | run_id = run_id = uuid.uuid4() 48 | This UUID links results in dot.test_results with dot.run_log 49 | 50 | Also note that if environment variable DISABLE_TEST_GENERATION is set, 51 | the pipeline will not generate test files based on the contents of 52 | dot.configured_tests. 53 | This env variable is for testing purposes only, if using it you'll need to 54 | make sure test files exist in the correct places (see 55 | configuration_management.py for more details). 56 | 57 | Returns 58 | ------- 59 | No variables returned, but does update the run status in dot.run_log 60 | """ 61 | 62 | dbt_test_summary = pd.DataFrame() 63 | dbt_test_rows = pd.DataFrame() 64 | ge_test_summary = pd.DataFrame() 65 | ge_test_rows = pd.DataFrame() 66 | 67 | # Create any directories dot needs for outputs and configuration files 68 | create_project_directories(project_id, logger=logger) 69 | 70 | # Generate master config files 71 | generate_master_config_files(project_id, logger=logger) 72 | 73 | # Generate config files from DB 74 | if ( 75 | "DISABLE_TEST_GENERATION" not in os.environ 76 | ): # TODO if set, `dot_tests` will not exist and the rest will fail 77 | dot_tests = generate_tests_from_db(project_id=project_id, logger=logger) 78 | 79 | # ========================= preparation ============================ 80 | if not os.path.isdir(f"generated_files/{project_id}"): 81 | os.makedirs(f"generated_files/{project_id}") 82 | 83 | # ========================== DBT tests ============================= 84 | if "dbt" in list(dot_tests["library"]): 85 | run_dbt_core(project_id, logger) 86 | generate_dbt_test_coverage_report(project_id, logger) 87 | archive_previous_dbt_results(logger) 88 | create_failed_dbt_test_models(project_id, logger, "view") 89 | run_dbt_test(project_id, logger) 90 | # dbt.run_dbt_chv_tests(logger) 91 | 92 | # =========================== GE tests ============================= 93 | if "great_expectations" in list(dot_tests["library"]): 94 | run_ge_tests(project_id, logger) 95 | 96 | # ================= Extract tests from results files =============== 97 | if "dbt" in list(dot_tests["library"]): 98 | dbt_test_summary = extract_df_from_dbt_test_results_json( 99 | run_id, project_id, logger 100 | ) 101 | dbt_test_rows = 
get_test_rows(dbt_test_summary, run_id, project_id, logger) 102 | 103 | if "great_expectations" in list(dot_tests["library"]): 104 | ge_test_summary = extract_df_from_ge_test_results_csv( 105 | run_id, project_id, logger 106 | ) 107 | ge_test_rows = get_test_rows(ge_test_summary, run_id, project_id, logger) 108 | 109 | all_tests_summary = pd.concat([dbt_test_summary, ge_test_summary]) 110 | all_tests_rows = pd.concat([dbt_test_rows, ge_test_rows]) 111 | 112 | if all_tests_summary.shape[0] > 0: 113 | 114 | # ===== Populate summary stats for rows total, passed, failed ===== 115 | all_tests_summary = set_summary_stats(all_tests_summary, project_id, logger) 116 | 117 | # ========================= Save results ========================= 118 | # To flat file, useful for debugging 119 | all_tests_summary.to_excel(f"./generated_files/{project_id}/all_tests_summary.xlsx") 120 | all_tests_rows.to_excel(f"./generated_files/{project_id}/all_tests_rows.xlsx") 121 | 122 | # To DB 123 | save_tests_to_db(all_tests_rows, all_tests_summary, project_id, logger) 124 | 125 | logger.info( 126 | "Ping!!! ... DOT run " 127 | + str(run_id) 128 | + " complete for project " 129 | + str(project_id) 130 | + ". 😊" 131 | ) 132 | else: 133 | logger.info( 134 | "Ooops!!! ... DOT run " 135 | + str(run_id) 136 | + " or project " 137 | + str(project_id) 138 | + " has no test results." 139 | ) 140 | 141 | 142 | def run_dot_tests(project_id, logger, run_id): 143 | """Wrapper around the DOT pipeline which will set status, start and end 144 | times in dot.run_status. Also catches exception and updates dot.run_log 145 | to set status='Failed' 146 | 147 | Parameters 148 | ---------- 149 | project_id : str 150 | Project ID, eg 'Muso'. Must align with project_id in dot.projects 151 | logger : logger object 152 | The logger 153 | run_id: UUID 154 | The UUID for the current run, generated by the driver script. This UUID 155 | will be stored in dot.run_log as well as dot.test_results. 
156 | 157 | Returns 158 | ------- 159 | Nothing 160 | """ 161 | db_credentials = load_credentials(project_id, DbParamsConnection["dot"]) 162 | schema_dot = db_credentials["schema"] 163 | engine = create_sqlalchemy_engine(db_credentials) 164 | 165 | # Create our index 166 | run_start = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 167 | 168 | # Initialize our row of data to be input into status table 169 | status = pd.DataFrame( 170 | { 171 | "run_id": run_id, 172 | "project_id": project_id, 173 | "run_start": run_start, 174 | "run_finish": None, 175 | "run_status": "Running", 176 | "run_error": None, 177 | }, 178 | index=[0], 179 | ) 180 | logger.info(status) 181 | status.to_sql("run_log", engine, index=False, if_exists="append", schema=schema_dot) 182 | 183 | # Attempts the main function 184 | try: 185 | logger.info("Running tests for project_id: %s", project_id) 186 | 187 | # Run the dot_pipeline 188 | run_dot_stages(project_id, logger, run_id) 189 | 190 | # If no errors occur updates SQL table to say completed 191 | run_finish = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 192 | sql = ( 193 | "UPDATE " 194 | + schema_dot 195 | + ".run_log SET run_status = 'Finished', run_finish = '" 196 | + run_finish 197 | + "' WHERE run_id = '" 198 | + str(run_id) 199 | + "'" 200 | ) 201 | logger.info(sql) 202 | with engine.begin() as conn: 203 | conn.execute(sql) 204 | 205 | except Exception as e: 206 | # Make sure we get them logged 207 | err_block = "+++++++++++++++++++++++++++++++++ ERROR ++++++++++++++++++++++++++" 208 | logger.error(err_block) 209 | logger.error(sys.exc_info()) 210 | logger.error(err_block) 211 | 212 | error_string = str(sys.exc_info()) 213 | 214 | tb = sys.exc_info()[2] 215 | tb = traceback.format_tb(tb) 216 | for t in tb: 217 | logger.error(t) 218 | tb_str = t + "\n\n" 219 | error_string = error_string + tb_str 220 | 221 | error_string = error_string.replace('"', '""').replace("'", "''") 222 | 223 | # Run failed 224 | run_finish = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 225 | sql = ( 226 | "UPDATE " 227 | + schema_dot 228 | + ".run_log SET run_status = 'Failed', run_finish = '" 229 | + run_finish 230 | + "', run_error='" 231 | + error_string 232 | + "' WHERE run_id = '" 233 | + str(run_id) 234 | + "'" 235 | ) 236 | logger.info(sql) 237 | with engine.begin() as conn: 238 | conn.execute(sql) 239 | 240 | logger.info("Setting Feed_Status to ERROR") 241 | raise e 242 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | # A Docker environment is provided with DOT, but below you can also find 2 | # a simple conda environment. To use this ... 3 | # 4 | # 1. Install [miniconda](https://docs.conda.io/en/latest/miniconda.html) by selecting the installer that fits your OS version. Once it is installed you may have to restart your terminal (closing your terminal and opening again) 5 | # 2. In this directory, open terminal 6 | # 3. `conda env create -f environment.yml` 7 | # 4. 
`conda activate dot_conda_env` 8 | # 9 | name: dot_conda_env 10 | dependencies: 11 | - pip 12 | - python=3.8.10 13 | - pip: 14 | - -r dot/requirements_dot.txt -------------------------------------------------------------------------------- /images/db_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/images/db_schema.png -------------------------------------------------------------------------------- /images/dot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/images/dot.png -------------------------------------------------------------------------------- /images/dot_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datakind/Data-Observation-Toolkit/9c28925298e4b4a03e75a3b093d26efe13d8c3be/images/dot_logo.png -------------------------------------------------------------------------------- /lint.py: -------------------------------------------------------------------------------- 1 | """ runs lint on the project sources""" 2 | 3 | import argparse 4 | import logging 5 | import sys 6 | from pylint.lint import Run 7 | 8 | sys.path.append("./dot") 9 | 10 | 11 | logging.getLogger().setLevel(logging.INFO) 12 | 13 | parser = argparse.ArgumentParser(prog="LINT") 14 | 15 | parser.add_argument( 16 | "-p", 17 | "--path", 18 | help="path to directory you want to run pylint | " 19 | "Default: %(default)s | " 20 | "Type: %(type)s ", 21 | default="./src", 22 | type=str, 23 | ) 24 | 25 | parser.add_argument( 26 | "-t", 27 | "--threshold", 28 | help="score threshold to fail pylint runner | " 29 | "Default: %(default)s | " 30 | "Type: %(type)s ", 31 | default=7, 32 | type=float, 33 | ) 34 | 35 | args = parser.parse_args() 36 | PATH = str(args.path) 37 | threshold = float(args.threshold) 38 | 39 | logging.info(f"PyLint Starting | " "Path: {PATH} | " "Threshold: {threshold}") 40 | 41 | results = Run([PATH], do_exit=False) 42 | 43 | final_score = results.linter.stats.global_note 44 | 45 | if final_score < threshold: 46 | 47 | MESSAGE = ( 48 | "PyLint Failed | " 49 | "Score: {} | " 50 | "Threshold: {} ".format(final_score, threshold) 51 | ) 52 | 53 | logging.error(MESSAGE) 54 | sys.exit(1) 55 | 56 | MESSAGE = ( 57 | "PyLint Passed | " "Score: {} | " "Threshold: {} ".format(final_score, threshold) 58 | ) 59 | 60 | logging.info(MESSAGE) 61 | 62 | sys.exit(0) 63 | -------------------------------------------------------------------------------- /setup_hooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo $'#/bin/sh\nblack .\nfor f in `git diff --name-only | grep ".py"`; do python lint.py -p $f || exit 1; done; pytest dot/self_tests/unit; pytest dot/self_tests/integration'> .git/hooks/pre-commit 3 | chmod +x .git/hooks/pre-commit 4 | --------------------------------------------------------------------------------
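Addendum: the files above show that utils/run_management.py exposes run_dot_tests(project_id, logger, run_id) as the wrapper that logs a row in dot.run_log and then executes the full DOT pipeline, with run_everything.py acting as the driver that supplies a fresh run_id. The following is a minimal sketch of such a driver, not the project's actual run_everything.py: it assumes it is executed from the dot/ directory with a valid ./config/dot_config.yml in place, that "ScanProject1" exists in dot.projects, and that the logs/ path used for setup_custom_logger is writable.

    # driver_sketch.py -- hypothetical minimal DOT driver (not part of the repo)
    import logging
    import uuid

    from utils.utils import setup_custom_logger
    from utils.run_management import run_dot_tests

    # Project ID must match a row in dot.projects (assumption: "ScanProject1"
    # has been loaded, e.g. by the self-test schema scripts).
    project_id = "ScanProject1"

    # A fresh UUID links dot.run_log with dot.test_results for this run.
    run_id = uuid.uuid4()

    # Log file path is an assumption; any writable path works.
    logger = setup_custom_logger("logs/run_everything.log", logging.INFO)

    # Records the run as 'Running', executes dbt and great_expectations tests,
    # saves results, and marks the run 'Finished' or 'Failed'.
    run_dot_tests(project_id, logger, run_id)

If the run fails, run_dot_tests catches the exception, writes the traceback into dot.run_log.run_error, and re-raises, so the sketch above surfaces errors both in the database and on the console.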