├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── defog_data ├── __init__.py ├── academic │ ├── academic.json │ └── academic.sql ├── advising │ ├── advising.json │ └── advising.sql ├── atis │ ├── atis.json │ └── atis.sql ├── broker │ ├── broker.json │ └── broker.sql ├── car_dealership │ ├── car_dealership.json │ └── car_dealership.sql ├── derm_treatment │ ├── derm_treatment.json │ └── derm_treatment.sql ├── ewallet │ ├── ewallet.json │ └── ewallet.sql ├── geography │ ├── geography.json │ └── geography.sql ├── metadata.py ├── restaurants │ ├── restaurants.json │ └── restaurants.sql ├── scholar │ ├── scholar.json │ └── scholar.sql ├── supplementary.py └── yelp │ ├── yelp.json │ └── yelp.sql ├── requirements.txt ├── setup.py ├── setup.sh ├── setup_snowflake.sh ├── tests.py ├── translate_ddl_dialect.py └── utils_dialects.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v3 11 | 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install pip dependencies 17 | run: | 18 | pip install -r requirements.txt 19 | pip install pytest 20 | - name: Run tests 21 | run: pytest tests.py 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | defog_data.egg-info/ 3 | __pycache__/ 4 | .pytest_cache/ 5 | build/ 6 | dist/ 7 | 8 | # IDE 9 | .vscode/ 10 | 11 | # BigQuery, MySQL, SQLite and TSQL files 12 | *_bigquery.sql 13 | *_mysql.sql 14 | *_sqlite.sql 15 | *_tsql.sql 16 | sqlite_dbs/ -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Defog Data 2 | 3 | [![Build Status](https://github.com/defog-ai/defog-data/actions/workflows/main.yml/badge.svg)](https://github.com/defog-ai/defog-data/actions/workflows/main.yml) 4 | 5 | This repository contains the metadata and data of different databases that we require for various purposes, specifically testing. 6 | 7 | ## Usage 8 | 9 | ### Data Import 10 | 11 | To import the data into your database, you can use the `setup.sh` script. The script will drop and recreate the database if already present, so be careful when using it. This has the desired effect of idempotency, ie running it multiple times will result in the same state. 
Do remember to set the following environment variables to suit your postgres connection before running the script: 12 | 13 | ```sh 14 | export DBPASSWORD="postgres" 15 | export DBUSER="postgres" 16 | export DBHOST="localhost" 17 | export DBPORT=5432 18 | ./setup.sh 19 | ``` 20 | 21 | #### Snowflake 22 | 23 | To set up the data in snowflake, you would need to have the snowflake cli installed ([instructions](https://docs.snowflake.com/en/user-guide/snowsql-install-config)), and have your credentials configured as per the [docs](https://docs.snowflake.com/en/user-guide/snowsql-config). You can then run the following command to setup the data: 24 | 25 | ```sh 26 | ./setup_snowflake.sh 27 | ``` 28 | 29 | This will create 1 database per database in the repo as before, using `public` as the default schema. 30 | 31 | Note that the same sql files work for both the postgres and snowflake databases, so you can use the same sql files to setup both databases. 32 | 33 | #### BigQuery, MySQL, SQLite, SQL Server 34 | 35 | To set up the data in these systems, you would need your credentials to be configured in `utils_dialects`. You can then run the following command to set up the databases: 36 | 37 | ``` 38 | python translate_ddl_dialect.py 39 | ``` 40 | 41 | This will create one new SQL file per database per dialect. 42 | For SQLite, the `.db` files will be saved in the folder `sqlite_dbs`. 43 | Note that BigQuery, MySQL and SQLite do not support schemas and hence the SQL files will be modified to skip schema creation. 44 | 45 | If you only want to translate `defog_data` for specific dialects, you can add the `--dialects` argument to the command above. For example, the following command will translate the data for SQLite only 46 | 47 | ``` 48 | python translate_ddl_dialect.py --dialects sqlite 49 | ``` 50 | 51 | ### Python Library 52 | 53 | This is the recommended way to access the schema from the json files in a python environment. 
To use the python library in your code, navigate to this repository and install it using pip: 54 | 55 | ```sh 56 | pip install -r requirements.txt # get dependencies 57 | pip install -e . 58 | ``` 59 | 60 | The `-e` allows us to edit the code in place, ie if we make changes to the code, we don't have to reinstall the package. 61 | 62 | #### Metadata 63 | 64 | Once you have it installed, you can access the json metadata of each database as a dictionary using the following code: 65 | 66 | ```python 67 | import defog_data.metadata as md 68 | 69 | md.academic 70 | # {'table_metadata': {'cite': [{'data_type': 'bigint', 71 | # 'column_name': 'cited', 72 | # 'column_description': ['ID of the publication being cited']}, 73 | # ... 74 | ``` 75 | 76 | #### Supplementary 77 | 78 | We also have joinable columns, split by database in [supplementary.py](defog_data/supplementary.py). To access them, use the following code: 79 | 80 | ```python 81 | import defog_data.supplementary as sup 82 | 83 | # columns that can be joined on 84 | sup.columns_join 85 | ``` 86 | 87 | ## Organization 88 | 89 | ### Databases 90 | 91 | Each database (eg `academic`) is organized in a folder with the following structure: 92 | 93 | ```sh 94 | academic 95 | ├── academic.json 96 | └── academic.sql 97 | ``` 98 | 99 | The json contains the metadata of the database along with the column and table descriptions, while the sql file contains the dump of the database (metadata + data). This is to facilitate easier importing of the data into a database, without worrying about the sequence of inserts, especially foreign key constraints which require the primary key from the parent table to be present before inserting into the child table. 100 | 101 | ## Testing 102 | 103 | The test in `tests.py` just ensures that we are able to access the respective metadata for each table in each database. 
To run the tests, use the following command: 104 | 105 | ```sh 106 | python -m unittest tests.py 107 | ``` 108 | 109 | ## Release 110 | 111 | To build for release, first bump the version in [setup.py](setup.py) and then run the following commands: 112 | 113 | ```sh 114 | python setup.py sdist bdist_wheel 115 | twine upload dist/defog* 116 | ``` 117 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/defog-ai/defog-data/856295d8f0aa8a0b0fb71b9623e86f363469797a/__init__.py -------------------------------------------------------------------------------- /defog_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/defog-ai/defog-data/856295d8f0aa8a0b0fb71b9623e86f363469797a/defog_data/__init__.py -------------------------------------------------------------------------------- /defog_data/academic/academic.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "cite": [ 4 | { 5 | "data_type": "bigint", 6 | "column_name": "cited", 7 | "column_description": "ID of the publication being cited" 8 | }, 9 | { 10 | "data_type": "bigint", 11 | "column_name": "citing", 12 | "column_description": "ID of the publication that is citing another publication" 13 | } 14 | ], 15 | "author": [ 16 | { 17 | "data_type": "bigint", 18 | "column_name": "aid", 19 | "column_description": "Unique identifier for each author" 20 | }, 21 | { 22 | "data_type": "bigint", 23 | "column_name": "oid", 24 | "column_description": "Foreign key referencing the organization the author belongs to" 25 | }, 26 | { 27 | "data_type": "text", 28 | "column_name": "homepage", 29 | "column_description": "URL of the author's personal website" 30 | }, 31 | { 32 | "data_type": "text", 33 | "column_name": "name", 34 | 
"column_description": "Name of the author" 35 | } 36 | ], 37 | "domain": [ 38 | { 39 | "data_type": "bigint", 40 | "column_name": "did", 41 | "column_description": "Unique identifier for a domain" 42 | }, 43 | { 44 | "data_type": "text", 45 | "column_name": "name", 46 | "column_description": "Name of the domain" 47 | } 48 | ], 49 | "writes": [ 50 | { 51 | "data_type": "bigint", 52 | "column_name": "aid", 53 | "column_description": "Foreign key referencing the author table's primary key" 54 | }, 55 | { 56 | "data_type": "bigint", 57 | "column_name": "pid", 58 | "column_description": "Foreign key referencing the publication table's primary key" 59 | } 60 | ], 61 | "journal": [ 62 | { 63 | "data_type": "bigint", 64 | "column_name": "jid", 65 | "column_description": "Unique identifier for a journal" 66 | }, 67 | { 68 | "data_type": "text", 69 | "column_name": "homepage", 70 | "column_description": "The homepage URL for the journal" 71 | }, 72 | { 73 | "data_type": "text", 74 | "column_name": "name", 75 | "column_description": "The name of the journal" 76 | } 77 | ], 78 | "keyword": [ 79 | { 80 | "data_type": "bigint", 81 | "column_name": "kid", 82 | "column_description": "Unique identifier for a keyword" 83 | }, 84 | { 85 | "data_type": "text", 86 | "column_name": "keyword", 87 | "column_description": "The actual keyword" 88 | } 89 | ], 90 | "conference": [ 91 | { 92 | "data_type": "bigint", 93 | "column_name": "cid", 94 | "column_description": "Unique identifier for a conference" 95 | }, 96 | { 97 | "data_type": "text", 98 | "column_name": "homepage", 99 | "column_description": "The homepage URL for the conference" 100 | }, 101 | { 102 | "data_type": "text", 103 | "column_name": "name", 104 | "column_description": "The name of the conference" 105 | } 106 | ], 107 | "publication": [ 108 | { 109 | "data_type": "bigint", 110 | "column_name": "year", 111 | "column_description": "The year of publication" 112 | }, 113 | { 114 | "data_type": "bigint", 115 | "column_name": 
"cid", 116 | "column_description": "The ID of the conference where the publication was presented" 117 | }, 118 | { 119 | "data_type": "bigint", 120 | "column_name": "citation_num", 121 | "column_description": "The number of citations received by the publication" 122 | }, 123 | { 124 | "data_type": "bigint", 125 | "column_name": "jid", 126 | "column_description": "The ID of the journal where the publication was published" 127 | }, 128 | { 129 | "data_type": "bigint", 130 | "column_name": "pid", 131 | "column_description": "The unique ID of the publication" 132 | }, 133 | { 134 | "data_type": "bigint", 135 | "column_name": "reference_num", 136 | "column_description": "The number of references cited by the publication" 137 | }, 138 | { 139 | "data_type": "text", 140 | "column_name": "title", 141 | "column_description": "The title of the publication" 142 | }, 143 | { 144 | "data_type": "text", 145 | "column_name": "abstract", 146 | "column_description": "The abstract of the publication" 147 | } 148 | ], 149 | "organization": [ 150 | { 151 | "data_type": "bigint", 152 | "column_name": "oid", 153 | "column_description": "Unique identifier for the organization" 154 | }, 155 | { 156 | "data_type": "text", 157 | "column_name": "continent", 158 | "column_description": "Continent where the organization is located" 159 | }, 160 | { 161 | "data_type": "text", 162 | "column_name": "homepage", 163 | "column_description": "URL of the organization's homepage" 164 | }, 165 | { 166 | "data_type": "text", 167 | "column_name": "name", 168 | "column_description": "Name of the organization" 169 | } 170 | ], 171 | "domain_author": [ 172 | { 173 | "data_type": "bigint", 174 | "column_name": "aid", 175 | "column_description": "Foreign key referencing the author table's primary key" 176 | }, 177 | { 178 | "data_type": "bigint", 179 | "column_name": "did", 180 | "column_description": "Foreign key referencing the domain table's primary key" 181 | } 182 | ], 183 | "domain_journal": [ 184 | { 
185 | "data_type": "bigint", 186 | "column_name": "did", 187 | "column_description": "Foreign key referencing the domain table's primary key" 188 | }, 189 | { 190 | "data_type": "bigint", 191 | "column_name": "jid", 192 | "column_description": "Foreign key referencing the journal table's primary key" 193 | } 194 | ], 195 | "domain_keyword": [ 196 | { 197 | "data_type": "bigint", 198 | "column_name": "did", 199 | "column_description": "Foreign key referencing the 'did' column of the 'domain' table" 200 | }, 201 | { 202 | "data_type": "bigint", 203 | "column_name": "kid", 204 | "column_description": "Foreign key referencing the 'kid' column of the 'keyword' table" 205 | } 206 | ], 207 | "domain_conference": [ 208 | { 209 | "data_type": "bigint", 210 | "column_name": "cid", 211 | "column_description": "Foreign key referencing the cid column in the conference table" 212 | }, 213 | { 214 | "data_type": "bigint", 215 | "column_name": "did", 216 | "column_description": "Foreign key referencing the did column in the domain table" 217 | } 218 | ], 219 | "domain_publication": [ 220 | { 221 | "data_type": "bigint", 222 | "column_name": "did", 223 | "column_description": "Foreign key referencing the domain table's primary key column (did)" 224 | }, 225 | { 226 | "data_type": "bigint", 227 | "column_name": "pid", 228 | "column_description": "Foreign key referencing the publication table's primary key column (pid)" 229 | } 230 | ], 231 | "publication_keyword": [ 232 | { 233 | "data_type": "bigint", 234 | "column_name": "pid", 235 | "column_description": "Foreign key referencing the publication table's primary key (pid)" 236 | }, 237 | { 238 | "data_type": "bigint", 239 | "column_name": "kid", 240 | "column_description": "Foreign key referencing the keyword table's primary key (kid)" 241 | } 242 | ] 243 | }, 244 | "glossary": "" 245 | } -------------------------------------------------------------------------------- /defog_data/academic/academic.sql: 
-------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE public.author ( 3 | aid bigint NOT NULL, 4 | homepage text, 5 | name text, 6 | oid bigint 7 | ); 8 | 9 | 10 | CREATE TABLE public.cite ( 11 | cited bigint, 12 | citing bigint 13 | ); 14 | 15 | 16 | CREATE TABLE public.conference ( 17 | cid bigint NOT NULL, 18 | homepage text, 19 | name text 20 | ); 21 | 22 | 23 | CREATE TABLE public.domain ( 24 | did bigint NOT NULL, 25 | name text 26 | ); 27 | 28 | 29 | 30 | CREATE TABLE public.domain_author ( 31 | aid bigint NOT NULL, 32 | did bigint NOT NULL 33 | ); 34 | 35 | 36 | CREATE TABLE public.domain_conference ( 37 | cid bigint NOT NULL, 38 | did bigint NOT NULL 39 | ); 40 | 41 | 42 | CREATE TABLE public.domain_journal ( 43 | did bigint NOT NULL, 44 | jid bigint NOT NULL 45 | ); 46 | 47 | 48 | CREATE TABLE public.domain_keyword ( 49 | did bigint NOT NULL, 50 | kid bigint NOT NULL 51 | ); 52 | 53 | 54 | CREATE TABLE public.domain_publication ( 55 | did bigint NOT NULL, 56 | pid bigint NOT NULL 57 | ); 58 | 59 | 60 | 61 | CREATE TABLE public.journal ( 62 | homepage text, 63 | jid bigint NOT NULL, 64 | name text 65 | ); 66 | 67 | 68 | CREATE TABLE public.keyword ( 69 | keyword text, 70 | kid bigint NOT NULL 71 | ); 72 | 73 | 74 | CREATE TABLE public.organization ( 75 | continent text, 76 | homepage text, 77 | name text, 78 | oid bigint NOT NULL 79 | ); 80 | 81 | 82 | CREATE TABLE public.publication ( 83 | abstract text, 84 | cid bigint, 85 | citation_num bigint, 86 | jid bigint, 87 | pid bigint NOT NULL, 88 | reference_num bigint, 89 | title text, 90 | year bigint 91 | ); 92 | 93 | 94 | CREATE TABLE public.publication_keyword ( 95 | pid bigint NOT NULL, 96 | kid bigint NOT NULL 97 | ); 98 | 99 | 100 | 101 | CREATE TABLE public.writes ( 102 | aid bigint NOT NULL, 103 | pid bigint NOT NULL 104 | ); 105 | 106 | 107 | INSERT INTO public.author (aid, homepage, name, oid) VALUES 108 | (1, 'www.larry.com', 'Larry Summers', 2), 109 
| (2, 'www.ashish.com', 'Ashish Vaswani', 3), 110 | (3, 'www.noam.com', 'Noam Shazeer', 3), 111 | (4, 'www.martin.com', 'Martin Odersky', 4), 112 | (5, NULL, 'Kempinski', NULL) 113 | ; 114 | 115 | 116 | INSERT INTO public.cite (cited, citing) VALUES 117 | (1, 2), 118 | (1, 3), 119 | (1, 4), 120 | (1, 5), 121 | (2, 3), 122 | (2, 5), 123 | (3, 4), 124 | (3, 5), 125 | (4, 5) 126 | ; 127 | 128 | 129 | INSERT INTO public.conference (cid, homepage, name) VALUES 130 | (1, 'www.isa.com', 'ISA'), 131 | (2, 'www.aaas.com', 'AAAS'), 132 | (3, 'www.icml.com', 'ICML') 133 | ; 134 | 135 | 136 | INSERT INTO public.domain (did, name) VALUES 137 | (1, 'Data Science'), 138 | (2, 'Natural Sciences'), 139 | (3, 'Computer Science'), 140 | (4, 'Sociology'), 141 | (5, 'Machine Learning') 142 | ; 143 | 144 | 145 | INSERT INTO public.domain_author (aid, did) VALUES 146 | (1, 2), 147 | (1, 4), 148 | (2, 3), 149 | (2, 1), 150 | (2, 5), 151 | (3, 5), 152 | (3, 3), 153 | (4, 3) 154 | ; 155 | 156 | 157 | INSERT INTO public.domain_conference (cid, did) VALUES 158 | (1, 2), 159 | (2, 4), 160 | (3, 5) 161 | ; 162 | 163 | 164 | INSERT INTO public.domain_journal (did, jid) VALUES 165 | (1, 2), 166 | (2, 3), 167 | (5, 4) 168 | ; 169 | 170 | 171 | INSERT INTO public.domain_keyword (did, kid) VALUES 172 | (1, 2), 173 | (2, 3) 174 | ; 175 | 176 | 177 | INSERT INTO public.domain_publication (did, pid) VALUES 178 | (4, 1), 179 | (2, 2), 180 | (1, 3), 181 | (3, 4), 182 | (3, 5), 183 | (5, 5) 184 | ; 185 | 186 | 187 | INSERT INTO public.journal (homepage, jid, name) VALUES 188 | ('www.aijournal.com', 1, 'Journal of Artificial Intelligence Research'), 189 | ('www.nature.com', 2, 'Nature'), 190 | ('www.science.com', 3, 'Science'), 191 | ('www.ml.com', 4, 'Journal of Machine Learning Research') 192 | ; 193 | 194 | 195 | INSERT INTO public.keyword (keyword, kid) VALUES 196 | ('AI', 1), 197 | ('Neuroscience', 2), 198 | ('Machine Learning', 3), 199 | ('Keyword 4', 4) 200 | ; 201 | 202 | 203 | INSERT INTO 
public.organization (continent, homepage, name, oid) VALUES 204 | ('Asia', 'www.organization1.com', 'Organization 1', 1), 205 | ('North America', 'www.organization2.com', 'Organization 2', 2), 206 | ('North America', 'www.organization3.com', 'Organization 3', 3), 207 | ('Europe', 'www.epfl.com', 'École Polytechnique Fédérale de Lausanne 4', 4), 208 | ('Europe', 'www.organization5.com', 'Organization 5', 5) 209 | ; 210 | 211 | 212 | INSERT INTO public.publication (abstract, cid, citation_num, jid, pid, reference_num, title, year) VALUES 213 | ('Abstract 1', 1, 4, 1, 1, 0, 'The Effects of Climate Change on Agriculture', 2020), 214 | ('Abstract 2', 2, 2, 2, 2, 1, 'A Study on the Effects of Social Media on Mental Health', 2020), 215 | ('Abstract 3', 3, 2, 2, 3, 2, 'Data Mining Techniques', 2021), 216 | ('Abstract 4', 3, 1, 2, 4, 2, 'Optimizing GPU Throughput', 2021), 217 | ('Abstract 5', 3, 0, 4, 5, 4, 'Attention is all you need', 2021) 218 | ; 219 | 220 | 221 | INSERT INTO public.publication_keyword (pid, kid) VALUES 222 | (1, 2), 223 | (2, 3) 224 | ; 225 | 226 | 227 | INSERT INTO public.writes (aid, pid) VALUES 228 | (1, 1), 229 | (1, 2), 230 | (2, 3), 231 | (2, 4), 232 | (2, 5), 233 | (3, 5) 234 | ; 235 | -------------------------------------------------------------------------------- /defog_data/advising/advising.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE public.area ( 2 | course_id bigint, 3 | area text 4 | ); 5 | 6 | 7 | CREATE TABLE public.comment_instructor ( 8 | instructor_id bigint DEFAULT '0'::bigint NOT NULL, 9 | student_id bigint DEFAULT '0'::bigint NOT NULL, 10 | score bigint, 11 | comment_text text 12 | ); 13 | 14 | 15 | CREATE TABLE public.course ( 16 | course_id bigint DEFAULT '0'::bigint NOT NULL, 17 | name text, 18 | department text, 19 | number text, 20 | credits text, 21 | advisory_requirement text, 22 | enforced_requirement text, 23 | description text, 24 | num_semesters bigint, 25 | 
num_enrolled bigint, 26 | has_discussion boolean, 27 | has_lab boolean, 28 | has_projects boolean, 29 | has_exams boolean, 30 | num_reviews bigint, 31 | clarity_score bigint, 32 | easiness_score bigint, 33 | helpfulness_score bigint 34 | ); 35 | 36 | 37 | CREATE TABLE public.course_offering ( 38 | offering_id bigint DEFAULT '0'::bigint NOT NULL, 39 | course_id bigint, 40 | semester bigint, 41 | section_number bigint, 42 | start_time time, 43 | end_time time, 44 | monday text, 45 | tuesday text, 46 | wednesday text, 47 | thursday text, 48 | friday text, 49 | saturday text, 50 | sunday text, 51 | has_final_project boolean, 52 | has_final_exam boolean, 53 | textbook text, 54 | class_address text, 55 | allow_audit text DEFAULT 'false'::text 56 | ); 57 | 58 | 59 | CREATE TABLE public.course_prerequisite ( 60 | pre_course_id bigint NOT NULL, 61 | course_id bigint NOT NULL 62 | ); 63 | 64 | 65 | CREATE TABLE public.course_tags_count ( 66 | course_id bigint DEFAULT '0'::bigint NOT NULL, 67 | clear_grading bigint DEFAULT '0'::bigint, 68 | pop_quiz bigint DEFAULT '0'::bigint, 69 | group_projects bigint DEFAULT '0'::bigint, 70 | inspirational bigint DEFAULT '0'::bigint, 71 | long_lectures bigint DEFAULT '0'::bigint, 72 | extra_credit bigint DEFAULT '0'::bigint, 73 | few_tests bigint DEFAULT '0'::bigint, 74 | good_feedback bigint DEFAULT '0'::bigint, 75 | tough_tests bigint DEFAULT '0'::bigint, 76 | heavy_papers bigint DEFAULT '0'::bigint, 77 | cares_for_students bigint DEFAULT '0'::bigint, 78 | heavy_assignments bigint DEFAULT '0'::bigint, 79 | respected bigint DEFAULT '0'::bigint, 80 | participation bigint DEFAULT '0'::bigint, 81 | heavy_reading bigint DEFAULT '0'::bigint, 82 | tough_grader bigint DEFAULT '0'::bigint, 83 | hilarious bigint DEFAULT '0'::bigint, 84 | would_take_again bigint DEFAULT '0'::bigint, 85 | good_lecture bigint DEFAULT '0'::bigint, 86 | no_skip bigint DEFAULT '0'::bigint 87 | ); 88 | 89 | 90 | CREATE TABLE public.instructor ( 91 | instructor_id bigint 
DEFAULT '0'::bigint NOT NULL, 92 | name text, 93 | uniqname text 94 | ); 95 | 96 | 97 | CREATE TABLE public.offering_instructor ( 98 | offering_instructor_id bigint DEFAULT '0'::bigint NOT NULL, 99 | offering_id bigint, 100 | instructor_id bigint 101 | ); 102 | 103 | 104 | CREATE TABLE public.program ( 105 | program_id bigint NOT NULL, 106 | name text, 107 | college text, 108 | introduction text 109 | ); 110 | 111 | 112 | CREATE TABLE public.program_course ( 113 | program_id bigint DEFAULT '0'::bigint NOT NULL, 114 | course_id bigint DEFAULT '0'::bigint NOT NULL, 115 | workload bigint, 116 | category text DEFAULT ''::text NOT NULL 117 | ); 118 | 119 | 120 | CREATE TABLE public.program_requirement ( 121 | program_id bigint NOT NULL, 122 | category text NOT NULL, 123 | min_credit bigint, 124 | additional_req text 125 | ); 126 | 127 | 128 | CREATE TABLE public.semester ( 129 | semester_id bigint NOT NULL, 130 | semester text, 131 | year bigint 132 | ); 133 | 134 | 135 | CREATE TABLE public.student ( 136 | student_id bigint NOT NULL, 137 | lastname text, 138 | firstname text, 139 | program_id bigint, 140 | declare_major text, 141 | total_credit bigint, 142 | total_gpa numeric, 143 | entered_as text DEFAULT 'firstyear'::text, 144 | admit_term date, 145 | predicted_graduation_semester date, 146 | degree text, 147 | minor text, 148 | internship text 149 | ); 150 | 151 | 152 | CREATE TABLE public.student_record ( 153 | student_id bigint NOT NULL, 154 | course_id bigint NOT NULL, 155 | semester bigint NOT NULL, 156 | grade text, 157 | how text, 158 | transfer_source text, 159 | earn_credit text DEFAULT 'y'::text NOT NULL, 160 | repeat_term text, 161 | test_id text, 162 | offering_id bigint 163 | ); 164 | 165 | 166 | INSERT INTO public.area (course_id, area) VALUES 167 | (1, 'Computer Science'), 168 | (2, 'Mathematics'), 169 | (3, 'Physics'), 170 | (4, 'Computer Science') 171 | ; 172 | 173 | INSERT INTO public.comment_instructor (instructor_id, student_id, score, 
comment_text) VALUES 174 | (1, 1, 5, 'John Smith is a great instructor.'), 175 | (2, 2, 4, 'Jane Doe explains concepts clearly.') 176 | ; 177 | 178 | INSERT INTO public.course (course_id, name, department, number, credits, advisory_requirement, enforced_requirement, description, num_semesters, num_enrolled, has_discussion, has_lab, has_projects, has_exams, num_reviews, clarity_score, easiness_score, helpfulness_score) VALUES 179 | (1, 'Introduction to Computer Science', 'Computer Science', 'CS101', '3', NULL, NULL, 'This course introduces the basics of computer science.', 2, 3, true, false, true, false, 10, 5, 3, 4), 180 | (2, 'Advanced Calculus', 'Mathematics', 'MATH201', '4', 'CS101', NULL, 'This course covers advanced topics in calculus.', 1, 5, false, false, true, true, 5, 4, 2, 3), 181 | (3, 'Introduction to Physics', 'Physics', 'PHYS101', '3', NULL, 'MATH201', 'This course provides an introduction to physics principles.', 2, 1, true, true, true, true, 8, 4, 3, 5), 182 | (4, 'Distributed Databases', 'Computer Science', 'CS302', '3', NULL, 'CS101', 'This course provides an introduction to distributed databases.', 2, 3, true, true, false, true, 4, 2, 1, 5) 183 | ; 184 | 185 | INSERT INTO public.course_offering (offering_id, course_id, semester, section_number, start_time, end_time, monday, tuesday, wednesday, thursday, friday, saturday, sunday, has_final_project, has_final_exam, textbook, class_address, allow_audit) VALUES 186 | (1, 1, 1, 1, '08:00:00', '10:00:00', 'John Smith', NULL, NULL, 'Jane Doe', NULL, NULL, NULL, true, false, 'Introduction to Computer Science', '123 Main St', 'true'), 187 | (2, 2, 1, 1, '10:00:00', '12:00:00', NULL, NULL, 'Gilbert Strang', NULL, NULL, NULL, NULL, true, true, 'Advanced Calculus', '456 Elm St', 'false'), 188 | (3, 3, 2, 1, '08:00:00', '10:00:00', 'John Smith', NULL, NULL, 'Jane Doe', NULL, NULL, NULL, false, true, 'Introduction to Physics', '789 Oak St', 'true'), 189 | (4, 4, 2, 1, '16:00:00', '18:00:00', NULL, NULL, 'John 
Smith', 'Brendan Burns', NULL, NULL, NULL, false, true, 'Distributed Systems', '789 Oak St', 'true'), 190 | (5, 1, 3, 1, '08:00:00', '10:00:00', NULL, 'John Smith', 'Jane Doe', NULL, NULL, NULL, NULL, true, false, 'Introduction to Computer Science', '123 Main St', 'true'), 191 | (6, 2, 3, 1, '10:00:00', '12:00:00', 'Gilbert Strang', NULL, NULL, NULL, NULL, NULL, NULL, true, true, 'Advanced Calculus', '456 Elm St', 'false'), 192 | (7, 3, 4, 1, '14:00:00', '16:00:00', NULL, NULL, 'Jane Doe', NULL, 'John Smith', NULL, NULL, false, true, 'Introduction to Physics', '789 Oak St', 'true'), 193 | (8, 4, 4, 1, '16:00:00', '18:00:00', NULL, NULL, 'John Smith', NULL, 'Brendan Burns', NULL, NULL, false, true, 'Distributed Systems', '789 Oak St', 'true') 194 | ; 195 | 196 | INSERT INTO public.course_prerequisite (pre_course_id, course_id) VALUES 197 | (1, 2), 198 | (2, 3) 199 | ; 200 | 201 | INSERT INTO public.course_tags_count (course_id, clear_grading, pop_quiz, group_projects, inspirational, long_lectures, extra_credit, few_tests, good_feedback, tough_tests, heavy_papers, cares_for_students, heavy_assignments, respected, participation, heavy_reading, tough_grader, hilarious, would_take_again, good_lecture, no_skip) VALUES 202 | (1, 5, 2, 3, 4, 2, 1, 3, 4, 2, 1, 5, 3, 4, 2, 1, 5, 3, 4, 2, NULL), 203 | (2, 4, 1, 2, 3, 1, 2, 2, 3, 1, 2, 4, 2, 3, 1, 2, 4, 2, 3, 1, NULL), 204 | (3, 3, 2, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 2, 3, 1, 3, 1, 2, 3, NULL), 205 | (4, 2, 3, 0, 2, 3, 1, 1, 2, 3, 0, 3, 4, 2, 3, 5, 3, 1, 2, 3, NULL) 206 | ; 207 | 208 | 209 | INSERT INTO public.instructor (instructor_id, name, uniqname) VALUES 210 | (1, 'John Smith', 'jsmith'), 211 | (2, 'Jane Doe', 'jdoe'), 212 | (3, 'Gilbert Strang', 'gstrang'), 213 | (4, 'Brendan Burns', 'bburns') 214 | ; 215 | 216 | INSERT INTO public.offering_instructor (offering_instructor_id, offering_id, instructor_id) VALUES 217 | (1, 1, 1), 218 | (2, 1, 2), 219 | (3, 2, 3), 220 | (4, 3, 1), 221 | (5, 3, 2), 222 | (6, 4, 1), 223 | (7, 4, 
4), 224 | (8, 5, 1), 225 | (9, 5, 2), 226 | (10, 6, 3), 227 | (11, 7, 2), 228 | (12, 7, 1), 229 | (13, 8, 1), 230 | (14, 8, 4) 231 | ; 232 | 233 | INSERT INTO public.program (program_id, name, college, introduction) VALUES 234 | (1, 'Computer Science', 'Engineering', 'This program focuses on computer science principles and applications.'), 235 | (2, 'Mathematics', 'Arts and Sciences', 'This program provides a comprehensive study of mathematical concepts and theories.'), 236 | (3, 'Physics', 'Arts and Sciences', 'This program explores the fundamental principles of physics and their applications.') 237 | ; 238 | 239 | INSERT INTO public.program_course (program_id, course_id, workload, category) VALUES 240 | (1, 1, 100, 'Core'), 241 | (1, 4, 80, 'Elective'), 242 | (2, 2, 90, 'Core'), 243 | (3, 3, 70, 'Core') 244 | ; 245 | 246 | INSERT INTO public.program_requirement (program_id, category, min_credit, additional_req) VALUES 247 | (1, 'Core', 120, NULL), 248 | (2, 'Core', 90, NULL), 249 | (3, 'Core', 200, NULL) 250 | ; 251 | 252 | INSERT INTO public.semester (semester_id, semester, year) VALUES 253 | (1, 'Fall', 2020), 254 | (2, 'Spring', 2021), 255 | (3, 'Summer', 2021), 256 | (4, 'Fall', 2021) 257 | ; 258 | 259 | INSERT INTO public.student (student_id, lastname, firstname, program_id, declare_major, total_credit, total_gpa, entered_as, admit_term, predicted_graduation_semester, degree, minor, internship) VALUES 260 | (1, 'Smith', 'John', 1, 'Computer Science', 13, 3.5, 'Freshman','2018-01-01', '2022-05-01', 'Bachelor of Science', NULL, NULL), 261 | (2, 'Doe', 'Jane', 1, 'Computer Science', 7, 3.2, 'Freshman', '2018-01-01', '2022-05-01', 'Bachelor of Science', NULL, NULL), 262 | (3, 'Johnson', 'David', 2, 'Mathematics', 7, 3.6, 'Freshman', '2019-01-01', '2022-05-01', 'Bachelor of Arts', 'Mathematics', NULL), 263 | (4, 'Brown', 'Sarah', 3, 'Physics', 7, 3.8, 'Freshman', CURRENT_DATE - INTERVAL '15 years', CURRENT_DATE - INTERVAL '11 years', 'Bachelor of Science', 
'Physics', NULL), 264 | (5, 'Wilson', 'Michael', 1, 'Computer Science', 7, 3.2, 'Freshman', CURRENT_DATE - INTERVAL '13 years', CURRENT_DATE - INTERVAL '9 years', 'Bachelor of Science', NULL, NULL) 265 | ; 266 | 267 | INSERT INTO public.student_record (student_id, course_id, semester, grade, how, transfer_source, earn_credit, repeat_term, test_id, offering_id) VALUES 268 | (1, 1, 1, 'A', 'in-person', NULL, 'Yes', NULL, '1', 1), 269 | (1, 2, 1, 'A', 'in-person', NULL, 'Yes', NULL, '1', 2), 270 | (1, 3, 2, 'A', 'in-person', NULL, 'Yes', NULL, '1', 3), 271 | (1, 4, 2, 'A', 'in-person', NULL, 'Yes', NULL, '1', 4), 272 | (2, 2, 1, 'C', 'in-person', NULL, 'Yes', NULL, '1', 2), 273 | (2, 1, 1, 'B', 'online', NULL, 'Yes', NULL, '1', 1), 274 | (3, 2, 1, 'B+', 'in-person', NULL, 'Yes', NULL, '1', 2), 275 | (3, 4, 2, 'B+', 'in-person', NULL, 'Yes', NULL, '1', 4), 276 | (4, 2, 1, 'C', 'in-person', NULL, 'Yes', NULL, '1', 2), 277 | (4, 1, 1, 'B', 'online', NULL, 'Yes', NULL, '1', 1), 278 | (5, 2, 1, 'B+', 'in-person', NULL, 'Yes', NULL, '1', 2), 279 | (5, 4, 2, 'B+', 'in-person', NULL, 'Yes', NULL, '1', 4) 280 | ; 281 | -------------------------------------------------------------------------------- /defog_data/atis/atis.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | CREATE TABLE public.aircraft ( 11 | aircraft_code text, 12 | aircraft_description text, 13 | manufacturer text, 14 | basic_type text, 15 | engines bigint, 16 | propulsion text, 17 | wide_body text, 18 | wing_span bigint, 19 | length bigint, 20 | weight bigint, 21 | capacity bigint, 22 | pay_load bigint, 23 | cruising_speed bigint, 24 | range_miles bigint, 25 | pressurized text 26 | ); 27 | 28 | 29 | CREATE TABLE public.airline ( 30 | airline_code text, 31 | airline_name text, 32 | note text 33 | ); 34 | 35 | 36 | CREATE TABLE public.airport ( 37 | airport_code text, 38 | airport_name text, 39 | airport_location text, 40 | state_code 
text, 41 | country_name text, 42 | time_zone_code text, 43 | minimum_connect_time bigint 44 | ); 45 | 46 | 47 | CREATE TABLE public.airport_service ( 48 | city_code text, 49 | airport_code text, 50 | miles_distant bigint, 51 | direction text, 52 | minutes_distant bigint 53 | ); 54 | 55 | 56 | CREATE TABLE public.city ( 57 | city_code text, 58 | city_name text, 59 | state_code text, 60 | country_name text, 61 | time_zone_code text 62 | ); 63 | 64 | 65 | CREATE TABLE public.class_of_service ( 66 | booking_class text DEFAULT ''::text NOT NULL, 67 | rank bigint, 68 | class_description text 69 | ); 70 | 71 | 72 | CREATE TABLE public.code_description ( 73 | code text DEFAULT ''::text NOT NULL, 74 | description text 75 | ); 76 | 77 | 78 | CREATE TABLE public.compartment_class ( 79 | compartment text, 80 | class_type text 81 | ); 82 | 83 | 84 | CREATE TABLE public.days ( 85 | days_code text, 86 | day_name text 87 | ); 88 | 89 | 90 | CREATE TABLE public.dual_carrier ( 91 | main_airline text, 92 | low_flight_number bigint, 93 | high_flight_number bigint, 94 | dual_airline text, 95 | service_name text 96 | ); 97 | 98 | 99 | CREATE TABLE public.equipment_sequence ( 100 | aircraft_code_sequence text, 101 | aircraft_code text 102 | ); 103 | 104 | 105 | CREATE TABLE public.fare ( 106 | fare_id bigint DEFAULT '0'::bigint NOT NULL, 107 | from_airport text, 108 | to_airport text, 109 | fare_basis_code text, 110 | fare_airline text, 111 | restriction_code text, 112 | one_direction_cost bigint, 113 | round_trip_cost bigint, 114 | round_trip_required text 115 | ); 116 | 117 | 118 | CREATE TABLE public.fare_basis ( 119 | fare_basis_code text, 120 | booking_class text, 121 | class_type text, 122 | premium text, 123 | economy text, 124 | discounted text, 125 | night text, 126 | season text, 127 | basis_days text 128 | ); 129 | 130 | 131 | CREATE TABLE public.flight ( 132 | flight_id bigint DEFAULT '0'::bigint NOT NULL, 133 | flight_days text, 134 | from_airport text, 135 | to_airport 
text, 136 | departure_time bigint, 137 | arrival_time bigint, 138 | airline_flight text, 139 | airline_code text, 140 | flight_number text, 141 | aircraft_code_sequence text, 142 | meal_code text, 143 | stops bigint, 144 | connections bigint, 145 | dual_carrier text, 146 | time_elapsed bigint 147 | ); 148 | 149 | 150 | CREATE TABLE public.flight_fare ( 151 | flight_id bigint, 152 | fare_id bigint 153 | ); 154 | 155 | 156 | CREATE TABLE public.flight_leg ( 157 | flight_id bigint, 158 | leg_number bigint, 159 | leg_flight bigint 160 | ); 161 | 162 | 163 | CREATE TABLE public.flight_stop ( 164 | flight_id bigint, 165 | stop_number bigint, 166 | stop_days text, 167 | stop_airport text, 168 | arrival_time bigint, 169 | arrival_airline text, 170 | arrival_flight_number text, 171 | departure_time bigint, 172 | departure_airline text, 173 | departure_flight_number text, 174 | stop_time bigint 175 | ); 176 | 177 | 178 | CREATE TABLE public.food_service ( 179 | meal_code text, 180 | meal_number bigint, 181 | compartment text, 182 | meal_description text 183 | ); 184 | 185 | 186 | CREATE TABLE public.ground_service ( 187 | city_code text, 188 | airport_code text, 189 | transport_type text, 190 | ground_fare bigint 191 | ); 192 | 193 | 194 | CREATE TABLE public.month ( 195 | month_number bigint, 196 | month_name text 197 | ); 198 | 199 | 200 | CREATE TABLE public.restriction ( 201 | restriction_code text, 202 | advance_purchase bigint, 203 | stopovers text, 204 | saturday_stay_required text, 205 | minimum_stay bigint, 206 | maximum_stay bigint, 207 | application text, 208 | no_discounts text 209 | ); 210 | 211 | 212 | CREATE TABLE public.state ( 213 | state_code text, 214 | state_name text, 215 | country_name text 216 | ); 217 | 218 | 219 | CREATE TABLE public.time_interval ( 220 | period text, 221 | begin_time bigint, 222 | end_time bigint 223 | ); 224 | 225 | 226 | CREATE TABLE public.time_zone ( 227 | time_zone_code text, 228 | time_zone_name text, 229 | hours_from_gmt 
bigint 230 | ); 231 | 232 | 233 | INSERT INTO public.aircraft (aircraft_code, aircraft_description, manufacturer, basic_type, engines, propulsion, wide_body, wing_span, length, weight, capacity, pay_load, cruising_speed, range_miles, pressurized) VALUES 234 | ('B747', 'The Boeing 747 is a wide-body airliner.', 'Boeing', 'Jet', 4, 'Jet', 'Yes', 224, 231, 987000, 416, 60000, 570, 8555, 'Yes'), 235 | ('A320', 'The Airbus A320 is a narrow-body airliner.', 'Airbus', 'Jet', 2, 'Jet', 'No', 111, 123, 162000, 240, 30000, 511, 3300, 'Yes'), 236 | ('B737', 'The Boeing 737 is a narrow-body airliner.', 'Boeing', 'Jet', 2, 'Jet', 'No', 117, 128, 174200, 230, 35000, 514, 3850, 'Yes'), 237 | ('A380', 'The Airbus A380 is a wide-body airliner.', 'Airbus', 'Jet', 4, 'Jet', 'Yes', 261, 238, 1235000, 853, 140000, 560, 8000, 'Yes'), 238 | ('B777', 'The Boeing 777 is a wide-body airliner.', 'Boeing', 'Jet', 2, 'Jet', 'Yes', 199, 242, 775000, 550, 70000, 560, 8555, 'Yes'), 239 | ('A330', 'The Airbus A330 is a wide-body airliner.', 'Airbus', 'Jet', 2, 'Jet', 'Yes', 197, 193, 503000, 440, 65000, 560, 6350, 'Yes'), 240 | ('B787', 'The Boeing 787 is a wide-body airliner.', 'Boeing', 'Jet', 2, 'Jet', 'Yes', 197, 186, 485000, 330, 55000, 593, 7530, 'Yes'), 241 | ('A350', 'The Airbus A350 is a wide-body airliner.', 'Airbus', 'Jet', 2, 'Jet', 'Yes', 212, 242, 556000, 440, 70000, 568, 8000, 'Yes'), 242 | ('E190', 'The Embraer E190 is a narrow-body airliner.', 'Embraer', 'Jet', 2, 'Jet', 'No', 94, 118, 114000, 114, 15000, 542, 2400, 'Yes'), 243 | ('CRJ200', 'The Bombardier CRJ200 is a regional jet.', 'Bombardier', 'Jet', 2, 'Jet', 'No', 76, 88, 51000, 50, 6300, 534, 1735, 'Yes') 244 | ; 245 | 246 | INSERT INTO public.airline (airline_code, airline_name, note) VALUES 247 | ('AA', 'American Airlines', NULL), 248 | ('UA', 'United Airlines', NULL), 249 | ('DL', 'Delta Air Lines', NULL), 250 | ('WN', 'Southwest Airlines', NULL), 251 | ('AS', 'Alaska Airlines', NULL), 252 | ('B6', 'JetBlue Airways', 
NULL), 253 | ('NK', 'Spirit Airlines', NULL), 254 | ('F9', 'Frontier Airlines', NULL), 255 | ('HA', 'Hawaiian Airlines', NULL), 256 | ('VX', 'Virgin America', NULL) 257 | ; 258 | 259 | INSERT INTO public.airport (airport_code, airport_name, airport_location, state_code, country_name, time_zone_code, minimum_connect_time) VALUES 260 | ('JFK', 'John F. Kennedy International Airport', 'New York City', 'NY', 'United States', 'EST', 23), 261 | ('LAX', 'Los Angeles International Airport', 'Los Angeles', 'CA', 'United States', 'PST', 20), 262 | ('ORD', 'O’Hare International Airport', 'Chicago', 'IL', 'United States', 'CST', 24), 263 | ('DFW', 'Dallas/Fort Worth International Airport', 'Dallas', 'TX', 'United States', 'CST', 40), 264 | ('DEN', 'Denver International Airport', 'Denver', 'CO', 'United States', 'MST', 42), 265 | ('ATL', 'Hartsfield-Jackson Atlanta International Airport', 'Atlanta', 'GA', 'United States', 'EST', 10), 266 | ('SFO', 'San Francisco International Airport', 'San Francisco', 'CA', 'United States', 'PST', 49), 267 | ('SEA', 'Seattle-Tacoma International Airport', 'Seattle', 'WA', 'United States', 'PST', 50), 268 | ('LAS', 'McCarran International Airport', 'Las Vegas', 'NV', 'United States', 'PST', 30), 269 | ('MCO', 'Orlando International Airport', 'Orlando', 'FL', 'United States', 'EST', 50) 270 | ; 271 | 272 | INSERT INTO public.airport_service (city_code, airport_code, miles_distant, direction, minutes_distant) VALUES 273 | ('NYC', 'JFK', 10, 'North', 20), 274 | ('NYC', 'JFK', 20, 'South', 40), 275 | ('NYC', 'JFK', 30, 'East', 60), 276 | ('NYC', 'JFK', 40, 'West', 80), 277 | ('NYC', 'JFK', 50, 'Northeast', 100), 278 | ('NYC', 'JFK', 60, 'Northwest', 120), 279 | ('NYC', 'JFK', 70, 'Southeast', 140), 280 | ('NYC', 'JFK', 80, 'Southwest', 160), 281 | ('NYC', 'JFK', 90, 'North', 180), 282 | ('NYC', 'JFK', 100, 'South', 200), 283 | ('LA', 'LAX', 15, 'West', 30), 284 | ('LA', 'LAX', 25, 'East', 50), 285 | ('DA', 'DAL Love Field', 5, 'North', 10), 286 | 
('DA', 'DAL Love Field', 10, 'South', 20), 287 | ('SF', 'SFO', 12, 'North', 24), 288 | ('SF', 'SFO', 22, 'South', 44) 289 | ; 290 | 291 | INSERT INTO public.city (city_code, city_name, state_code, country_name, time_zone_code) VALUES 292 | ('NYC', 'New York', 'NY', 'United States', 'EST'), 293 | ('LA', 'Los Angeles', 'CA', 'United States', 'PST'), 294 | ('CHI', 'Chicago', 'IL', 'United States', 'CST'), 295 | ('DA', 'Dallas', 'TX', 'United States', 'CST'), 296 | ('DEN', 'Denver', 'CO', 'United States', 'MST'), 297 | ('ATL', 'Atlanta', 'GA', 'United States', 'EST'), 298 | ('SF', 'San Francisco', 'CA', 'United States', 'PST'), 299 | ('SEA', 'Seattle', 'WA', 'United States', 'PST'), 300 | ('LAS', 'Las Vegas', 'NV', 'United States', 'PST'), 301 | ('ORL', 'Orlando', 'FL', 'United States', 'EST') 302 | ; 303 | 304 | INSERT INTO public.class_of_service (booking_class, rank, class_description) VALUES 305 | ('First', 1, 'First Class'), 306 | ('Business', 2, 'Business Class'), 307 | ('Economy', 3, 'Economy Class') 308 | ; 309 | 310 | INSERT INTO public.code_description (code, description) VALUES 311 | ('ABC', 'Code ABC'), 312 | ('DEF', 'Code DEF'), 313 | ('GHI', 'Code GHI'), 314 | ('JKL', 'Code JKL'), 315 | ('MNO', 'Code MNO'), 316 | ('PQR', 'Code PQR'), 317 | ('STU', 'Code STU'), 318 | ('VWX', 'Code VWX'), 319 | ('YZ', 'Code YZ'), 320 | ('AAA', 'Code AAA') 321 | ; 322 | 323 | INSERT INTO public.compartment_class (compartment, class_type) VALUES 324 | ('First', 'First Class'), 325 | ('Business', 'Business Class'), 326 | ('Economy', 'Economy Class') 327 | ; 328 | 329 | INSERT INTO public.days (days_code, day_name) VALUES 330 | ('1', 'Monday'), 331 | ('2', 'Tuesday'), 332 | ('3', 'Wednesday'), 333 | ('4', 'Thursday'), 334 | ('5', 'Friday'), 335 | ('6', 'Saturday'), 336 | ('7', 'Sunday') 337 | ; 338 | 339 | INSERT INTO public.dual_carrier (main_airline, low_flight_number, high_flight_number, dual_airline, service_name) VALUES 340 | ('AA', 1, 10, 'VX', 'Dual Service 1'), 341 | 
('UA', 11, 20, 'DL', 'Dual Service 2'), 342 | ('DL', 21, 30, 'UA', 'Dual Service 3'), 343 | ('WN', 31, 40, 'AS', 'Dual Service 4'), 344 | ('AS', 41, 50, 'WN', 'Dual Service 5'), 345 | ('B6', 51, 60, 'NK', 'Dual Service 6'), 346 | ('NK', 61, 70, 'B6', 'Dual Service 7'), 347 | ('F9', 71, 80, 'HA', 'Dual Service 8'), 348 | ('HA', 81, 90, 'F9', 'Dual Service 9'), 349 | ('VX', 91, 100, 'AA', 'Dual Service 10') 350 | ; 351 | 352 | INSERT INTO public.equipment_sequence (aircraft_code_sequence, aircraft_code) VALUES 353 | ('1', 'B747'), 354 | ('2', 'A320'), 355 | ('3', 'B737'), 356 | ('4', 'A380'), 357 | ('5', 'B777'), 358 | ('6', 'A330'), 359 | ('7', 'B787'), 360 | ('8', 'A350'), 361 | ('9', 'E190'), 362 | ('10', 'CRJ200') 363 | ; 364 | 365 | INSERT INTO public.fare (fare_id, from_airport, to_airport, fare_basis_code, fare_airline, restriction_code, one_direction_cost, round_trip_cost, round_trip_required) VALUES 366 | (1, 'ORD', 'JFK', 'ABC', 'AA', 'NONE', 200, 300, 'Yes'), 367 | (2, 'ORD', 'JFK', 'DEF', 'UA', 'NONE', 150, 280, 'No'), 368 | (3, 'ORD', 'JFK', 'GHI', 'AA', 'NONE', 180, 300, 'No'), 369 | (4, 'ORD', 'JFK', 'JKL', 'WN', 'NONE', 250, 350, 'Yes'), 370 | (5, 'ORD', 'LAX', 'MNO', 'AS', 'BLACKOUT', 220, 400, 'Yes'), 371 | (6, 'JFK', 'ORD', 'PQR', 'AA', 'BLACKOUT', 190, 350, 'Yes'), 372 | (7, 'JFK', 'ORD', 'STU', 'UA', 'NONE', 210, 400, 'Yes'), 373 | (8, 'JFK', 'LAX', 'VWX', 'F9', 'NONE', 230, 400, 'No'), 374 | (9, 'LAX', 'ORD', 'YZ', 'HA', 'NONE', 240, 400, 'No'), 375 | (10, 'LAX', 'ORD', 'AAA', 'VX', 'NONE', 270, 500, 'No') 376 | ; 377 | 378 | INSERT INTO public.fare_basis (fare_basis_code, booking_class, class_type, premium, economy, discounted, night, season, basis_days) VALUES 379 | ('ABC', 'First', 'First Class', 'Yes', 'No', 'No', 'No', 'Regular', '30'), 380 | ('DEF', 'Business', 'Business Class', 'Yes', 'No', 'No', 'No', 'Regular', '30'), 381 | ('GHI', 'Economy', 'Economy Class', 'No', 'Yes', 'Yes', 'No', 'Regular', '30'), 382 | ('JKL', 'First', 'First 
Class', 'Yes', 'No', 'No', 'No', 'Regular', '30'), 383 | ('MNO', 'Business', 'Business Class', 'Yes', 'No', 'No', 'No', 'Regular', '30'), 384 | ('PQR', 'Economy', 'Economy Class', 'No', 'Yes', 'Yes', 'No', 'Regular', '30'), 385 | ('STU', 'First', 'First Class', 'Yes', 'No', 'No', 'No', 'Regular', '30'), 386 | ('VWX', 'Business', 'Business Class', 'Yes', 'No', 'No', 'No', 'Regular', '30'), 387 | ('YZ', 'Economy', 'Economy Class', 'No', 'Yes', 'Yes', 'No', 'Regular', '30'), 388 | ('AAA', 'First', 'First Class', 'Yes', 'No', 'No', 'No', 'Regular', '30') 389 | ; 390 | 391 | INSERT INTO public.flight (flight_id, flight_days, from_airport, to_airport, departure_time, arrival_time, airline_flight, airline_code, flight_number, aircraft_code_sequence, meal_code, stops, connections, dual_carrier, time_elapsed) VALUES 392 | (1, 'mon,wed', 'ORD', 'JFK', 1577836800, 1577840400, 'AA123', 'AA', 'AA123', '1', 'BF', 0, 0, 'AA123', 3600), 393 | (2, 'tue,thu', 'ORD', 'JFK', 1577844000, 1577854000, 'UA456', 'UA', 'UA456', '2', 'LN', 1, 1, 'UA456', 10000), 394 | (3, 'wed', 'ORD', 'JFK', 1577851200, 1577854900, 'AA789', 'AA', 'AA789', '3', 'DN', 0, 0, 'AA789', 3700), 395 | (4, 'thu', 'ORD', 'JFK', 1577858400, 1577873400, 'WN012', 'WN', 'WN012', '4', 'BS', 1, 1, 'WN012', 15000), 396 | (5, 'fri', 'ORD', 'LAX', 1577865600, 1577869600, 'AS345', 'AS', 'AS345', '5', 'BF', 0, 0, 'AS345', 4000), 397 | (6, 'sat,mon', 'JFK', 'ORD', 1577872800, 1577884800, 'AA124', 'AA', 'AA123', '6', 'LN', 1, 1, 'B678', 12000), 398 | (7, 'sun', 'JFK', 'ORD', 1577880000, 1577883700, 'UA457', 'UA', 'UA457', '7', 'DN', 0, 0, 'UA457', 3700), 399 | (8, 'mon', 'JFK', 'LAX', 1577887200, 1577897200, 'F934', 'F9', 'F934', '8', 'BS', 1, 1, 'F934', 10000), 400 | (9, 'tue', 'LAX', 'ORD', 1577894400, 1577898400, 'HA567', 'HA', 'HA567', '9', 'LS', 0, 0, 'HA567', 4000), 401 | (10, 'wed,mon', 'LAX', 'ORD', 1577901600, 1577921600, 'VX890', 'VX', 'VX890', '10', 'DS', 1, 1, 'VX890', 20000) 402 | ; 403 | 404 | INSERT INTO 
public.flight_fare (flight_id, fare_id) VALUES 405 | (1, 1), 406 | (2, 2), 407 | (3, 3), 408 | (4, 4), 409 | (5, 5), 410 | (6, 6), 411 | (7, 7), 412 | (8, 8), 413 | (9, 9), 414 | (10, 10) 415 | ; 416 | 417 | INSERT INTO public.flight_leg (flight_id, leg_number, leg_flight) VALUES 418 | (1, 1, 1), 419 | (2, 1, 2), 420 | (3, 1, 3), 421 | (4, 1, 4), 422 | (5, 1, 5), 423 | (6, 1, 6), 424 | (7, 1, 7), 425 | (8, 1, 8), 426 | (9, 1, 9), 427 | (10, 1, 10) 428 | ; 429 | 430 | INSERT INTO public.flight_stop (flight_id, stop_number, stop_days, stop_airport, arrival_time, arrival_airline, arrival_flight_number, departure_time, departure_airline, departure_flight_number, stop_time) VALUES 431 | (2, 1, '2', 'DFW', 1577847600, 'UA', 'UA456', 1577851200, 'UA', 'UA456', 3600), 432 | (4, 1, '4', 'DEN', 1577862000, 'WN', 'WN012', 1577865600, 'WN', 'WN012', 3600), 433 | (6, 1, '6', 'DFW', 1577876400, 'AA', 'AA123', 1577880000, 'AA', 'AA123', 3600), 434 | (8, 1, '1', 'LAX', 1577890800, 'F9', 'F934', 1577894400, 'F9', 'F934', 3600), 435 | (10, 1, '3', 'JFK', 1577905200, 'VX', 'VX890', 1577908800, 'VX', 'VX890', 3600) 436 | ; 437 | 438 | INSERT INTO public.food_service (meal_code, meal_number, compartment, meal_description) VALUES 439 | ('BF', 1, 'First Class', 'Breakfast'), 440 | ('LN', 2, 'First Class', 'Lunch'), 441 | ('DN', 3, 'First Class', 'Dinner'), 442 | ('BS', 4, 'Economy', 'Breakfast'), 443 | ('LS', 5, 'Economy', 'Lunch'), 444 | ('DS', 6, 'Economy', 'Dinner') 445 | ; 446 | 447 | INSERT INTO public.ground_service (city_code, airport_code, transport_type, ground_fare) VALUES 448 | ('NYC', 'JFK', 'Taxi', 50), 449 | ('NYC', 'JFK', 'Shuttle', 40), 450 | ('NYC', 'JFK', 'Bus', 30), 451 | ('NYC', 'JFK', 'Car Rental', 60), 452 | ('NYC', 'JFK', 'Limousine', 70), 453 | ('NYC', 'JFK', 'Train', 80), 454 | ('NYC', 'JFK', 'Subway', 90), 455 | ('NYC', 'JFK', 'Private Car', 100), 456 | ('NYC', 'JFK', 'Shared Ride', 110), 457 | ('NYC', 'JFK', 'Helicopter', 120) 458 | ; 459 | 460 | INSERT INTO 
public.month (month_number, month_name) VALUES 461 | (1, 'January'), 462 | (2, 'February'), 463 | (3, 'March'), 464 | (4, 'April'), 465 | (5, 'May'), 466 | (6, 'June'), 467 | (7, 'July'), 468 | (8, 'August'), 469 | (9, 'September'), 470 | (10, 'October'), 471 | (11, 'November'), 472 | (12, 'December') 473 | ; 474 | 475 | INSERT INTO public.restriction (restriction_code, advance_purchase, stopovers, saturday_stay_required, minimum_stay, maximum_stay, application, no_discounts) VALUES 476 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 477 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 478 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 479 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 480 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 481 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 482 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 483 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 484 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes'), 485 | ('NONE', 14, '2', 'No', 7, 30, 'One-Way', 'Yes') 486 | ; 487 | 488 | INSERT INTO public.state (state_code, state_name, country_name) VALUES 489 | ('NY', 'New York', 'United States'), 490 | ('CA', 'California', 'United States'), 491 | ('IL', 'Illinois', 'United States'), 492 | ('TX', 'Texas', 'United States'), 493 | ('CO', 'Colorado', 'United States'), 494 | ('GA', 'Georgia', 'United States'), 495 | ('WA', 'Washington', 'United States'), 496 | ('NV', 'Nevada', 'United States'), 497 | ('FL', 'Florida', 'United States') 498 | ; 499 | 500 | INSERT INTO public.time_interval (period, begin_time, end_time) VALUES 501 | ('daily', 1577836800, 1577840400), 502 | ('daily', 1577844000, 1577847600), 503 | ('daily', 1577851200, 1577854800), 504 | ('daily', 1577858400, 1577862000), 505 | ('daily', 1577865600, 1577869200), 506 | ('daily', 1577872800, 1577876400), 507 | ('daily', 1577880000, 1577883600), 508 | ('daily', 1577887200, 1577890800), 509 | ('daily', 1577894400, 1577898000), 510 | ('daily', 
1577901600, 1577905200) 511 | ; 512 | 513 | INSERT INTO public.time_zone (time_zone_code, time_zone_name, hours_from_gmt) VALUES 514 | ('PST', 'Pacific Standard Time', -8), 515 | ('MST', 'Mountain Standard Time', -7), 516 | ('CST', 'Central Standard Time', -6), 517 | ('EST', 'Eastern Standard Time', -5) 518 | ; 519 | 520 | 521 | 522 | -------------------------------------------------------------------------------- /defog_data/broker/broker.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "sbCustomer": [ 4 | { 5 | "data_type": "varchar(20)", 6 | "column_name": "sbCustId", 7 | "column_description": "" 8 | }, 9 | { 10 | "data_type": "varchar(100)", 11 | "column_name": "sbCustName", 12 | "column_description": "" 13 | }, 14 | { 15 | "data_type": "varchar(100)", 16 | "column_name": "sbCustEmail", 17 | "column_description": "" 18 | }, 19 | { 20 | "data_type": "varchar(20)", 21 | "column_name": "sbCustPhone", 22 | "column_description": "" 23 | }, 24 | { 25 | "data_type": "varchar(200)", 26 | "column_name": "sbCustAddress1", 27 | "column_description": "" 28 | }, 29 | { 30 | "data_type": "varchar(200)", 31 | "column_name": "sbCustAddress2", 32 | "column_description": "" 33 | }, 34 | { 35 | "data_type": "varchar(50)", 36 | "column_name": "sbCustCity", 37 | "column_description": "" 38 | }, 39 | { 40 | "data_type": "varchar(20)", 41 | "column_name": "sbCustState", 42 | "column_description": "" 43 | }, 44 | { 45 | "data_type": "varchar(50)", 46 | "column_name": "sbCustCountry", 47 | "column_description": "" 48 | }, 49 | { 50 | "data_type": "varchar(20)", 51 | "column_name": "sbCustPostalCode", 52 | "column_description": "" 53 | }, 54 | { 55 | "data_type": "date", 56 | "column_name": "sbCustJoinDate", 57 | "column_description": "" 58 | }, 59 | { 60 | "data_type": "varchar(20)", 61 | "column_name": "sbCustStatus", 62 | "column_description": "possible values: active, inactive, suspended, closed" 63 | } 64 | ], 
65 | "sbTicker": [ 66 | { 67 | "data_type": "varchar(20)", 68 | "column_name": "sbTickerId", 69 | "column_description": "" 70 | }, 71 | { 72 | "data_type": "varchar(10)", 73 | "column_name": "sbTickerSymbol", 74 | "column_description": "" 75 | }, 76 | { 77 | "data_type": "varchar(100)", 78 | "column_name": "sbTickerName", 79 | "column_description": "" 80 | }, 81 | { 82 | "data_type": "varchar(20)", 83 | "column_name": "sbTickerType", 84 | "column_description": "possible values: stock, etf, mutualfund" 85 | }, 86 | { 87 | "data_type": "varchar(50)", 88 | "column_name": "sbTickerExchange", 89 | "column_description": "" 90 | }, 91 | { 92 | "data_type": "varchar(10)", 93 | "column_name": "sbTickerCurrency", 94 | "column_description": "" 95 | }, 96 | { 97 | "data_type": "varchar(20)", 98 | "column_name": "sbTickerDb2x", 99 | "column_description": "2 letter exchange code" 100 | }, 101 | { 102 | "data_type": "boolean", 103 | "column_name": "sbTickerIsActive", 104 | "column_description": "" 105 | } 106 | ], 107 | "sbDailyPrice": [ 108 | { 109 | "data_type": "varchar(20)", 110 | "column_name": "sbDpTickerId", 111 | "column_description": "" 112 | }, 113 | { 114 | "data_type": "date", 115 | "column_name": "sbDpDate", 116 | "column_description": "" 117 | }, 118 | { 119 | "data_type": "numeric(10,2)", 120 | "column_name": "sbDpOpen", 121 | "column_description": "" 122 | }, 123 | { 124 | "data_type": "numeric(10,2)", 125 | "column_name": "sbDpHigh", 126 | "column_description": "" 127 | }, 128 | { 129 | "data_type": "numeric(10,2)", 130 | "column_name": "sbDpLow", 131 | "column_description": "" 132 | }, 133 | { 134 | "data_type": "numeric(10,2)", 135 | "column_name": "sbDpClose", 136 | "column_description": "" 137 | }, 138 | { 139 | "data_type": "bigint", 140 | "column_name": "sbDpVolume", 141 | "column_description": "" 142 | }, 143 | { 144 | "data_type": "bigint", 145 | "column_name": "sbDpEpochMs", 146 | "column_description": "epoch milliseconds for timestamp" 147 | }, 148 | { 
149 | "data_type": "varchar(50)", 150 | "column_name": "sbDpSource", 151 | "column_description": "" 152 | } 153 | ], 154 | "sbTransaction": [ 155 | { 156 | "data_type": "varchar(50)", 157 | "column_name": "sbTxId", 158 | "column_description": "" 159 | }, 160 | { 161 | "data_type": "varchar(20)", 162 | "column_name": "sbTxCustId", 163 | "column_description": "" 164 | }, 165 | { 166 | "data_type": "varchar(20)", 167 | "column_name": "sbTxTickerId", 168 | "column_description": "" 169 | }, 170 | { 171 | "data_type": "timestamp", 172 | "column_name": "sbTxDateTime", 173 | "column_description": "" 174 | }, 175 | { 176 | "data_type": "varchar(20)", 177 | "column_name": "sbTxType", 178 | "column_description": "possible values: buy, sell" 179 | }, 180 | { 181 | "data_type": "numeric(10,2)", 182 | "column_name": "sbTxShares", 183 | "column_description": "" 184 | }, 185 | { 186 | "data_type": "numeric(10,2)", 187 | "column_name": "sbTxPrice", 188 | "column_description": "" 189 | }, 190 | { 191 | "data_type": "numeric(10,2)", 192 | "column_name": "sbTxAmount", 193 | "column_description": "" 194 | }, 195 | { 196 | "data_type": "varchar(10)", 197 | "column_name": "sbTxCcy", 198 | "column_description": "transaction currency" 199 | }, 200 | { 201 | "data_type": "numeric(10,2)", 202 | "column_name": "sbTxTax", 203 | "column_description": "" 204 | }, 205 | { 206 | "data_type": "numeric(10,2)", 207 | "column_name": "sbTxCommission", 208 | "column_description": "" 209 | }, 210 | { 211 | "data_type": "varchar(10)", 212 | "column_name": "sbTxKpx", 213 | "column_description": "internal code" 214 | }, 215 | { 216 | "data_type": "varchar(25)", 217 | "column_name": "sbTxSettlementDateStr", 218 | "column_description": "settlement date as string in yyyyMMdd HH:mm:ss format. 
NULL if not settled" 219 | }, 220 | { 221 | "data_type": "varchar(10)", 222 | "column_name": "sbTxStatus", 223 | "column_description": "possible values: success, fail, pending" 224 | } 225 | ] 226 | }, 227 | "glossary": "- sbTicker can be joined to sbDailyPrice on sbTickerId\n- sbCustomer can be joined to sbTransaction on sbCustId\n- sbTicker can be joined to sbTransaction on sbTickerId\n- ADV (Average Daily Volume) for a ticker = AVG(sbDpVolume) from sbDailyPrice table for that ticker\n- ATH (All Time High) price for a ticker = MAX(sbDpHigh) from sbDailyPrice table for that ticker\n- ATP (Average Transaction Price) for a customer = SUM(sbTxAmount)/SUM(sbTxShares) from sbTransaction table for that customer\n- NCT (Net Commission Total) = SUM(sbTxCommission) from sbTransaction table" 228 | } -------------------------------------------------------------------------------- /defog_data/car_dealership/car_dealership.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "cars": [ 4 | { 5 | "data_type": "SERIAL", 6 | "column_name": "id", 7 | "column_description": "Primary key for the cars table" 8 | }, 9 | { 10 | "data_type": "TEXT", 11 | "column_name": "make", 12 | "column_description": "Manufacturer of the car" 13 | }, 14 | { 15 | "data_type": "TEXT", 16 | "column_name": "model", 17 | "column_description": "Model name of the car" 18 | }, 19 | { 20 | "data_type": "INTEGER", 21 | "column_name": "year", 22 | "column_description": "Year of manufacture" 23 | }, 24 | { 25 | "data_type": "TEXT", 26 | "column_name": "color", 27 | "column_description": "Color of the car" 28 | }, 29 | { 30 | "data_type": "VARCHAR(17)", 31 | "column_name": "vin_number", 32 | "column_description": "Vehicle Identification Number" 33 | }, 34 | { 35 | "data_type": "TEXT", 36 | "column_name": "engine_type", 37 | "column_description": "Type of engine (e.g., V6, V8, Electric)" 38 | }, 39 | { 40 | "data_type": "TEXT", 41 | "column_name": 
"transmission", 42 | "column_description": "Type of transmission (e.g., Automatic, Manual)" 43 | }, 44 | { 45 | "data_type": "NUMERIC(10, 2)", 46 | "column_name": "cost", 47 | "column_description": "Cost of the car" 48 | }, 49 | { 50 | "data_type": "TIMESTAMP", 51 | "column_name": "crtd_ts", 52 | "column_description": "Timestamp when the car was added to the system" 53 | } 54 | ], 55 | "salespersons": [ 56 | { 57 | "data_type": "SERIAL", 58 | "column_name": "id", 59 | "column_description": "Unique identifier for each salesperson" 60 | }, 61 | { 62 | "data_type": "TEXT", 63 | "column_name": "first_name", 64 | "column_description": "" 65 | }, 66 | { 67 | "data_type": "TEXT", 68 | "column_name": "last_name", 69 | "column_description": "" 70 | }, 71 | { 72 | "data_type": "VARCHAR(255)", 73 | "column_name": "email", 74 | "column_description": "" 75 | }, 76 | { 77 | "data_type": "VARCHAR(20)", 78 | "column_name": "phone", 79 | "column_description": "First 3 digits in brackets is the area code. 
Format: (123)-456-7890" 80 | }, 81 | { 82 | "data_type": "DATE", 83 | "column_name": "hire_date", 84 | "column_description": "" 85 | }, 86 | { 87 | "data_type": "DATE", 88 | "column_name": "termination_date", 89 | "column_description": "" 90 | }, 91 | { 92 | "data_type": "TIMESTAMP", 93 | "column_name": "crtd_ts", 94 | "column_description": "Timestamp when the salesperson record was created" 95 | } 96 | ], 97 | "customers": [ 98 | { 99 | "data_type": "SERIAL", 100 | "column_name": "id", 101 | "column_description": "Primary key for the customers table" 102 | }, 103 | { 104 | "data_type": "TEXT", 105 | "column_name": "first_name", 106 | "column_description": "" 107 | }, 108 | { 109 | "data_type": "TEXT", 110 | "column_name": "last_name", 111 | "column_description": "" 112 | }, 113 | { 114 | "data_type": "VARCHAR(255)", 115 | "column_name": "email", 116 | "column_description": "" 117 | }, 118 | { 119 | "data_type": "VARCHAR(20)", 120 | "column_name": "phone", 121 | "column_description": "" 122 | }, 123 | { 124 | "data_type": "TEXT", 125 | "column_name": "address", 126 | "column_description": "" 127 | }, 128 | { 129 | "data_type": "TEXT", 130 | "column_name": "city", 131 | "column_description": "" 132 | }, 133 | { 134 | "data_type": "TEXT", 135 | "column_name": "state", 136 | "column_description": "" 137 | }, 138 | { 139 | "data_type": "VARCHAR(10)", 140 | "column_name": "zip_code", 141 | "column_description": "" 142 | }, 143 | { 144 | "data_type": "TIMESTAMP", 145 | "column_name": "crtd_ts", 146 | "column_description": "Timestamp when the customer record was created" 147 | } 148 | ], 149 | "sales": [ 150 | { 151 | "data_type": "SERIAL", 152 | "column_name": "id", 153 | "column_description": "Primary key that uniquely identifies each sale" 154 | }, 155 | { 156 | "data_type": "INTEGER", 157 | "column_name": "car_id", 158 | "column_description": "Foreign key referencing the cars table" 159 | }, 160 | { 161 | "data_type": "INTEGER", 162 | "column_name": "salesperson_id", 
163 | "column_description": "Foreign key referencing the salespersons table" 164 | }, 165 | { 166 | "data_type": "INTEGER", 167 | "column_name": "customer_id", 168 | "column_description": "Foreign key referencing the customers table" 169 | }, 170 | { 171 | "data_type": "NUMERIC(10, 2)", 172 | "column_name": "sale_price", 173 | "column_description": "Price at which the car was sold" 174 | }, 175 | { 176 | "data_type": "DATE", 177 | "column_name": "sale_date", 178 | "column_description": "Date when the car was sold" 179 | }, 180 | { 181 | "data_type": "TIMESTAMP", 182 | "column_name": "crtd_ts", 183 | "column_description": "Timestamp when the sale record was created" 184 | } 185 | ], 186 | "inventory_snapshots": [ 187 | { 188 | "data_type": "SERIAL", 189 | "column_name": "id", 190 | "column_description": "Primary key for the inventory_snapshots table" 191 | }, 192 | { 193 | "data_type": "DATE", 194 | "column_name": "snapshot_date", 195 | "column_description": "Date of the inventory snapshot" 196 | }, 197 | { 198 | "data_type": "INTEGER", 199 | "column_name": "car_id", 200 | "column_description": "Foreign key referencing the cars table" 201 | }, 202 | { 203 | "data_type": "BOOLEAN", 204 | "column_name": "is_in_inventory", 205 | "column_description": "Indicates if the car was in inventory on the snapshot date" 206 | }, 207 | { 208 | "data_type": "TIMESTAMP", 209 | "column_name": "crtd_ts", 210 | "column_description": "Timestamp when the inventory snapshot record was created" 211 | } 212 | ], 213 | "payments_received": [ 214 | { 215 | "data_type": "SERIAL", 216 | "column_name": "id", 217 | "column_description": "Uniquely identifies each payment received record" 218 | }, 219 | { 220 | "data_type": "INTEGER", 221 | "column_name": "sale_id", 222 | "column_description": "Foreign key referencing the sales table" 223 | }, 224 | { 225 | "data_type": "DATE", 226 | "column_name": "payment_date", 227 | "column_description": "Date when the payment was received. 
Can take place after the sale date, or in installments." 228 | }, 229 | { 230 | "data_type": "NUMERIC(10, 2)", 231 | "column_name": "payment_amount", 232 | "column_description": "Amount of the payment received. Can be less than the sale price if the payment is made in installments." 233 | }, 234 | { 235 | "data_type": "TEXT", 236 | "column_name": "payment_method", 237 | "column_description": "Method of payment (e.g., cash, check, card, banktrf, trp01)" 238 | }, 239 | { 240 | "data_type": "TIMESTAMP", 241 | "column_name": "crtd_ts", 242 | "column_description": "Timestamp when the payment received record was created" 243 | } 244 | ], 245 | "payments_made": [ 246 | { 247 | "data_type": "SERIAL", 248 | "column_name": "id", 249 | "column_description": "Primary key for the payments_made table" 250 | }, 251 | { 252 | "data_type": "TEXT", 253 | "column_name": "vendor_name", 254 | "column_description": "Name of the vendor to whom the payment was made" 255 | }, 256 | { 257 | "data_type": "DATE", 258 | "column_name": "payment_date", 259 | "column_description": "Date when the payment was made" 260 | }, 261 | { 262 | "data_type": "NUMERIC(10, 2)", 263 | "column_name": "payment_amount", 264 | "column_description": "Amount of the payment made" 265 | }, 266 | { 267 | "data_type": "TEXT", 268 | "column_name": "payment_method", 269 | "column_description": "Method of payment (e.g., check, bank_transfer, credit_card)" 270 | }, 271 | { 272 | "data_type": "VARCHAR(50)", 273 | "column_name": "invoice_number", 274 | "column_description": "Invoice number associated with the payment" 275 | }, 276 | { 277 | "data_type": "DATE", 278 | "column_name": "invoice_date", 279 | "column_description": "Date of the invoice" 280 | }, 281 | { 282 | "data_type": "DATE", 283 | "column_name": "due_date", 284 | "column_description": "Due date of the invoice" 285 | }, 286 | { 287 | "data_type": "TIMESTAMP", 288 | "column_name": "crtd_ts", 289 | "column_description": "Timestamp when the payment made record was 
created" 290 | } 291 | ] 292 | }, 293 | "glossary": "- `cars.id` can be joined with `car_id` from `sales` and `inventory_snapshots` tables\n- `salespersons.id` can be joined with `salesperson_id` from `sales` table\n- `customers.id` can be joined with `customer_id` from `sales` table\n- `sales.id` can be joined with `sale_id` from `payments_received` table\n- Total Sales = SUM(sale_price) from `sales` table\n- Total Payments Received = SUM(payment_amount) from `payments_received` table\n- Total Payments Made = SUM(payment_amount) from `payments_made` table\n- Profit = Total Sales - Total Payments Made\n- Always use sales.sale_date instead of sales.crtd_ts when getting date of sales." 294 | } -------------------------------------------------------------------------------- /defog_data/car_dealership/car_dealership.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE cars ( 2 | id SERIAL PRIMARY KEY, 3 | make TEXT NOT NULL, -- manufacturer of the car 4 | model TEXT NOT NULL, -- model name of the car 5 | year INTEGER NOT NULL, -- year of manufacture 6 | color TEXT NOT NULL, -- color of the car 7 | vin_number VARCHAR(17) NOT NULL UNIQUE, -- Vehicle Identification Number 8 | engine_type TEXT NOT NULL, -- type of engine (e.g., V6, V8, Electric) 9 | transmission TEXT NOT NULL, -- type of transmission (e.g., Automatic, Manual) 10 | cost NUMERIC(10, 2) NOT NULL, -- cost of the car 11 | crtd_ts TIMESTAMP NOT NULL DEFAULT NOW() -- timestamp when the car was added to the system 12 | ); 13 | 14 | CREATE TABLE salespersons ( 15 | id SERIAL PRIMARY KEY, 16 | first_name TEXT NOT NULL, 17 | last_name TEXT NOT NULL, 18 | email VARCHAR(255) NOT NULL UNIQUE, 19 | phone VARCHAR(20) NOT NULL, 20 | hire_date DATE NOT NULL, 21 | termination_date DATE, 22 | crtd_ts TIMESTAMP NOT NULL DEFAULT NOW() 23 | ); 24 | 25 | CREATE TABLE customers ( 26 | id SERIAL PRIMARY KEY, 27 | first_name TEXT NOT NULL, 28 | last_name TEXT NOT NULL, 29 | email 
VARCHAR(255) NOT NULL UNIQUE, 30 | phone VARCHAR(20) NOT NULL, 31 | address TEXT NOT NULL, 32 | city TEXT NOT NULL, 33 | state TEXT NOT NULL, 34 | zip_code VARCHAR(10) NOT NULL, 35 | crtd_ts TIMESTAMP NOT NULL DEFAULT NOW() 36 | ); 37 | 38 | CREATE TABLE sales ( 39 | id SERIAL PRIMARY KEY, 40 | car_id INTEGER NOT NULL REFERENCES cars(id), 41 | salesperson_id INTEGER NOT NULL REFERENCES salespersons(id), 42 | customer_id INTEGER NOT NULL REFERENCES customers(id), 43 | sale_price NUMERIC(10, 2) NOT NULL, 44 | sale_date DATE NOT NULL, 45 | crtd_ts TIMESTAMP NOT NULL DEFAULT NOW() 46 | ); 47 | 48 | CREATE TABLE inventory_snapshots ( 49 | id SERIAL PRIMARY KEY, 50 | snapshot_date DATE NOT NULL, 51 | car_id INTEGER NOT NULL REFERENCES cars(id), 52 | is_in_inventory BOOLEAN NOT NULL, 53 | crtd_ts TIMESTAMP NOT NULL DEFAULT NOW() 54 | ); 55 | 56 | CREATE TABLE payments_received ( 57 | id SERIAL PRIMARY KEY, 58 | sale_id INTEGER NOT NULL REFERENCES sales(id), 59 | payment_date DATE NOT NULL, 60 | payment_amount NUMERIC(10, 2) NOT NULL, 61 | payment_method TEXT NOT NULL, -- values: cash, check, credit_card, debit_card, financing 62 | crtd_ts TIMESTAMP NOT NULL DEFAULT NOW() 63 | ); 64 | 65 | CREATE TABLE payments_made ( 66 | id SERIAL PRIMARY KEY, 67 | vendor_name TEXT NOT NULL, 68 | payment_date DATE NOT NULL, 69 | payment_amount NUMERIC(10, 2) NOT NULL, 70 | payment_method TEXT NOT NULL, -- values: check, bank_transfer, credit_card 71 | invoice_number VARCHAR(50) NOT NULL, 72 | invoice_date DATE NOT NULL, 73 | due_date DATE NOT NULL, 74 | crtd_ts TIMESTAMP NOT NULL DEFAULT NOW() 75 | ); 76 | 77 | 78 | -- cars 79 | INSERT INTO cars (id, make, model, year, color, vin_number, engine_type, transmission, cost) 80 | VALUES 81 | (1, 'Toyota', 'Camry', 2022, 'Silver', '4T1BF1FK3CU510984', 'V6', 'Automatic', 28500.00), 82 | (2, 'Honda', 'Civic', 2021, 'platinum/grey', '2HGFC2F53MH522780', 'Inline 4', 'CVT', 22000.00), 83 | (3, 'Ford', 'Mustang', 2023, 'blue', '1FA6P8TH4M5100001', 
'V8', 'Manual', 45000.00), 84 | (4, 'Tesla', 'Model 3', 2022, 'fuschia', '5YJ3E1EB7MF123456', 'Electric', 'Automatic', 41000.00), 85 | (5, 'Chevrolet', 'Equinox', 2021, 'midnight blue', '2GNAXUEV1M6290124', 'Inline 4', 'Automatic', 26500.00), 86 | (6, 'Nissan', 'Altima', 2022, 'Jet black', '1N4BL4BV4NN123456', 'V6', 'CVT', 25000.00), 87 | (7, 'BMW', 'X5', 2023, 'Titan Silver', '5UXCR6C56M9A12345', 'V8', 'Automatic', 62000.00), 88 | (8, 'Audi', 'A4', 2022, 'Blue', 'WAUBNAF47MA098765', 'Inline 4', 'Automatic', 39000.00), 89 | (9, 'Lexus', 'RX350', 2021, 'Fiery red', '2T2BZMCA7MC143210', 'V6', 'Automatic', 45500.00), 90 | (10, 'Subaru', 'Outback', 2022, 'Jade', '4S4BSANC2N3246801', 'Boxer 4', 'CVT', 28000.00), 91 | (11, 'Mazda', 'CX-5', 2022, 'Royal Purple', 'JM3KE4DY4N0123456', 'Inline 4', 'Automatic', 29000.00), 92 | (12, 'Hyundai', 'Tucson', 2023, 'black', 'KM8J3CAL3NU123456', 'Inline 4', 'Automatic', 32000.00), 93 | (13, 'Kia', 'Sorento', 2021, 'ebony black', '5XYPH4A50MG987654', 'V6', 'Automatic', 32000.00), 94 | (14, 'Jeep', 'Wrangler', 2022, 'Harbor Gray', '1C4HJXDG3NW123456', 'V6', 'Automatic', 38000.00), 95 | (15, 'GMC', 'Sierra 1500', 2023, 'Snow White', '1GTU9CED3NZ123456', 'V8', 'Automatic', 45000.00), 96 | (16, 'Ram', '1500', 2022, 'baby blue', '1C6SRFFT3NN123456', 'V8', 'Automatic', 42000.00), 97 | (17, 'Mercedes-Benz', 'E-Class', 2021, 'Silver', 'W1KZF8DB1MA123456', 'Inline 6', 'Automatic', 62000.00), 98 | (18, 'Volkswagen', 'Tiguan', 2022, 'Red', '3VV2B7AX1NM123456', 'Inline 4', 'Automatic', 32000.00), 99 | (19, 'Volvo', 'XC90', 2023, 'black', 'YV4A22PK3N1234567', 'Inline 4', 'Automatic', 65000.00), 100 | (20, 'Porsche', '911', 2022, 'white', 'WP0AA2A93NS123456', 'Flat 6', 'Automatic', 120000.00), 101 | (21, 'Cadillac', 'Escalade', 2023, 'Black', '1GYS4HKJ3MR123456', 'V8', 'Automatic', 85000.00); 102 | 103 | -- salespersons 104 | INSERT INTO salespersons (id, first_name, last_name, email, phone, hire_date, termination_date) 105 | VALUES 106 | (1, 
'John', 'Doe', 'john.doe@autonation.com', '(555)-123-4567', CURRENT_DATE - INTERVAL '2 years', NULL), 107 | (2, 'Jane', 'Smith', 'jane.smith@autonation.com', '(415)-987-6543', CURRENT_DATE - INTERVAL '3 years', NULL), 108 | (3, 'Michael', 'Johnson', 'michael.johnson@autonation.com', '(555)-456-7890', CURRENT_DATE - INTERVAL '1 year', NULL), 109 | (4, 'Emily', 'Brown', 'emily.brown@sonicauto.com', '(444)-111-2222', CURRENT_DATE - INTERVAL '1 year', CURRENT_DATE - INTERVAL '1 month'), 110 | (5, 'David', 'Wilson', 'david.wilson@sonicauto.com', '(444)-333-4444', CURRENT_DATE - INTERVAL '2 years', NULL), 111 | (6, 'Sarah', 'Taylor', 'sarah.taylor@sonicauto.com', '(123)-555-6666', '2018-09-01', '2022-09-01'), 112 | (7, 'Daniel', 'Anderson', 'daniel.anderson@sonicauto.com', '(555)-777-8888', '2021-07-12', NULL), 113 | (8, 'Olivia', 'Thomas', 'olivia.thomas@pensake.com', '(333)-415-0000', '2023-01-25', '2023-07-25'), 114 | (9, 'James', 'Jackson', 'james.jackson@pensake.com', '(555)-212-3333', '2019-04-30', NULL), 115 | (10, 'Sophia', 'White', 'sophia.white@pensake.com', '(555)-444-5555', '2022-08-18', NULL), 116 | (11, 'Robert', 'Johnson', 'robert.johnson@pensake.com', '(001)-415-5678', CURRENT_DATE - INTERVAL '15 days', NULL), 117 | (12, 'Jennifer', 'Davis', 'jennifer.davis@directauto.com', '(555)-345-6789', CURRENT_DATE - INTERVAL '20 days', NULL), 118 | (13, 'Jessica', 'Rodriguez', 'jessica.rodriguez@directauto.com', '(555)-789-0123', '2022-06-01', NULL); 119 | 120 | -- customers 121 | INSERT INTO customers (id, first_name, last_name, email, phone, address, city, state, zip_code, crtd_ts) 122 | VALUES 123 | (1, 'William', 'Davis', 'william.davis@example.com', '555-888-9999', '123 Main St', 'New York', 'NY', '10001', NOW() - INTERVAL '5 years'), 124 | (2, 'Ava', 'Miller', 'ava.miller@example.com', '555-777-6666', '456 Oak Ave', 'Los Angeles', 'CA', '90001', NOW() - INTERVAL '4 years'), 125 | (3, 'Benjamin', 'Wilson', 'benjamin.wilson@example.com', '555-666-5555', '789 
Elm St', 'Chicago', 'IL', '60007', NOW() - INTERVAL '3 years'), 126 | (4, 'Mia', 'Moore', 'mia.moore@example.com', '555-555-4444', '321 Pine Rd', 'Houston', 'TX', '77001', NOW() - INTERVAL '2 years'), 127 | (5, 'Henry', 'Taylor', 'henry.taylor@example.com', '555-444-3333', '654 Cedar Ln', 'Phoenix', 'AZ', '85001', NOW() - INTERVAL '1 year'), 128 | (6, 'Charlotte', 'Anderson', 'charlotte.anderson@example.com', '555-333-2222', '987 Birch Dr', 'Philadelphia', 'PA', '19019', NOW() - INTERVAL '5 years'), 129 | (7, 'Alexander', 'Thomas', 'alexander.thomas@example.com', '555-222-1111', '741 Walnut St', 'San Antonio', 'TX', '78006', NOW() - INTERVAL '4 years'), 130 | (8, 'Amelia', 'Jackson', 'amelia.jackson@gmail.com', '555-111-0000', '852 Maple Ave', 'San Diego', 'CA', '92101', NOW() - INTERVAL '3 years'), 131 | (9, 'Daniel', 'White', 'daniel.white@youtube.com', '555-000-9999', '963 Oak St', 'Dallas', 'TX', '75001', NOW() - INTERVAL '2 years'), 132 | (10, 'Abigail', 'Harris', 'abigail.harris@company.io', '555-999-8888', '159 Pine Ave', 'San Jose', 'CA', '95101', NOW() - INTERVAL '1 year'), 133 | (11, 'Christopher', 'Brown', 'christopher.brown@ai.com', '555-456-7890', '753 Maple Rd', 'Miami', 'FL', '33101', NOW() - INTERVAL '5 months'), 134 | (12, 'Sophia', 'Lee', 'sophia.lee@microsoft.com', '555-567-8901', '951 Oak Ln', 'Seattle', 'WA', '98101', NOW() - INTERVAL '6 months'), 135 | (13, 'Michael', 'Chen', 'michael.chen@company.com', '(555)-456-7890', '123 Oak St', 'San Francisco', 'CA', '94101', NOW() - INTERVAL '3 months'); 136 | 137 | -- sales 138 | INSERT INTO sales (id, car_id, salesperson_id, customer_id, sale_price, sale_date) 139 | VALUES 140 | (1, 1, 2, 3, 30500.00, '2023-03-15'), 141 | (2, 3, 1, 5, 47000.00, '2023-03-20'), 142 | (3, 6, 4, 2, 26500.00, '2023-03-22'), 143 | (4, 8, 7, 9, 38000.00, '2023-03-25'), 144 | (5, 2, 4, 7, 23500.00, '2023-03-28'), 145 | (6, 10, 6, 1, 30000.00, '2023-04-01'), 146 | (7, 5, 3, 6, 26800.00, '2023-04-05'), 147 | (8, 7, 2, 10, 
63000.00, '2023-04-10'), 148 | (9, 4, 6, 8, 42500.00, '2023-04-12'), 149 | (10, 9, 2, 4, 44500.00, '2023-04-15'), 150 | (11, 1, 7, 11, 28900.00, CURRENT_DATE - INTERVAL '32 days'), 151 | (12, 3, 3, 12, 46500.00, CURRENT_DATE - INTERVAL '10 days'), 152 | (13, 6, 1, 11, 26000.00, CURRENT_DATE - INTERVAL '15 days'), 153 | (14, 2, 3, 1, 23200.00, CURRENT_DATE - INTERVAL '21 days'), 154 | (15, 8, 6, 12, 43500.00, CURRENT_DATE - INTERVAL '3 days'), 155 | (16, 10, 4, 2, 29500.00, CURRENT_DATE - INTERVAL '5 days'), 156 | (17, 3, 2, 3, 46000.00, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '1 week' + INTERVAL '1 day'), 157 | (18, 3, 2, 7, 47500.00, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '1 week'), 158 | (19, 3, 2, 10, 46500.00, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '1 week' - INTERVAL '1 day'), 159 | (20, 4, 1, 3, 48000.00, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '8 week' + INTERVAL '1 day'), 160 | (21, 4, 1, 7, 45000.00, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '8 week'), 161 | (22, 4, 1, 10, 49000.00, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '8 week' - INTERVAL '1 day'); 162 | 163 | 164 | -- inventory_snapshots 165 | INSERT INTO inventory_snapshots (id, snapshot_date, car_id, is_in_inventory) 166 | VALUES 167 | (1, '2023-03-15', 1, TRUE), 168 | (2, '2023-03-15', 2, TRUE), 169 | (3, '2023-03-15', 3, TRUE), 170 | (4, '2023-03-15', 4, TRUE), 171 | (5, '2023-03-15', 5, TRUE), 172 | (6, '2023-03-15', 6, TRUE), 173 | (7, '2023-03-15', 7, TRUE), 174 | (8, '2023-03-15', 8, TRUE), 175 | (9, '2023-03-15', 9, TRUE), 176 | (10, '2023-03-15', 10, TRUE), 177 | (11, '2023-03-20', 1, FALSE), 178 | (12, '2023-03-20', 3, FALSE), 179 | (13, '2023-03-22', 6, FALSE), 180 | (14, '2023-03-25', 8, FALSE), 181 | (15, '2023-03-28', 2, FALSE), 182 | (16, '2023-04-01', 10, FALSE), 183 | (17, '2023-04-05', 5, FALSE), 184 | (18, '2023-04-10', 7, FALSE), 185 | (19, '2023-04-12', 4, FALSE), 186 | (20, '2023-04-15', 9, FALSE), 187 | (21, '2023-03-28', 1, TRUE), 188 | (22, 
'2023-03-28', 3, TRUE), 189 | (23, '2023-03-28', 4, FALSE); 190 | 191 | -- payments_received 192 | INSERT INTO payments_received (id, sale_id, payment_date, payment_amount, payment_method) 193 | VALUES 194 | (1, 1, '2023-03-15', 5000.00, 'check'), 195 | (2, 1, '2023-03-20', 22500.00, 'financing'), 196 | (3, 2, '2023-03-20', 44000.00, 'credit_card'), 197 | (4, 3, '2023-03-22', 24500.00, 'debit_card'), 198 | (5, 4, '2023-03-25', 38000.00, 'financing'), 199 | (6, 5, '2023-03-28', 21500.00, 'cash'), 200 | (7, 6, '2023-04-01', 27000.00, 'credit_card'), 201 | (8, 7, '2023-04-05', 26000.00, 'debit_card'), 202 | (9, 8, '2023-04-10', 60000.00, 'financing'), 203 | (10, 9, '2023-04-12', 40000.00, 'check'), 204 | (11, 10, '2023-04-15', 44500.00, 'credit_card'), 205 | (12, 11, CURRENT_DATE - INTERVAL '30 days', 28000.00, 'cash'), 206 | (13, 12, CURRENT_DATE - INTERVAL '3 days', 43500.00, 'credit_card'), 207 | (14, 13, CURRENT_DATE - INTERVAL '6 days', 24000.00, 'debit_card'), 208 | (15, 14, CURRENT_DATE - INTERVAL '1 days', 17200.00, 'financing'), 209 | (16, 15, CURRENT_DATE - INTERVAL '1 days', 37500.00, 'credit_card'), 210 | (17, 16, CURRENT_DATE - INTERVAL '5 days', 26500.00, 'debit_card'), 211 | (18, 17, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '1 week' + INTERVAL '1 day', 115000.00, 'financing'), 212 | (19, 18, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '1 week', 115000.00, 'credit_card'), 213 | (20, 19, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '1 week' - INTERVAL '1 day', 115000.00, 'debit_card'), 214 | (21, 20, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '8 week' + INTERVAL '1 day', 115000.00, 'cash'), 215 | (22, 21, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '8 week', 115000.00, 'check'), 216 | (23, 22, DATE_TRUNC('week', CURRENT_DATE) - INTERVAL '8 week' - INTERVAL '1 day', 115000.00, 'credit_card'); 217 | 218 | -- payments_made 219 | INSERT INTO payments_made (id, vendor_name, payment_date, payment_amount, payment_method, invoice_number, invoice_date, 
due_date) 220 | VALUES 221 | (1, 'Car Manufacturer Inc', '2023-03-01', 150000.00, 'bank_transfer', 'INV-001', '2023-02-25', '2023-03-25'), 222 | (2, 'Auto Parts Supplier', '2023-03-10', 25000.00, 'check', 'INV-002', '2023-03-05', '2023-04-04'), 223 | (3, 'Utility Company', '2023-03-15', 1500.00, 'bank_transfer', 'INV-003', '2023-03-01', '2023-03-31'), 224 | (4, 'Marketing Agency', '2023-03-20', 10000.00, 'credit_card', 'INV-004', '2023-03-15', '2023-04-14'), 225 | (5, 'Insurance Provider', '2023-03-25', 5000.00, 'bank_transfer', 'INV-005', '2023-03-20', '2023-04-19'), 226 | (6, 'Cleaning Service', '2023-03-31', 2000.00, 'check', 'INV-006', '2023-03-25', '2023-04-24'), 227 | (7, 'Car Manufacturer Inc', '2023-04-01', 200000.00, 'bank_transfer', 'INV-007', '2023-03-25', '2023-04-24'), 228 | (8, 'Auto Parts Supplier', '2023-04-10', 30000.00, 'check', 'INV-008', '2023-04-05', '2023-05-05'), 229 | (9, 'Utility Company', '2023-04-15', 1500.00, 'bank_transfer', 'INV-009', '2023-04-01', '2023-04-30'), 230 | (10, 'Marketing Agency', '2023-04-20', 15000.00, 'credit_card', 'INV-010', '2023-04-15', '2023-05-15'), 231 | (11, 'Insurance Provider', '2023-04-25', 5000.00, 'bank_transfer', 'INV-011', '2023-04-20', '2023-05-20'), 232 | (12, 'Cleaning Service', '2023-04-30', 2000.00, 'check', 'INV-012', '2023-04-25', '2023-05-25'), 233 | (13, 'Toyota Auto Parts', CURRENT_DATE - INTERVAL '5 days', 12500.00, 'bank_transfer', 'INV-013', CURRENT_DATE - INTERVAL '10 days', CURRENT_DATE + INTERVAL '20 days'), 234 | (14, 'Honda Manufacturing', CURRENT_DATE - INTERVAL '3 days', 18000.00, 'check', 'INV-014', CURRENT_DATE - INTERVAL '8 days', CURRENT_DATE + INTERVAL '22 days'), 235 | (15, 'Ford Supplier Co', CURRENT_DATE - INTERVAL '2 days', 22000.00, 'bank_transfer', 'INV-015', CURRENT_DATE - INTERVAL '7 days', CURRENT_DATE + INTERVAL '23 days'), 236 | (16, 'Tesla Parts Inc', CURRENT_DATE - INTERVAL '1 day', 15000.00, 'credit_card', 'INV-016', CURRENT_DATE - INTERVAL '6 days', CURRENT_DATE + 
INTERVAL '24 days'), 237 | (17, 'Chevrolet Auto', CURRENT_DATE, 20000.00, 'bank_transfer', 'INV-017', CURRENT_DATE - INTERVAL '5 days', CURRENT_DATE + INTERVAL '25 days'); -------------------------------------------------------------------------------- /defog_data/derm_treatment/derm_treatment.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "doctors": [ 4 | { 5 | "data_type": "SERIAL", 6 | "column_name": "doc_id", 7 | "column_description": "" 8 | }, 9 | { 10 | "data_type": "VARCHAR(50)", 11 | "column_name": "first_name", 12 | "column_description": "" 13 | }, 14 | { 15 | "data_type": "VARCHAR(50)", 16 | "column_name": "last_name", 17 | "column_description": "" 18 | }, 19 | { 20 | "data_type": "TEXT", 21 | "column_name": "specialty", 22 | "column_description": "possible values: dermatology, immunology, general" 23 | }, 24 | { 25 | "data_type": "INT", 26 | "column_name": "year_reg", 27 | "column_description": "year the doctor was registered and obtained license" 28 | }, 29 | { 30 | "data_type": "VARCHAR(100)", 31 | "column_name": "med_school_name", 32 | "column_description": "" 33 | }, 34 | { 35 | "data_type": "VARCHAR(50)", 36 | "column_name": "loc_city", 37 | "column_description": "" 38 | }, 39 | { 40 | "data_type": "CHAR(2)", 41 | "column_name": "loc_state", 42 | "column_description": "" 43 | }, 44 | { 45 | "data_type": "VARCHAR(10)", 46 | "column_name": "loc_zip", 47 | "column_description": "" 48 | }, 49 | { 50 | "data_type": "VARCHAR(20)", 51 | "column_name": "bd_cert_num", 52 | "column_description": "board certification number" 53 | } 54 | ], 55 | "patients": [ 56 | { 57 | "data_type": "SERIAL", 58 | "column_name": "patient_id", 59 | "column_description": "" 60 | }, 61 | { 62 | "data_type": "VARCHAR(50)", 63 | "column_name": "first_name", 64 | "column_description": "" 65 | }, 66 | { 67 | "data_type": "VARCHAR(50)", 68 | "column_name": "last_name", 69 | "column_description": "" 70 | }, 71 | 
{ 72 | "data_type": "DATE", 73 | "column_name": "date_of_birth", 74 | "column_description": "" 75 | }, 76 | { 77 | "data_type": "DATE", 78 | "column_name": "date_of_registration", 79 | "column_description": "" 80 | }, 81 | { 82 | "data_type": "VARCHAR(10)", 83 | "column_name": "gender", 84 | "column_description": "Male, Female, Others" 85 | }, 86 | { 87 | "data_type": "VARCHAR(100)", 88 | "column_name": "email", 89 | "column_description": "" 90 | }, 91 | { 92 | "data_type": "VARCHAR(20)", 93 | "column_name": "phone", 94 | "column_description": "" 95 | }, 96 | { 97 | "data_type": "VARCHAR(100)", 98 | "column_name": "addr_street", 99 | "column_description": "" 100 | }, 101 | { 102 | "data_type": "VARCHAR(50)", 103 | "column_name": "addr_city", 104 | "column_description": "" 105 | }, 106 | { 107 | "data_type": "CHAR(2)", 108 | "column_name": "addr_state", 109 | "column_description": "" 110 | }, 111 | { 112 | "data_type": "VARCHAR(10)", 113 | "column_name": "addr_zip", 114 | "column_description": "" 115 | }, 116 | { 117 | "data_type": "TEXT", 118 | "column_name": "ins_type", 119 | "column_description": "Insurance type. 
Possible values: private, medicare, medicaid, uninsured" 120 | }, 121 | { 122 | "data_type": "VARCHAR(20)", 123 | "column_name": "ins_policy_num", 124 | "column_description": "Insurance policy number" 125 | }, 126 | { 127 | "data_type": "FLOAT", 128 | "column_name": "height_cm", 129 | "column_description": "" 130 | }, 131 | { 132 | "data_type": "FLOAT", 133 | "column_name": "weight_kg", 134 | "column_description": "" 135 | } 136 | ], 137 | "drugs": [ 138 | { 139 | "data_type": "SERIAL", 140 | "column_name": "drug_id", 141 | "column_description": "" 142 | }, 143 | { 144 | "data_type": "VARCHAR(100)", 145 | "column_name": "drug_name", 146 | "column_description": "" 147 | }, 148 | { 149 | "data_type": "VARCHAR(100)", 150 | "column_name": "manufacturer", 151 | "column_description": "" 152 | }, 153 | { 154 | "data_type": "TEXT", 155 | "column_name": "drug_type", 156 | "column_description": "possible values: biologic, small molecule, topical" 157 | }, 158 | { 159 | "data_type": "TEXT", 160 | "column_name": "moa", 161 | "column_description": "mechanism of action" 162 | }, 163 | { 164 | "data_type": "DATE", 165 | "column_name": "fda_appr_dt", 166 | "column_description": "FDA approval date. NULL if drug is still under trial." 
167 | }, 168 | { 169 | "data_type": "TEXT", 170 | "column_name": "admin_route", 171 | "column_description": "possible values: oral, injection, topical" 172 | }, 173 | { 174 | "data_type": "DECIMAL(10,2)", 175 | "column_name": "dos_amt", 176 | "column_description": "recommended dosage amount" 177 | }, 178 | { 179 | "data_type": "VARCHAR(20)", 180 | "column_name": "dos_unit", 181 | "column_description": "recommended dosage unit" 182 | }, 183 | { 184 | "data_type": "INT", 185 | "column_name": "dos_freq_hrs", 186 | "column_description": "recommended number of hours between dosages" 187 | }, 188 | { 189 | "data_type": "VARCHAR(20)", 190 | "column_name": "ndc", 191 | "column_description": "National Drug Code" 192 | } 193 | ], 194 | "diagnoses": [ 195 | { 196 | "data_type": "SERIAL", 197 | "column_name": "diag_id", 198 | "column_description": "" 199 | }, 200 | { 201 | "data_type": "VARCHAR(10)", 202 | "column_name": "diag_code", 203 | "column_description": "" 204 | }, 205 | { 206 | "data_type": "VARCHAR(100)", 207 | "column_name": "diag_name", 208 | "column_description": "" 209 | }, 210 | { 211 | "data_type": "TEXT", 212 | "column_name": "diag_desc", 213 | "column_description": "" 214 | } 215 | ], 216 | "treatments": [ 217 | { 218 | "data_type": "SERIAL", 219 | "column_name": "treatment_id", 220 | "column_description": "" 221 | }, 222 | { 223 | "data_type": "INT", 224 | "column_name": "patient_id", 225 | "column_description": "" 226 | }, 227 | { 228 | "data_type": "INT", 229 | "column_name": "doc_id", 230 | "column_description": "" 231 | }, 232 | { 233 | "data_type": "INT", 234 | "column_name": "drug_id", 235 | "column_description": "" 236 | }, 237 | { 238 | "data_type": "INT", 239 | "column_name": "diag_id", 240 | "column_description": "" 241 | }, 242 | { 243 | "data_type": "DATE", 244 | "column_name": "start_dt", 245 | "column_description": "" 246 | }, 247 | { 248 | "data_type": "DATE", 249 | "column_name": "end_dt", 250 | "column_description": "NULL if treatment is 
ongoing" 251 | }, 252 | { 253 | "data_type": "BOOLEAN", 254 | "column_name": "is_placebo", 255 | "column_description": "" 256 | }, 257 | { 258 | "data_type": "DECIMAL(10,2)", 259 | "column_name": "tot_drug_amt", 260 | "column_description": "" 261 | }, 262 | { 263 | "data_type": "TEXT", 264 | "column_name": "drug_unit", 265 | "column_description": "possible values: mg, ml, g" 266 | } 267 | ], 268 | "outcomes": [ 269 | { 270 | "data_type": "SERIAL", 271 | "column_name": "outcome_id", 272 | "column_description": "" 273 | }, 274 | { 275 | "data_type": "INT", 276 | "column_name": "treatment_id", 277 | "column_description": "" 278 | }, 279 | { 280 | "data_type": "DATE", 281 | "column_name": "assess_dt", 282 | "column_description": "" 283 | }, 284 | { 285 | "data_type": "INT", 286 | "column_name": "day7_lesion_cnt", 287 | "column_description": "lesion counts on day 7." 288 | }, 289 | { 290 | "data_type": "INT", 291 | "column_name": "day30_lesion_cnt", 292 | "column_description": "" 293 | }, 294 | { 295 | "data_type": "INT", 296 | "column_name": "day100_lesion_cnt", 297 | "column_description": "" 298 | }, 299 | { 300 | "data_type": "DECIMAL(4,1)", 301 | "column_name": "day7_pasi_score", 302 | "column_description": "PASI score range 0-72" 303 | }, 304 | { 305 | "data_type": "DECIMAL(4,1)", 306 | "column_name": "day30_pasi_score", 307 | "column_description": "" 308 | }, 309 | { 310 | "data_type": "DECIMAL(4,1)", 311 | "column_name": "day100_pasi_score", 312 | "column_description": "" 313 | }, 314 | { 315 | "data_type": "DECIMAL(5,2)", 316 | "column_name": "day7_tewl", 317 | "column_description": "in g/m^2/h" 318 | }, 319 | { 320 | "data_type": "DECIMAL(5,2)", 321 | "column_name": "day30_tewl", 322 | "column_description": "" 323 | }, 324 | { 325 | "data_type": "DECIMAL(5,2)", 326 | "column_name": "day100_tewl", 327 | "column_description": "" 328 | }, 329 | { 330 | "data_type": "INT", 331 | "column_name": "day7_itch_vas", 332 | "column_description": "visual analog scale 0-100" 
333 | }, 334 | { 335 | "data_type": "INT", 336 | "column_name": "day30_itch_vas", 337 | "column_description": "" 338 | }, 339 | { 340 | "data_type": "INT", 341 | "column_name": "day100_itch_vas", 342 | "column_description": "" 343 | }, 344 | { 345 | "data_type": "DECIMAL(4,1)", 346 | "column_name": "day7_hfg", 347 | "column_description": "hair growth factor range 0-5" 348 | }, 349 | { 350 | "data_type": "DECIMAL(4,1)", 351 | "column_name": "day30_hfg", 352 | "column_description": "" 353 | }, 354 | { 355 | "data_type": "DECIMAL(4,1)", 356 | "column_name": "day100_hfg", 357 | "column_description": "" 358 | } 359 | ], 360 | "adverse_events": [ 361 | { 362 | "data_type": "SERIAL", 363 | "column_name": "id", 364 | "column_description": "1 row per adverse event per treatment_id" 365 | }, 366 | { 367 | "data_type": "INT", 368 | "column_name": "treatment_id", 369 | "column_description": "" 370 | }, 371 | { 372 | "data_type": "DATE", 373 | "column_name": "reported_dt", 374 | "column_description": "" 375 | }, 376 | { 377 | "data_type": "TEXT", 378 | "column_name": "description", 379 | "column_description": "" 380 | } 381 | ], 382 | "concomitant_meds": [ 383 | { 384 | "data_type": "SERIAL", 385 | "column_name": "id", 386 | "column_description": "1 row per med per treatment_id" 387 | }, 388 | { 389 | "data_type": "INT", 390 | "column_name": "treatment_id", 391 | "column_description": "" 392 | }, 393 | { 394 | "data_type": "VARCHAR(100)", 395 | "column_name": "med_name", 396 | "column_description": "" 397 | }, 398 | { 399 | "data_type": "TEXT", 400 | "column_name": "start_dt", 401 | "column_description": "YYYY-MM-DD" 402 | }, 403 | { 404 | "data_type": "TEXT", 405 | "column_name": "end_dt", 406 | "column_description": "YYYY-MM-DD NULL if still taking" 407 | }, 408 | { 409 | "data_type": "DECIMAL(10,2)", 410 | "column_name": "dose_amt", 411 | "column_description": "" 412 | }, 413 | { 414 | "data_type": "TEXT", 415 | "column_name": "dose_unit", 416 | "column_description": 
"possible values: mg, ml, g" 417 | }, 418 | { 419 | "data_type": "INT", 420 | "column_name": "freq_hrs", 421 | "column_description": "" 422 | } 423 | ] 424 | }, 425 | "glossary": "- All string columns should be matched exactly unless specified otherwise\n- `patients.first_name`, `patients.last_name`, `doctors.first_name`, `doctors.last_name` can be filtered with ILIKE '%%'\n- `drugs.drug_name`, `diagnoses.diag_name` should be matched case insensitively\n- day30_* will be empty if treatment has yet to progress to day 30. same for day100.\n- PASI75D30 (75% reduction in PASI score on day 30) = COUNT(CASE WHEN day30_pasi_score <= 0.25 * day0_pasi_score THEN 1 END) / COUNT(day0_pasi_score)\n- Mean change in TEWL = AVG(day30_tewl) - AVG(day0_tewl) \n- Proportion with improved itch = COUNT(CASE WHEN day30_itch_vas < day0_itch_vas THEN 1 END) / COUNT(*)\n- Mean percent change in hair growth = AVG((day30_hfg - day0_hfg) / day0_hfg * 100)\n\nAdditional outcomes documented:\n- day0_pasi_score: Baseline PASI score before treatment \n- day0_tewl: Baseline TEWL before treatment\n- day0_itch_vas: Baseline itch VAS before treatment \n- day0_hfg: Baseline hair growth factor before treatment" 426 | } -------------------------------------------------------------------------------- /defog_data/derm_treatment/derm_treatment.sql: -------------------------------------------------------------------------------- 1 | -- doctor dimension table 2 | CREATE TABLE doctors ( 3 | doc_id SERIAL PRIMARY KEY, 4 | first_name VARCHAR(50), 5 | last_name VARCHAR(50), 6 | specialty TEXT, -- possible values: dermatology, immunology, general, oncology 7 | year_reg INT, -- year the doctor was registered and obtained license 8 | med_school_name VARCHAR(100), 9 | loc_city VARCHAR(50), 10 | loc_state CHAR(2), 11 | loc_zip VARCHAR(10), 12 | bd_cert_num VARCHAR(20) -- board certification number 13 | ); 14 | 15 | -- patient dimension table 16 | CREATE TABLE patients ( 17 | patient_id SERIAL PRIMARY KEY, 18 | 
first_name VARCHAR(50), 19 | last_name VARCHAR(50), 20 | date_of_birth DATE, 21 | date_of_registration DATE, 22 | gender VARCHAR(10), -- Male, Female, Others 23 | email VARCHAR(100), 24 | phone VARCHAR(20), 25 | addr_street VARCHAR(100), 26 | addr_city VARCHAR(50), 27 | addr_state CHAR(2), 28 | addr_zip VARCHAR(10), 29 | ins_type TEXT, -- possible values: private, medicare, medicaid, uninsured 30 | ins_policy_num VARCHAR(20), 31 | height_cm FLOAT, 32 | weight_kg FLOAT 33 | ); 34 | 35 | -- drug dimension table 36 | CREATE TABLE drugs ( 37 | drug_id SERIAL PRIMARY KEY, 38 | drug_name VARCHAR(100), 39 | manufacturer VARCHAR(100), 40 | drug_type TEXT, -- possible values: biologic, small molecule, topical 41 | moa TEXT, -- mechanism of action 42 | fda_appr_dt DATE, -- FDA approval date. NULL if drug is still under trial. 43 | admin_route TEXT, -- possible values: oral, injection, topical 44 | dos_amt DECIMAL(10,2), 45 | dos_unit VARCHAR(20), 46 | dos_freq_hrs INT, 47 | ndc VARCHAR(20) -- National Drug Code 48 | ); 49 | 50 | -- diagnosis dimension table 51 | CREATE TABLE diagnoses ( 52 | diag_id SERIAL PRIMARY KEY, 53 | diag_code VARCHAR(10), 54 | diag_name VARCHAR(100), 55 | diag_desc TEXT 56 | ); 57 | 58 | -- treatment fact table 59 | CREATE TABLE treatments ( 60 | treatment_id SERIAL PRIMARY KEY, 61 | patient_id INT REFERENCES patients(patient_id), 62 | doc_id INT REFERENCES doctors(doc_id), 63 | drug_id INT REFERENCES drugs(drug_id), 64 | diag_id INT REFERENCES diagnoses(diag_id), 65 | start_dt DATE, 66 | end_dt DATE, -- NULL if treatment is ongoing 67 | is_placebo BOOLEAN, 68 | tot_drug_amt DECIMAL(10,2), 69 | drug_unit TEXT -- possible values: mg, ml, g 70 | ); 71 | 72 | -- outcome fact table 73 | CREATE TABLE outcomes ( 74 | outcome_id SERIAL PRIMARY KEY, 75 | treatment_id INT REFERENCES treatments(treatment_id), 76 | assess_dt DATE, 77 | day7_lesion_cnt INT, -- lesion counts on day 7. 
78 | day30_lesion_cnt INT, 79 | day100_lesion_cnt INT, 80 | day7_pasi_score DECIMAL(4,1), -- PASI score range 0-72 81 | day30_pasi_score DECIMAL(4,1), 82 | day100_pasi_score DECIMAL(4,1), 83 | day7_tewl DECIMAL(5,2), -- in g/m^2/h 84 | day30_tewl DECIMAL(5,2), 85 | day100_tewl DECIMAL(5,2), 86 | day7_itch_vas INT, -- visual analog scale 0-100 87 | day30_itch_vas INT, 88 | day100_itch_vas INT, 89 | day7_hfg DECIMAL(4,1), -- hair growth factor range 0-5 90 | day30_hfg DECIMAL(4,1), 91 | day100_hfg DECIMAL(4,1) 92 | ); 93 | 94 | CREATE TABLE adverse_events ( 95 | id SERIAL PRIMARY KEY, -- 1 row per adverse event per treatment_id 96 | treatment_id INT REFERENCES treatments(treatment_id), 97 | reported_dt DATE, 98 | description TEXT 99 | ); 100 | 101 | CREATE TABLE concomitant_meds ( 102 | id SERIAL PRIMARY KEY, -- 1 row per med per treatment_id 103 | treatment_id INT REFERENCES treatments(treatment_id), 104 | med_name VARCHAR(100), 105 | start_dt TEXT, -- YYYY-MM-DD 106 | end_dt TEXT, -- YYYY-MM-DD NULL if still taking 107 | dose_amt DECIMAL(10,2), 108 | dose_unit TEXT, -- possible values: mg, ml, g 109 | freq_hrs INT 110 | ); 111 | 112 | -- insert into dimension tables first 113 | 114 | INSERT INTO doctors (doc_id, first_name, last_name, specialty, year_reg, med_school_name, loc_city, loc_state, loc_zip, bd_cert_num) 115 | VALUES 116 | (1, 'John', 'Doe', 'dermatology', EXTRACT(YEAR FROM CURRENT_DATE) - 2, 'Johns Hopkins University', 'Baltimore', 'MD', '21201', 'ABC123'), 117 | (2,'Jane', 'Smith', 'immunology', EXTRACT(YEAR FROM CURRENT_DATE) - 2, 'Harvard Medical School', 'Boston', 'MA', '02115', 'XYZ789'), 118 | (3, 'David', 'Johnson', 'general', 1998, 'University of Pennsylvania', 'Philadelphia', 'PA', '19104', 'DEF456'), 119 | (4, 'Emily', 'Brown', 'dermatology', 2015, 'Stanford University', 'Palo Alto', 'CA', '94304', 'GHI012'), 120 | (5, 'Michael', 'Davis', 'immunology', 2008, 'Duke University', 'Durham', 'NC', '27708', 'JKL345'), 121 | (6, 'Sarah', 'Wilson', 
'oncology', EXTRACT(YEAR FROM CURRENT_DATE) - 1, 'University of California, San Francisco', 'San Francisco', 'CA', '94143', 'MNO678'), 122 | (7, 'Robert', 'Taylor', 'dermatology', 2012, 'Yale University', 'New Haven', 'CT', '06510', 'PQR901'), 123 | (8, 'Laura', 'Martinez', 'immunology', 2006, 'University of Michigan', 'Ann Arbor', 'MI', '48109', 'STU234'), 124 | (9, 'Daniel', 'Garcia', 'general', EXTRACT(YEAR FROM CURRENT_DATE) - 3, 'University of Chicago', 'Chicago', 'IL', '60637', 'VWX567'), 125 | (10, 'Olivia', 'Anderson', 'dermatology', 2018, 'Columbia University', 'New York', 'NY', '10027', 'YZA890'); 126 | 127 | INSERT INTO patients (patient_id, first_name, last_name, date_of_birth, date_of_registration, gender, email, phone, addr_street, addr_city, addr_state, addr_zip, ins_type, ins_policy_num, height_cm, weight_kg) 128 | VALUES 129 | (1, 'Alice', 'Johnson', '1985-03-15', '2023-01-03', 'Female', 'alice@email.com', '555-123-4567', '123 Main St', 'Anytown', 'CA', '12345', 'private', 'ABC123456', 165, 60), 130 | (2, 'Bob', 'Smith', '1978-11-23', '2023-01-10', 'Male', 'bob@email.com', '555-987-6543', '456 Oak Ave', 'Somecity', 'NY', '54321', 'medicare', 'XYZ789012', 180, 85), 131 | (3, 'Carol', 'Davis', '1992-07-08', '2022-01-03', 'Female', 'carol@email.com', '555-246-8135', '789 Elm Rd', 'Anothercity', 'TX', '67890', 'private', 'DEF345678', 158, 52), 132 | (4, 'David', 'Wilson', '1965-09-30', '2022-07-12', 'Male', 'david@email.com', '555-369-2580', '321 Pine Ln', 'Somewhere', 'FL', '13579', 'medicaid', 'GHI901234', 175, 78), 133 | (5, 'Eve', 'Brown', '2000-01-01', '2023-08-03', 'Female', 'eve@email.com', '555-147-2589', '654 Cedar St', 'Nowhere', 'WA', '97531', 'uninsured', NULL, 160, 55), 134 | (6, 'Frank', 'Taylor', '1988-05-12', '2021-12-21', 'Male', 'frank@email.com', '555-753-9514', '987 Birch Dr', 'Anyplace', 'CO', '24680', 'private', 'JKL567890', 183, 90), 135 | (7, 'Grace', 'Anderson', '1975-12-25', '2023-09-04', 'Others', 'grace@email.com', 
'555-951-7532', '159 Maple Rd', 'Somewhere', 'OH', '86420', 'medicare', 'MNO246810', 170, 68), 136 | (8, 'Hannah', 'Garcia', '1982-08-05', '2023-03-23', 'Female', 'hannah@email.com', '555-369-1470', '753 Walnut Ave', 'Somewhere', 'CA', '97531', 'private', 'PQR135790', 162, 57), 137 | (9, 'Isaac', 'Martinez', '1995-02-18', '2021-11-13', 'Male', 'isaac@email.com', '555-147-8520', '951 Spruce Blvd', 'Anytown', 'TX', '13579', 'medicaid', 'STU024680', 178, 82), 138 | (10, 'John', 'Richter', '1980-01-01', '2021-11-24', 'Male', 'john@qwik.com', '555-123-4567', '123 Main St', 'Anytown', 'CA', '12345', 'private', 'ABC123456', 180, 80), 139 | (11, 'Kelly', 'Smith', '1985-05-15', '2024-02-28', 'Female', 'kelly@fsda.org', '555-987-6543', '456 Oak Ave', 'Somecity', 'NY', '54321', 'medicare', 'XYZ789012', 165, 60); 140 | 141 | 142 | 143 | INSERT INTO drugs (drug_id, drug_name, manufacturer, drug_type, moa, fda_appr_dt, admin_route, dos_amt, dos_unit, dos_freq_hrs, ndc) 144 | VALUES 145 | (1, 'Drugalin', 'Pharma Inc', 'biologic', 'TNF-alpha inhibitor', '2010-01-15', 'injection', 40, 'mg', 336, '12345-678-90'), 146 | (2, 'Medicol', 'Acme Pharma', 'small molecule', 'IL-17A inhibitor', '2015-06-30', 'oral', 30, 'mg', 24, '54321-012-34'), 147 | (3, 'Topizol', 'BioMed Ltd', 'topical', 'PDE4 inhibitor', '2018-11-01', 'topical', 15, 'g', 12, '98765-432-10'), 148 | (4, 'Biologic-X', 'Innova Biologics', 'biologic', 'IL-23 inhibitor', NULL, 'injection', 100, 'mg', 672, '13579-246-80'), 149 | (5, 'Smallazine', 'Chem Co', 'small molecule', 'JAK inhibitor', '2020-03-15', 'oral', 5, 'mg', 24, '97531-864-20'), 150 | (6, 'Topicort', 'Derma Rx', 'topical', 'Corticosteroid', '2005-09-30', 'topical', 30, 'g', 12, '24680-135-79'), 151 | (7, 'Biologic-Y', 'BioPharm Inc', 'biologic', 'IL-12/23 inhibitor', '2012-07-01', 'injection', 50, 'mg', 504, '75319-951-46'), 152 | (8, 'Smallitol', 'PharmaGen', 'small molecule', 'IL-6 inhibitor', '2017-04-15', 'oral', 10, 'mg', 24, '36915-258-07'), 153 | (9, 
'Topicalin', 'DermiCare', 'topical', 'Calcineurin inhibitor', '2019-10-01', 'topical', 20, 'g', 12, '14785-369-02'), 154 | (10, 'Biologic-Z', 'BioMed Ltd', 'biologic', 'IL-17F inhibitor', '2021-01-01', 'injection', 80, 'mg', 336, '95146-753-19'); 155 | 156 | INSERT INTO diagnoses (diag_id, diag_code, diag_name, diag_desc) 157 | VALUES 158 | (1, 'L40.0', 'Psoriasis vulgaris', 'Plaque psoriasis, the most common form'), 159 | (2, 'L40.1', 'Generalized pustular psoriasis', 'Widespread pustules on top of red skin'), 160 | (3, 'L40.4', 'Guttate psoriasis', 'Small, teardrop-shaped lesions'), 161 | (4, 'L40.8', 'Other psoriasis', 'Includes flexural, erythrodermic, and other rare types'), 162 | (5, 'L40.9', 'Psoriasis, unspecified', 'Psoriasis not further specified'), 163 | (6, 'L40.50', 'Arthropathic psoriasis, unspecified', 'Psoriatic arthritis, unspecified'), 164 | (7, 'L40.51', 'Distal interphalangeal psoriatic arthropathy', 'Psoriatic arthritis mainly affecting the ends of fingers and toes'), 165 | (8, 'L40.52', 'Psoriatic arthritis mutilans', 'Severe, deforming psoriatic arthritis'), 166 | (9, 'L40.53', 'Psoriatic spondylitis', 'Psoriatic arthritis of the spine'), 167 | (10, 'L40.59', 'Other psoriatic arthropathy', 'Other specified types of psoriatic arthritis'); 168 | 169 | -- insert into fact tables 170 | INSERT INTO treatments (treatment_id, patient_id, doc_id, drug_id, diag_id, start_dt, end_dt, is_placebo, tot_drug_amt, drug_unit) 171 | VALUES 172 | (1, 1, 1, 1, 1, '2022-01-01', '2022-06-30', false, 240, 'mg'), 173 | (2, 2, 2, 2, 2, '2022-02-15', '2022-08-14', true, 180, 'mg'), 174 | (3, 3, 3, 3, 3, '2022-03-10', '2022-09-09', false, 360, 'g'), 175 | (4, 4, 4, 4, 4, '2022-04-01', NULL, false, 200, 'mg'), 176 | (5, 5, 5, 5, 5, '2022-05-01', '2022-10-31', false, 180, 'mg'), 177 | (6, 6, 6, 6, 6, '2022-06-15', '2022-12-14', false, 720, 'g'), 178 | (7, 1, 7, 1, 7, '2022-07-01', '2022-12-31', true, 240, 'mg'), 179 | (8, 2, 1, 2, 8, '2022-08-01', '2023-01-31', false, 
180, 'mg'), 180 | (9, 3, 2, 3, 9, '2022-09-01', '2023-02-28', false, 360, 'g'), 181 | (10, 4, 3, 4, 10, '2022-10-01', NULL, true, 0, NULL), 182 | (11, 5, 4, 5, 1, '2022-11-01', '2023-04-30', true, 180, 'mg'), 183 | (12, 6, 5, 6, 2, '2022-12-01', '2023-05-31', false, 720, 'g'), 184 | (13, 7, 6, 1, 3, '2023-01-01', '2023-06-30', false, 240, 'mg'), 185 | (14, 1, 7, 2, 4, '2023-02-01', '2023-07-31', false, 180, 'mg'), 186 | (15, 2, 1, 3, 5, '2023-03-01', '2023-08-31', false, 360, 'g'), 187 | (16, 1, 2, 4, 6, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '2 year', DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '2 months', false, 300, 'mg'), 188 | (17, 2, 5, 1, 8, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 year', DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '4 months', false, 80, 'mg'), 189 | (18, 3, 6, 2, 9, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '5 months', NULL, true, 200, 'mg'), 190 | (19, 1, 7, 3, 10, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '4 months', NULL, false, 150, 'g'), 191 | (20, 2, 1, 4, 1, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '3 months', NULL, false, 100, 'mg'), 192 | (21, 3, 2, 5, 2, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '2 months', NULL, false, 250, 'mg'), 193 | (22, 1, 3, 6, 3, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month', NULL, false, 300, 'g'), 194 | (23, 2, 4, 1, 4, CURRENT_DATE, NULL, true, 200, 'mg'), 195 | (24, 3, 5, 2, 5, CURRENT_DATE, NULL, false, 150, 'mg'), 196 | (25, 9, 1, 1, 1, CURRENT_DATE - INTERVAL '6 months', CURRENT_DATE - INTERVAL '3 months', false, 240, 'mg'), 197 | (26, 10, 2, 2, 2, CURRENT_DATE - INTERVAL '5 months', CURRENT_DATE - INTERVAL '2 months', false, 180, 'mg'); 198 | 199 | INSERT INTO outcomes (outcome_id, treatment_id, assess_dt, day7_lesion_cnt, day30_lesion_cnt, day100_lesion_cnt, day7_pasi_score, day30_pasi_score, day100_pasi_score, day7_tewl, day30_tewl, day100_tewl, day7_itch_vas, day30_itch_vas, day100_itch_vas, day7_hfg, day30_hfg, day100_hfg) 200 | VALUES 201 | (1, 1, 
'2022-01-08', 20, 15, 5, 12.5, 8.2, 2.1, 18.2, 15.6, 12.1, 60, 40, 20, 1.5, 2.5, 4.0), 202 | (2, 2, '2022-02-22', 25, 18, 8, 15.0, 10.1, 3.5, 20.1, 17.2, 13.5, 70, 50, 30, 1.0, 2.0, 3.5), 203 | (3, 3, '2022-03-17', 18, 12, 3, 10.8, 6.4, 1.2, 16.5, 14.0, 10.8, 55, 35, 15, 2.0, 3.0, 4.5), 204 | (4, 4, '2022-04-08', 30, 25, 12, 18.2, 13.9, 5.8, 22.4, 19.1, 15.2, 80, 60, 40, 0.5, 1.5, 3.0), 205 | (5, 5, '2022-05-08', 22, 16, 6, 13.1, 8.7, 2.6, 19.0, 16.3, 12.7, 65, 45, 25, 1.2, 2.2, 3.8), 206 | (6, 6, '2022-06-22', 28, 21, 10, 16.7, 11.5, 4.3, 21.3, 18.1, 14.3, 75, 55, 35, 0.8, 1.8, 3.3), 207 | (7, 7, '2022-07-08', 19, 13, 4, 11.2, 6.9, 1.5, 17.1, 14.5, 11.2, 58, 38, 18, 1.8, 2.8, 4.3), 208 | (8, 8, '2022-08-08', 26, 19, 9, 15.6, 10.6, 3.8, 20.7, 17.6, 13.9, 72, 52, 32, 0.7, 1.7, 3.2), 209 | (9, 9, '2022-09-08', 21, 15, 5, 12.3, 8.0, 2.0, 18.6, 15.9, 12.4, 62, 42, 22, 1.4, 2.4, 3.9), 210 | (10, 10, '2022-10-08', 32, 30, 25, 19.5, 17.8, 14.1, 23.2, 21.4, 18.7, 85, 80, 70, 0.2, 0.4, 0.8), 211 | (11, 11, '2022-11-08', 23, 17, 7, 13.7, 9.2, 2.9, 19.5, 16.8, 13.1, 68, 48, 28, 1.1, 2.1, 3.6), 212 | (12, 12, '2022-12-08', 29, 23, 11, 17.4, 12.3, 4.9, 21.8, 18.7, 14.8, 78, 58, 38, 0.6, 1.6, 3.1), 213 | (13, 13, '2023-01-08', 18, 12, 3, 10.5, 6.1, 1.0, 16.9, 14.3, 11.0, 56, 36, 16, 1.9, 2.9, 4.4), 214 | (14, 14, '2023-02-08', 27, 20, 10, 16.2, 11.1, 4.1, 21.0, 17.9, 14.1, 74, 54, 34, 0.5, 1.5, 3.0), 215 | (15, 15, '2023-03-08', 20, 14, 4, 11.8, 7.3, 1.7, 17.8, 15.2, 11.8, 60, 40, 20, 1.6, 2.6, 4.1), 216 | (16, 16, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '5 months' + INTERVAL '7 days', 24, 18, 8, 14.4, 9.6, 3.2, 20.4, 17.4, 13.7, 70, 50, 30, 0.9, 1.9, 3.4), 217 | (17, 17, DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' + INTERVAL '7 days', 22, 16, NULL, 13.2, 8.8, NULL, 19.1, 16.3, NULL, 65, 45, NULL, 1.3, 2.3, NULL), 218 | (18, 25, CURRENT_DATE - INTERVAL '6 months' + INTERVAL '7 days', 30, NULL, NULL, 18.0, NULL, NULL, 22.0, NULL, NULL, 80, NULL, NULL, 1.0, NULL, 
NULL), 219 | (19, 25, CURRENT_DATE - INTERVAL '2 months', 30, 18, 10, 18.0, 12.0, 4.0, 22.0, 19.0, 15.0, 80, 60, 40, 1.0, 2.0, 3.0), 220 | (20, 26, CURRENT_DATE - INTERVAL '5 months' + INTERVAL '7 days', 25, NULL, NULL, 15.0, NULL, NULL, 20.0, NULL, NULL, 75, NULL, NULL, 0.5, NULL, NULL), 221 | (21, 26, CURRENT_DATE - INTERVAL '1 month', 25, 18, 10, 15.0, 10.0, 5.0, 20.0, 17.0, 13.0, 75, 55, 35, 0.5, 1.5, 3.0); 222 | 223 | INSERT INTO adverse_events (id, treatment_id, reported_dt, description) 224 | VALUES 225 | (1, 1, '2022-01-15', 'Mild injection site reaction'), 226 | (2, 2, '2022-02-28', 'Headache, nausea'), 227 | (3, 4, '2022-04-10', 'Severe allergic reaction, hospitalization required'), 228 | (4, 5, '2022-05-20', 'Upper respiratory infection'), 229 | (5, 7, '2022-07-22', 'Mild injection site reaction'), 230 | (6, 9, '2022-09-18', 'Diarrhea'), 231 | (7, 11, '2022-11-12', 'Elevated liver enzymes'), 232 | (8, 14, '2023-02-05', 'Mild skin rash'); 233 | 234 | INSERT INTO concomitant_meds (id, treatment_id, med_name, start_dt, end_dt, dose_amt, dose_unit, freq_hrs) 235 | VALUES 236 | (1, 1, 'Acetaminophen', '2022-01-01', '2022-01-07', 500, 'mg', 6), 237 | (2, 1, 'Ibuprofen', '2022-01-08', '2022-01-14', 200, 'mg', 8), 238 | (3, 2, 'Loratadine', '2022-02-15', '2022-03-15', 10, 'mg', 24), 239 | (4, 3, 'Multivitamin', '2022-03-10', NULL, 1, 'tablet', 24), 240 | (5, 4, 'Epinephrine', '2022-04-10', '2022-04-10', 0.3, 'mg', NULL), 241 | (6, 4, 'Diphenhydramine', '2022-04-10', '2022-04-17', 50, 'mg', 6), 242 | (7, 5, 'Amoxicillin', '2022-05-20', '2022-05-30', 500, 'mg', 8), 243 | (8, 6, 'Calcium supplement', '2022-06-15', NULL, 600, 'mg', 24), 244 | (9, 7, 'Acetaminophen', '2022-07-15', '2022-07-21', 500, 'mg', 6), 245 | (10, 8, 'Cetirizine', '2022-08-01', '2022-08-14', 10, 'mg', 24), 246 | (11, 9, 'Loperamide', '2022-09-18', '2022-09-20', 4, 'mg', 6), 247 | (12, 11, 'Ursodiol', '2022-11-30', '2022-12-30', 300, 'mg', 8), 248 | (13, 12, 'Vitamin D', '2022-12-01', NULL, 
1000, 'IU', 24), 249 | (14, 13, 'Acetaminophen', '2023-01-08', '2023-01-14', 500, 'mg', 6), 250 | (15, 14, 'Hydrocortisone cream', '2023-02-25', '2023-03-07', 10, 'g', 12); 251 | -------------------------------------------------------------------------------- /defog_data/ewallet/ewallet.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "consumer_div.users": [ 4 | { 5 | "data_type": "BIGINT", 6 | "column_name": "uid", 7 | "column_description": "" 8 | }, 9 | { 10 | "data_type": "VARCHAR(50)", 11 | "column_name": "username", 12 | "column_description": "" 13 | }, 14 | { 15 | "data_type": "VARCHAR(100)", 16 | "column_name": "email", 17 | "column_description": "" 18 | }, 19 | { 20 | "data_type": "VARCHAR(20)", 21 | "column_name": "phone_number", 22 | "column_description": "" 23 | }, 24 | { 25 | "data_type": "TIMESTAMP", 26 | "column_name": "created_at", 27 | "column_description": "" 28 | }, 29 | { 30 | "data_type": "TIMESTAMP", 31 | "column_name": "last_login_at", 32 | "column_description": "" 33 | }, 34 | { 35 | "data_type": "VARCHAR(20)", 36 | "column_name": "user_type", 37 | "column_description": "possible values: individual, business, admin" 38 | }, 39 | { 40 | "data_type": "VARCHAR(20)", 41 | "column_name": "status", 42 | "column_description": "possible values: active, inactive, suspended, deleted" 43 | }, 44 | { 45 | "data_type": "VARCHAR(2)", 46 | "column_name": "country", 47 | "column_description": "2-letter country code" 48 | }, 49 | { 50 | "data_type": "TEXT", 51 | "column_name": "address_billing", 52 | "column_description": "" 53 | }, 54 | { 55 | "data_type": "TEXT", 56 | "column_name": "address_delivery", 57 | "column_description": "" 58 | }, 59 | { 60 | "data_type": "VARCHAR(20)", 61 | "column_name": "kyc_status", 62 | "column_description": "possible values: pending, approved, rejected" 63 | }, 64 | { 65 | "data_type": "TIMESTAMP", 66 | "column_name": "kyc_verified_at", 67 | 
"column_description": "" 68 | } 69 | ], 70 | "consumer_div.merchants": [ 71 | { 72 | "data_type": "BIGINT", 73 | "column_name": "mid", 74 | "column_description": "" 75 | }, 76 | { 77 | "data_type": "VARCHAR(100)", 78 | "column_name": "name", 79 | "column_description": "" 80 | }, 81 | { 82 | "data_type": "TEXT", 83 | "column_name": "description", 84 | "column_description": "" 85 | }, 86 | { 87 | "data_type": "VARCHAR(200)", 88 | "column_name": "website_url", 89 | "column_description": "" 90 | }, 91 | { 92 | "data_type": "VARCHAR(200)", 93 | "column_name": "logo_url", 94 | "column_description": "" 95 | }, 96 | { 97 | "data_type": "TIMESTAMP", 98 | "column_name": "created_at", 99 | "column_description": "" 100 | }, 101 | { 102 | "data_type": "VARCHAR(2)", 103 | "column_name": "country", 104 | "column_description": "2-letter country code" 105 | }, 106 | { 107 | "data_type": "VARCHAR(50)", 108 | "column_name": "state", 109 | "column_description": "" 110 | }, 111 | { 112 | "data_type": "VARCHAR(50)", 113 | "column_name": "city", 114 | "column_description": "" 115 | }, 116 | { 117 | "data_type": "VARCHAR(20)", 118 | "column_name": "postal_code", 119 | "column_description": "" 120 | }, 121 | { 122 | "data_type": "TEXT", 123 | "column_name": "address", 124 | "column_description": "" 125 | }, 126 | { 127 | "data_type": "VARCHAR(20)", 128 | "column_name": "status", 129 | "column_description": "possible values: active, inactive, suspended" 130 | }, 131 | { 132 | "data_type": "VARCHAR(50)", 133 | "column_name": "category", 134 | "column_description": "" 135 | }, 136 | { 137 | "data_type": "VARCHAR(50)", 138 | "column_name": "sub_category", 139 | "column_description": "" 140 | }, 141 | { 142 | "data_type": "INT", 143 | "column_name": "mcc", 144 | "column_description": "Merchant Category Code" 145 | }, 146 | { 147 | "data_type": "VARCHAR(100)", 148 | "column_name": "contact_name", 149 | "column_description": "" 150 | }, 151 | { 152 | "data_type": "VARCHAR(100)", 153 | 
"column_name": "contact_email", 154 | "column_description": "" 155 | }, 156 | { 157 | "data_type": "VARCHAR(20)", 158 | "column_name": "contact_phone", 159 | "column_description": "" 160 | } 161 | ], 162 | "consumer_div.coupons": [ 163 | { 164 | "data_type": "BIGINT", 165 | "column_name": "cid", 166 | "column_description": "Unique identifier for the coupon" 167 | }, 168 | { 169 | "data_type": "BIGINT", 170 | "column_name": "merchant_id", 171 | "column_description": "ID of merchant that issued the coupon" 172 | }, 173 | { 174 | "data_type": "VARCHAR(20)", 175 | "column_name": "code", 176 | "column_description": "" 177 | }, 178 | { 179 | "data_type": "TEXT", 180 | "column_name": "description", 181 | "column_description": "" 182 | }, 183 | { 184 | "data_type": "DATE", 185 | "column_name": "start_date", 186 | "column_description": "" 187 | }, 188 | { 189 | "data_type": "DATE", 190 | "column_name": "end_date", 191 | "column_description": "" 192 | }, 193 | { 194 | "data_type": "VARCHAR(20)", 195 | "column_name": "discount_type", 196 | "column_description": "possible values: percentage, fixed_amount" 197 | }, 198 | { 199 | "data_type": "DECIMAL(10,2)", 200 | "column_name": "discount_value", 201 | "column_description": "" 202 | }, 203 | { 204 | "data_type": "DECIMAL(10,2)", 205 | "column_name": "min_purchase_amount", 206 | "column_description": "" 207 | }, 208 | { 209 | "data_type": "DECIMAL(10,2)", 210 | "column_name": "max_discount_amount", 211 | "column_description": "" 212 | }, 213 | { 214 | "data_type": "INT", 215 | "column_name": "redemption_limit", 216 | "column_description": "" 217 | }, 218 | { 219 | "data_type": "VARCHAR(20)", 220 | "column_name": "status", 221 | "column_description": "possible values: active, inactive, expired" 222 | }, 223 | { 224 | "data_type": "TIMESTAMP", 225 | "column_name": "created_at", 226 | "column_description": "" 227 | }, 228 | { 229 | "data_type": "TIMESTAMP", 230 | "column_name": "updated_at", 231 | "column_description": "" 232 | } 
233 | ], 234 | "consumer_div.wallet_transactions_daily": [ 235 | { 236 | "data_type": "SERIAL", 237 | "column_name": "txid", 238 | "column_description": "" 239 | }, 240 | { 241 | "data_type": "BIGINT", 242 | "column_name": "sender_id", 243 | "column_description": "" 244 | }, 245 | { 246 | "data_type": "INT", 247 | "column_name": "sender_type", 248 | "column_description": "0 for user, 1 for merchant" 249 | }, 250 | { 251 | "data_type": "BIGINT", 252 | "column_name": "receiver_id", 253 | "column_description": "" 254 | }, 255 | { 256 | "data_type": "INT", 257 | "column_name": "receiver_type", 258 | "column_description": "0 for user, 1 for merchant" 259 | }, 260 | { 261 | "data_type": "DECIMAL(10,2)", 262 | "column_name": "amount", 263 | "column_description": "" 264 | }, 265 | { 266 | "data_type": "VARCHAR(20)", 267 | "column_name": "status", 268 | "column_description": "possible values: pending, success, failed, refunded" 269 | }, 270 | { 271 | "data_type": "VARCHAR(20)", 272 | "column_name": "type", 273 | "column_description": "possible values: credit, debit" 274 | }, 275 | { 276 | "data_type": "TEXT", 277 | "column_name": "description", 278 | "column_description": "" 279 | }, 280 | { 281 | "data_type": "BIGINT", 282 | "column_name": "coupon_id", 283 | "column_description": "NULL if transaction doesn't involve a coupon" 284 | }, 285 | { 286 | "data_type": "TIMESTAMP", 287 | "column_name": "created_at", 288 | "column_description": "" 289 | }, 290 | { 291 | "data_type": "TIMESTAMP", 292 | "column_name": "completed_at", 293 | "column_description": "NULL if failed" 294 | }, 295 | { 296 | "data_type": "VARCHAR(36)", 297 | "column_name": "transaction_ref", 298 | "column_description": "randomly generated uuid4 for users' reference" 299 | }, 300 | { 301 | "data_type": "VARCHAR(50)", 302 | "column_name": "gateway_name", 303 | "column_description": "" 304 | }, 305 | { 306 | "data_type": "VARCHAR(50)", 307 | "column_name": "gateway_ref", 308 | "column_description": "" 309 | }, 
310 | { 311 | "data_type": "VARCHAR(50)", 312 | "column_name": "device_id", 313 | "column_description": "" 314 | }, 315 | { 316 | "data_type": "VARCHAR(50)", 317 | "column_name": "ip_address", 318 | "column_description": "" 319 | }, 320 | { 321 | "data_type": "TEXT", 322 | "column_name": "user_agent", 323 | "column_description": "" 324 | } 325 | ], 326 | "consumer_div.wallet_user_balance_daily": [ 327 | { 328 | "data_type": "BIGINT", 329 | "column_name": "user_id", 330 | "column_description": "" 331 | }, 332 | { 333 | "data_type": "DECIMAL(10,2)", 334 | "column_name": "balance", 335 | "column_description": "" 336 | }, 337 | { 338 | "data_type": "TIMESTAMP", 339 | "column_name": "updated_at", 340 | "column_description": "" 341 | } 342 | ], 343 | "consumer_div.wallet_merchant_balance_daily": [ 344 | { 345 | "data_type": "BIGINT", 346 | "column_name": "merchant_id", 347 | "column_description": "" 348 | }, 349 | { 350 | "data_type": "DECIMAL(10,2)", 351 | "column_name": "balance", 352 | "column_description": "" 353 | }, 354 | { 355 | "data_type": "TIMESTAMP", 356 | "column_name": "updated_at", 357 | "column_description": "" 358 | } 359 | ], 360 | "consumer_div.notifications": [ 361 | { 362 | "data_type": "SERIAL", 363 | "column_name": "id", 364 | "column_description": "" 365 | }, 366 | { 367 | "data_type": "BIGINT", 368 | "column_name": "user_id", 369 | "column_description": "" 370 | }, 371 | { 372 | "data_type": "TEXT", 373 | "column_name": "message", 374 | "column_description": "" 375 | }, 376 | { 377 | "data_type": "VARCHAR(50)", 378 | "column_name": "type", 379 | "column_description": "possible values: transaction, promotion, security, general" 380 | }, 381 | { 382 | "data_type": "VARCHAR(20)", 383 | "column_name": "status", 384 | "column_description": "possible values: unread, read, archived" 385 | }, 386 | { 387 | "data_type": "TIMESTAMP", 388 | "column_name": "created_at", 389 | "column_description": "" 390 | }, 391 | { 392 | "data_type": "TIMESTAMP", 393 | 
"column_name": "read_at", 394 | "column_description": "NULL if not read" 395 | }, 396 | { 397 | "data_type": "VARCHAR(10)", 398 | "column_name": "device_type", 399 | "column_description": "possible values: mobile_app, web_app, email, sms" 400 | }, 401 | { 402 | "data_type": "VARCHAR(36)", 403 | "column_name": "device_id", 404 | "column_description": "" 405 | }, 406 | { 407 | "data_type": "TEXT", 408 | "column_name": "action_url", 409 | "column_description": "can be external https or deeplink url within the app" 410 | } 411 | ], 412 | "consumer_div.user_sessions": [ 413 | { 414 | "data_type": "BIGINT", 415 | "column_name": "user_id", 416 | "column_description": "" 417 | }, 418 | { 419 | "data_type": "TIMESTAMP", 420 | "column_name": "session_start_ts", 421 | "column_description": "" 422 | }, 423 | { 424 | "data_type": "TIMESTAMP", 425 | "column_name": "session_end_ts", 426 | "column_description": "" 427 | }, 428 | { 429 | "data_type": "VARCHAR(10)", 430 | "column_name": "device_type", 431 | "column_description": "possible values: mobile_app, web_app, email, sms" 432 | }, 433 | { 434 | "data_type": "VARCHAR(36)", 435 | "column_name": "device_id", 436 | "column_description": "" 437 | } 438 | ], 439 | "consumer_div.user_setting_snapshot": [ 440 | { 441 | "data_type": "BIGINT", 442 | "column_name": "user_id", 443 | "column_description": "" 444 | }, 445 | { 446 | "data_type": "DATE", 447 | "column_name": "snapshot_date", 448 | "column_description": "" 449 | }, 450 | { 451 | "data_type": "DECIMAL(10,2)", 452 | "column_name": "tx_limit_daily", 453 | "column_description": "" 454 | }, 455 | { 456 | "data_type": "DECIMAL(10,2)", 457 | "column_name": "tx_limit_monthly", 458 | "column_description": "" 459 | }, 460 | { 461 | "data_type": "INTEGER", 462 | "column_name": "membership_status", 463 | "column_description": "0 for bronze, 1 for silver, 2 for gold, 3 for platinum, 4 for VIP" 464 | }, 465 | { 466 | "data_type": "VARCHAR(255)", 467 | "column_name": "password_hash", 468 | 
"column_description": "" 469 | }, 470 | { 471 | "data_type": "VARCHAR(255)", 472 | "column_name": "api_key", 473 | "column_description": "" 474 | }, 475 | { 476 | "data_type": "TEXT", 477 | "column_name": "verified_devices", 478 | "column_description": "comma separated list of device ids" 479 | }, 480 | { 481 | "data_type": "TEXT", 482 | "column_name": "verified_ips", 483 | "column_description": "comma separated list of IP addresses" 484 | }, 485 | { 486 | "data_type": "BOOLEAN", 487 | "column_name": "mfa_enabled", 488 | "column_description": "" 489 | }, 490 | { 491 | "data_type": "BOOLEAN", 492 | "column_name": "marketing_opt_in", 493 | "column_description": "" 494 | }, 495 | { 496 | "data_type": "TIMESTAMP", 497 | "column_name": "created_at", 498 | "column_description": "" 499 | } 500 | ] 501 | }, 502 | "glossary": "- sender_id and receiver_id can be joined with either users.uid or merchants.mid depending on the sender_type/receiver_type\n- if a user applied a coupon to a purchase, there will be 2 rows in wallet_transactions_daily:\n - 1st row where coupon_id is NULL, amount = purchase value \n - 2nd row where coupon_id is NOT NULL, amount = coupon value applied\n - the sender and receiver id will be the same for both rows, but they will have different txid's \n- when using coupons.code, wallet_transactions_daily.gateway_name, filter case insensitively\n- Total Transaction Volume (TTV) = SUM(wallet_transactions_daily.amount)\n- Total Coupon Discount Redeemed (TCDR) = SUM(wallet_transactions_daily.amount) WHERE coupon_id IS NOT NULL\n- Session Density = COUNT(user_sessions.user_id) / COUNT(DISTINCT user_sessions.user_id)\n- Active Merchants Percentage (APM) = COUNT(DISTINCT CASE WHEN sender_type = 1 THEN wallet_transactions_daily.sender_id WHEN receiver_type = 1 THEN wallet_transactions_daily.receiver_id ELSE NULL END) / COUNT(DISTINCT merchants.mid)" 503 | } -------------------------------------------------------------------------------- 
/defog_data/geography/geography.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "city": [ 4 | { 5 | "data_type": "bigint", 6 | "column_name": "population", 7 | "column_description": "The population of the city" 8 | }, 9 | { 10 | "data_type": "text", 11 | "column_name": "city_name", 12 | "column_description": "The name of the city" 13 | }, 14 | { 15 | "data_type": "text", 16 | "column_name": "country_name", 17 | "column_description": "The name of the country where the city is located" 18 | }, 19 | { 20 | "data_type": "text", 21 | "column_name": "state_name", 22 | "column_description": "The name of the state where the city is located" 23 | } 24 | ], 25 | "lake": [ 26 | { 27 | "data_type": "double precision", 28 | "column_name": "area", 29 | "column_description": "The area of the lake in square kilometers" 30 | }, 31 | { 32 | "data_type": "text", 33 | "column_name": "lake_name", 34 | "column_description": "The name of the lake" 35 | }, 36 | { 37 | "data_type": "text", 38 | "column_name": "country_name", 39 | "column_description": "The name of the country where the lake is located" 40 | }, 41 | { 42 | "data_type": "text", 43 | "column_name": "state_name", 44 | "column_description": "The name of the state where the lake is located (if applicable)" 45 | } 46 | ], 47 | "river": [ 48 | { 49 | "data_type": "bigint", 50 | "column_name": "length", 51 | "column_description": "The length of the river in meters" 52 | }, 53 | { 54 | "data_type": "text", 55 | "column_name": "river_name", 56 | "column_description": "The name of the river. Names exclude the word 'river' e.g. 
'Mississippi' instead of 'Mississippi River'" 57 | }, 58 | { 59 | "data_type": "text", 60 | "column_name": "country_name", 61 | "column_description": "The name of the country the river flows through" 62 | }, 63 | { 64 | "data_type": "text", 65 | "column_name": "traverse", 66 | "column_description": "The cities or landmarks the river passes through. Comma delimited and in title case, eg `New York,Albany,Boston`" 67 | } 68 | ], 69 | "state": [ 70 | { 71 | "data_type": "bigint", 72 | "column_name": "population", 73 | "column_description": "The population of the state" 74 | }, 75 | { 76 | "data_type": "double precision", 77 | "column_name": "area", 78 | "column_description": "The area of the state in square kilometers" 79 | }, 80 | { 81 | "data_type": "double precision", 82 | "column_name": "density", 83 | "column_description": "The population density of the state in people per square kilometer" 84 | }, 85 | { 86 | "data_type": "text", 87 | "column_name": "state_name", 88 | "column_description": "The name of the state" 89 | }, 90 | { 91 | "data_type": "text", 92 | "column_name": "country_name", 93 | "column_description": "The name of the country the state belongs to" 94 | }, 95 | { 96 | "data_type": "text", 97 | "column_name": "capital", 98 | "column_description": "The name of the capital city of the state" 99 | } 100 | ], 101 | "highlow": [ 102 | { 103 | "data_type": "text", 104 | "column_name": "state_name", 105 | "column_description": "The name of the state" 106 | }, 107 | { 108 | "data_type": "text", 109 | "column_name": "highest_elevation", 110 | "column_description": "The highest elevation point in the state in meters above sea level" 111 | }, 112 | { 113 | "data_type": "text", 114 | "column_name": "lowest_point", 115 | "column_description": "The lowest elevation point in the state" 116 | }, 117 | { 118 | "data_type": "text", 119 | "column_name": "highest_point", 120 | "column_description": "The highest point in the state. If unknown, use 'Unnamed location'." 
121 | }, 122 | { 123 | "data_type": "text", 124 | "column_name": "lowest_elevation", 125 | "column_description": "The lowest point in the state in meters above sea level" 126 | } 127 | ], 128 | "mountain": [ 129 | { 130 | "data_type": "bigint", 131 | "column_name": "mountain_altitude", 132 | "column_description": "The altitude of the mountain in meters" 133 | }, 134 | { 135 | "data_type": "text", 136 | "column_name": "mountain_name", 137 | "column_description": "The name of the mountain" 138 | }, 139 | { 140 | "data_type": "text", 141 | "column_name": "country_name", 142 | "column_description": "The name of the country where the mountain is located" 143 | }, 144 | { 145 | "data_type": "text", 146 | "column_name": "state_name", 147 | "column_description": "The name of the state or province where the mountain is located (if applicable)" 148 | } 149 | ], 150 | "border_info": [ 151 | { 152 | "data_type": "text", 153 | "column_name": "state_name", 154 | "column_description": "The name of the state that shares a border with another state or country." 155 | }, 156 | { 157 | "data_type": "text", 158 | "column_name": "border", 159 | "column_description": "The name of the state that shares a border with the state specified in the state_name column." 
160 | } 161 | ] 162 | }, 163 | "glossary": "" 164 | } -------------------------------------------------------------------------------- /defog_data/geography/geography.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE public.border_info ( 3 | state_name text, 4 | border text 5 | ); 6 | 7 | 8 | CREATE TABLE public.city ( 9 | city_name text, 10 | population bigint, 11 | country_name text DEFAULT ''::text NOT NULL, 12 | state_name text 13 | ); 14 | 15 | 16 | CREATE TABLE public.highlow ( 17 | state_name text, 18 | highest_elevation text, 19 | lowest_point text, 20 | highest_point text, 21 | lowest_elevation text 22 | ); 23 | 24 | 25 | CREATE TABLE public.lake ( 26 | lake_name text, 27 | area double precision, 28 | country_name text DEFAULT ''::text NOT NULL, 29 | state_name text 30 | ); 31 | 32 | 33 | CREATE TABLE public.mountain ( 34 | mountain_name text, 35 | mountain_altitude bigint, 36 | country_name text DEFAULT ''::text NOT NULL, 37 | state_name text 38 | ); 39 | 40 | 41 | CREATE TABLE public.river ( 42 | river_name text, 43 | length bigint, 44 | country_name text DEFAULT ''::text NOT NULL, 45 | traverse text 46 | ); 47 | 48 | 49 | CREATE TABLE public.state ( 50 | state_name text, 51 | population bigint, 52 | area double precision, 53 | country_name text DEFAULT ''::text NOT NULL, 54 | capital text, 55 | density double precision 56 | ); 57 | 58 | 59 | INSERT INTO public.border_info (state_name, border) VALUES 60 | ('California', 'Nevada'), 61 | ('California', 'Arizona'), 62 | ('California', 'Oregon'), 63 | ('Texas', 'Louisiana'), 64 | ('Texas', 'Oklahoma'), 65 | ('Texas', 'New Mexico'), 66 | ('Florida', 'Alabama'), 67 | ('Florida', 'Georgia'), 68 | ('Florida', 'Atlantic Ocean'), 69 | ('New York', 'Pennsylvania'), 70 | ('New York', 'Connecticut'), 71 | ('New York', 'Massachusetts') 72 | ; 73 | 74 | INSERT INTO public.city (city_name, population, country_name, state_name) VALUES 75 | ('New York', 1000000, 
'United States', 'New York'), 76 | ('Los Angeles', 5000000, 'United States', 'California'), 77 | ('Chicago', 1500000, 'United States', 'Illinois'), 78 | ('Houston', 2000000, 'United States', 'Texas'), 79 | ('Toronto', 800000, 'Canada', 'Ontario'), 80 | ('Mexico City', 600000, 'Mexico', 'Distrito Federal'), 81 | ('Sao Paulo', 3000000, 'Brazil', 'Sao Paulo'), 82 | ('Mumbai', 1200000, 'India', 'Maharashtra'), 83 | ('London', 900000, 'United Kingdom', 'England'), 84 | ('Tokyo', 700000, 'Japan', 'Tokyo') 85 | ; 86 | 87 | INSERT INTO public.highlow (state_name, highest_elevation, lowest_point, highest_point, lowest_elevation) VALUES 88 | ('California', '4421', 'Death Valley', 'Mount Whitney', '-86'), 89 | ('Texas', '2667', 'Gulf of Mexico', 'Guadalupe Peak', '0'), 90 | ('Florida', NULL, 'Atlantic Ocean', 'Unnamed location', '0'), 91 | ('New York', '1629', 'Atlantic Ocean', 'Mount Marcy', '0'), 92 | ('Ontario', NULL, 'Atlantic Ocean', 'Unnamed location', '0'), 93 | ('Sao Paulo', NULL, 'Atlantic Ocean', 'Unnamed location', '0'), 94 | ('Guangdong', NULL, 'South China Sea', 'Unnamed location', '0'), 95 | ('Maharashtra', NULL, 'Arabian Sea', 'Unnamed location', '0'), 96 | ('England', '978', 'North Sea', 'Scafell Pike', '0'), 97 | ('Tokyo', '3776', 'Pacific Ocean', 'Mount Fuji', '0') 98 | ; 99 | 100 | INSERT INTO public.lake (lake_name, area, country_name, state_name) VALUES 101 | ('Lake Superior', 1000, 'United States', 'Michigan'), 102 | ('Lake Michigan', 500, 'United States', 'Michigan'), 103 | ('Lake Huron', 300, 'United States', 'Michigan'), 104 | ('Lake Erie', 200, 'United States', 'Ohio'), 105 | ('Lake Ontario', 400, 'United States', 'New York'), 106 | ('Lake Victoria', 800, 'Tanzania', NULL), 107 | ('Lake Tanganyika', 600, 'Tanzania', NULL), 108 | ('Lake Malawi', 700, 'Tanzania', NULL), 109 | ('Lake Baikal', 900, 'Russia', NULL), 110 | ('Lake Qinghai', 1200, 'China', NULL) 111 | ; 112 | 113 | INSERT INTO public.mountain (mountain_name, mountain_altitude, country_name, 
state_name) VALUES 114 | ('Mount Everest', 10000, 'Nepal', NULL), 115 | ('K2', 5000, 'Pakistan', NULL), 116 | ('Kangchenjunga', 3000, 'Nepal', NULL), 117 | ('Lhotse', 2000, 'Nepal', NULL), 118 | ('Makalu', 4000, 'Nepal', NULL), 119 | ('Cho Oyu', 8000, 'Nepal', NULL), 120 | ('Dhaulagiri', 6000, 'Nepal', NULL), 121 | ('Manaslu', 7000, 'Nepal', NULL), 122 | ('Nanga Parbat', 9000, 'Pakistan', NULL), 123 | ('Annapurna', 1000, 'Nepal', NULL) 124 | ; 125 | 126 | INSERT INTO public.river (river_name, length, country_name, traverse) VALUES 127 | ('Nile', 1000, 'Egypt', 'Cairo,Luxor,Aswan'), 128 | ('Amazon', 500, 'Brazil', 'Manaus,Belem'), 129 | ('Yangtze', 300, 'China', 'Shanghai,Wuhan,Chongqing'), 130 | ('Mississippi', 200, 'United States', 'New Orleans,Memphis,St. Louis'), 131 | ('Yukon', 400, 'Canada', 'Whitehorse,Dawson City'), 132 | ('Volga', 800, 'Russia', 'Moscow,Samara,Kazan'), 133 | ('Mekong', 600, 'Vietnam', 'Ho Chi Minh City,Phnom Penh'), 134 | ('Danube', 700, 'Germany', 'Passau,Vienna,Budapest'), 135 | ('Rhine', 900, 'Germany', 'Strasbourg,Frankfurt,Cologne'), 136 | ('Po', 100, 'Italy', 'Turin,Milan,Venice') 137 | ; 138 | 139 | INSERT INTO public.state (state_name, population, area, country_name, capital, density) VALUES 140 | ('California', 100000, 10000, 'United States', 'Sacramento', 1000), 141 | ('Texas', 50000, 5000, 'United States', 'Austin', 1000), 142 | ('Florida', 150000, 15000, 'United States', 'Tallahassee', 1000), 143 | ('New York', 200000, 20000, 'United States', 'Albany', 1000), 144 | ('Ontario', 80000, 8000, 'Canada', 'Toronto', 1000), 145 | ('Sao Paulo', 50000, 6000, 'Brazil', 'Sao Paulo', 1000), 146 | ('Guangdong', 200000, 30000, 'China', 'Guangzhou', 1000), 147 | ('Maharashtra', 200000, 12000, 'India', 'Mumbai', 1000), 148 | ('England', 9000, 10000, 'United Kingdom', 'London', 1000), 149 | ('Tokyo', 70000, 50000, 'Japan', 'Tokyo', 1000), 150 | ('Ohio', 90000, 11000, 'United States', 'Columbus', 1000), 151 | ('Michigan', 120000, 9000, 'United 
States', 'Lansing', 1000) 152 | ; 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /defog_data/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | script_dir = os.path.dirname(os.path.abspath(__file__)) 6 | logging.debug(f"script_dir: {script_dir}") 7 | 8 | 9 | def get_db(db_name): 10 | script_dir = os.path.dirname(os.path.abspath(__file__)) 11 | file_path = os.path.join(script_dir, f"{db_name}/{db_name}.json") 12 | with open(file_path, "r") as f: 13 | db_schema = json.load(f) 14 | return db_schema 15 | 16 | 17 | # sql-eval datasets 18 | academic = get_db("academic") 19 | advising = get_db("advising") 20 | atis = get_db("atis") 21 | geography = get_db("geography") 22 | restaurants = get_db("restaurants") 23 | scholar = get_db("scholar") 24 | yelp = get_db("yelp") 25 | 26 | # sql-eval-instruct datasets 27 | broker = get_db("broker") 28 | car_dealership = get_db("car_dealership") 29 | derm_treatment = get_db("derm_treatment") 30 | ewallet = get_db("ewallet") 31 | 32 | dbs = { 33 | # sql-eval datasets 34 | "academic": academic, 35 | "advising": advising, 36 | "atis": atis, 37 | "geography": geography, 38 | "restaurants": restaurants, 39 | "scholar": scholar, 40 | "yelp": yelp, 41 | # sql-eval-instruct datasets 42 | "broker": broker, 43 | "car_dealership": car_dealership, 44 | "derm_treatment": derm_treatment, 45 | "ewallet": ewallet, 46 | } 47 | -------------------------------------------------------------------------------- /defog_data/restaurants/restaurants.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "location": [ 4 | { 5 | "data_type": "bigint", 6 | "column_name": "restaurant_id", 7 | "column_description": "Unique identifier for each restaurant" 8 | }, 9 | { 10 | "data_type": "bigint", 11 | "column_name": "house_number", 12 | 
"column_description": "The number assigned to the building where the restaurant is located" 13 | }, 14 | { 15 | "data_type": "text", 16 | "column_name": "street_name", 17 | "column_description": "The name of the street where the restaurant is located" 18 | }, 19 | { 20 | "data_type": "text", 21 | "column_name": "city_name", 22 | "column_description": "The name of the city where the restaurant is located" 23 | } 24 | ], 25 | "geographic": [ 26 | { 27 | "data_type": "text", 28 | "column_name": "city_name", 29 | "column_description": "The name of the city" 30 | }, 31 | { 32 | "data_type": "text", 33 | "column_name": "county", 34 | "column_description": "The name of the county" 35 | }, 36 | { 37 | "data_type": "text", 38 | "column_name": "region", 39 | "column_description": "The name of the region" 40 | } 41 | ], 42 | "restaurant": [ 43 | { 44 | "data_type": "bigint", 45 | "column_name": "id", 46 | "column_description": "Unique identifier for each restaurant" 47 | }, 48 | { 49 | "data_type": "real", 50 | "column_name": "rating", 51 | "column_description": "The rating of the restaurant on a scale of 0 to 5" 52 | }, 53 | { 54 | "data_type": "text", 55 | "column_name": "name", 56 | "column_description": "The name of the restaurant" 57 | }, 58 | { 59 | "data_type": "text", 60 | "column_name": "food_type", 61 | "column_description": "The type of food served at the restaurant" 62 | }, 63 | { 64 | "data_type": "text", 65 | "column_name": "city_name", 66 | "column_description": "The city where the restaurant is located" 67 | } 68 | ] 69 | }, 70 | "glossary": "" 71 | } -------------------------------------------------------------------------------- /defog_data/restaurants/restaurants.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE public.geographic ( 2 | city_name text, 3 | county text, 4 | region text 5 | ); 6 | 7 | 8 | CREATE TABLE public.location ( 9 | restaurant_id bigint, 10 | house_number bigint, 11 | street_name 
text, 12 | city_name text 13 | ); 14 | 15 | 16 | CREATE TABLE public.restaurant ( 17 | id bigint, 18 | name text, 19 | food_type text, 20 | city_name text, 21 | rating real 22 | ); 23 | 24 | 25 | INSERT INTO public.geographic (city_name, county, region) VALUES 26 | ('Los Angeles', 'Los Angeles', 'California'), 27 | ('New York', 'New York', 'New York'), 28 | ('San Francisco', 'San Francisco', 'California'), 29 | ('Miami', 'Miami-Dade', 'Florida'), 30 | ('Chicago', 'Cook', 'Illinois') 31 | ; 32 | 33 | INSERT INTO public.location (restaurant_id, house_number, street_name, city_name) VALUES 34 | (1, 123, 'Main St', 'Los Angeles'), 35 | (2, 456, 'Maple Ave', 'Los Angeles'), 36 | (3, 789, 'Oak St', 'Los Angeles'), 37 | (4, 321, 'Elm St', 'New York'), 38 | (5, 654, 'Pine Ave', 'New York'), 39 | (6, 123, 'Pine Ave', 'New York'), 40 | (7, 12, 'Market St', 'San Francisco'), 41 | (8, 34, 'Mission St', 'San Francisco'), 42 | (9, 56, 'Valencia St', 'San Francisco'), 43 | (10, 78, 'Ocean Dr', 'Miami'), 44 | (11, 90, 'Biscayne Rd', 'Miami') 45 | ; 46 | 47 | INSERT INTO public.restaurant (id, rating, name, food_type, city_name) VALUES 48 | (1, 4.5, 'The Pasta House', 'Italian', 'Los Angeles'), 49 | (2, 3.8, 'The Burger Joint', 'American', 'Los Angeles'), 50 | (3, 4.2, 'The Sushi Bar', 'Japanese', 'Los Angeles'), 51 | (4, 4.7, 'The Pizza Place', 'Italian', 'New York'), 52 | (5, 3.9, 'The Steakhouse', 'American', 'New York'), 53 | (6, 4.3, 'The Ramen Shop', 'Japanese', 'New York'), 54 | (7, 4.1, 'The Tacos & Burritos', 'Mexican', 'San Francisco'), 55 | (8, 4.6, 'The Vegan Cafe', 'Vegan', 'San Francisco'), 56 | (9, 3.7, 'The BBQ Joint', 'American', 'San Francisco'), 57 | (10, 4.4, 'The Seafood Shack', 'Seafood', 'Miami'), 58 | (11, 4.6, 'The Seafood Shack', 'Seafood', 'Miami') 59 | ; 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /defog_data/scholar/scholar.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "cite": [ 4 | { 5 | "data_type": "bigint", 6 | "column_name": "citingpaperid", 7 | "column_description": "The ID of the paper that is doing the citing." 8 | }, 9 | { 10 | "data_type": "bigint", 11 | "column_name": "citedpaperid", 12 | "column_description": "The ID of the paper that is being cited." 13 | } 14 | ], 15 | "field": [ 16 | { 17 | "data_type": "bigint", 18 | "column_name": "fieldid", 19 | "column_description": "Unique identifier for each field in the table" 20 | } 21 | ], 22 | "paper": [ 23 | { 24 | "data_type": "bigint", 25 | "column_name": "year", 26 | "column_description": "The year the paper was published." 27 | }, 28 | { 29 | "data_type": "bigint", 30 | "column_name": "journalid", 31 | "column_description": "The ID of the journal where the paper was published." 32 | }, 33 | { 34 | "data_type": "bigint", 35 | "column_name": "venueid", 36 | "column_description": "The ID of the venue where the paper was published." 37 | }, 38 | { 39 | "data_type": "bigint", 40 | "column_name": "paperid", 41 | "column_description": "The unique ID of the paper." 42 | }, 43 | { 44 | "data_type": "bigint", 45 | "column_name": "numciting", 46 | "column_description": "The number of papers that this paper cites." 47 | }, 48 | { 49 | "data_type": "bigint", 50 | "column_name": "numcitedby", 51 | "column_description": "The number of papers that cite this paper." 52 | }, 53 | { 54 | "data_type": "text", 55 | "column_name": "title", 56 | "column_description": "The title of the paper, enclosed in double quotes if it contains commas." 
57 | } 58 | ], 59 | "venue": [ 60 | { 61 | "data_type": "bigint", 62 | "column_name": "venueid", 63 | "column_description": "Unique identifier for each venue" 64 | }, 65 | { 66 | "data_type": "text", 67 | "column_name": "venuename", 68 | "column_description": "Name of the venue" 69 | } 70 | ], 71 | "author": [ 72 | { 73 | "data_type": "bigint", 74 | "column_name": "authorid", 75 | "column_description": "Unique identifier for each author" 76 | }, 77 | { 78 | "data_type": "text", 79 | "column_name": "authorname", 80 | "column_description": "Name of the author" 81 | } 82 | ], 83 | "writes": [ 84 | { 85 | "data_type": "bigint", 86 | "column_name": "paperid", 87 | "column_description": "The unique identifier for a paper in the writes table." 88 | }, 89 | { 90 | "data_type": "bigint", 91 | "column_name": "authorid", 92 | "column_description": "The unique identifier for an author in the writes table." 93 | } 94 | ], 95 | "dataset": [ 96 | { 97 | "data_type": "bigint", 98 | "column_name": "datasetid", 99 | "column_description": "Unique identifier for each dataset in the table" 100 | }, 101 | { 102 | "data_type": "text", 103 | "column_name": "datasetname", 104 | "column_description": "Name of the dataset" 105 | } 106 | ], 107 | "journal": [ 108 | { 109 | "data_type": "bigint", 110 | "column_name": "journalid", 111 | "column_description": "Unique identifier for each journal entry" 112 | }, 113 | { 114 | "data_type": "text", 115 | "column_name": "journalname", 116 | "column_description": "Name or title of the journal" 117 | } 118 | ], 119 | "keyphrase": [ 120 | { 121 | "data_type": "bigint", 122 | "column_name": "keyphraseid", 123 | "column_description": "Unique identifier for each keyphrase" 124 | }, 125 | { 126 | "data_type": "text", 127 | "column_name": "keyphrasename", 128 | "column_description": "The actual keyphrase text" 129 | } 130 | ], 131 | "paperfield": [ 132 | { 133 | "data_type": "bigint", 134 | "column_name": "fieldid", 135 | "column_description": "Unique 
identifier for each field in the table" 136 | }, 137 | { 138 | "data_type": "bigint", 139 | "column_name": "paperid", 140 | "column_description": "Unique identifier for each paper in the table" 141 | } 142 | ], 143 | "paperdataset": [ 144 | { 145 | "data_type": "bigint", 146 | "column_name": "paperid", 147 | "column_description": "Unique identifier for each paper in the dataset" 148 | }, 149 | { 150 | "data_type": "bigint", 151 | "column_name": "datasetid", 152 | "column_description": "Unique identifier for each dataset that the paper belongs to" 153 | } 154 | ], 155 | "paperkeyphrase": [ 156 | { 157 | "data_type": "bigint", 158 | "column_name": "paperid", 159 | "column_description": "The ID of the paper associated with the keyphrase." 160 | }, 161 | { 162 | "data_type": "bigint", 163 | "column_name": "keyphraseid", 164 | "column_description": "The ID of the keyphrase associated with the paper." 165 | } 166 | ] 167 | }, 168 | "glossary": "" 169 | } -------------------------------------------------------------------------------- /defog_data/scholar/scholar.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE public.author ( 3 | authorid bigint NOT NULL, 4 | authorname text 5 | ); 6 | 7 | 8 | CREATE TABLE public.cite ( 9 | citingpaperid bigint NOT NULL, 10 | citedpaperid bigint NOT NULL 11 | ); 12 | 13 | 14 | CREATE TABLE public.dataset ( 15 | datasetid bigint NOT NULL, 16 | datasetname text 17 | ); 18 | 19 | 20 | CREATE TABLE public.field ( 21 | fieldid bigint 22 | ); 23 | 24 | 25 | CREATE TABLE public.journal ( 26 | journalid bigint NOT NULL, 27 | journalname text 28 | ); 29 | 30 | 31 | CREATE TABLE public.keyphrase ( 32 | keyphraseid bigint NOT NULL, 33 | keyphrasename text 34 | ); 35 | 36 | 37 | CREATE TABLE public.paper ( 38 | paperid bigint NOT NULL, 39 | title text, 40 | venueid bigint, 41 | year bigint, 42 | numciting bigint, 43 | numcitedby bigint, 44 | journalid bigint 45 | ); 46 | 47 | 48 | CREATE TABLE 
public.paperdataset ( 49 | paperid bigint, 50 | datasetid bigint 51 | ); 52 | 53 | 54 | CREATE TABLE public.paperfield ( 55 | fieldid bigint, 56 | paperid bigint 57 | ); 58 | 59 | 60 | CREATE TABLE public.paperkeyphrase ( 61 | paperid bigint, 62 | keyphraseid bigint 63 | ); 64 | 65 | 66 | CREATE TABLE public.venue ( 67 | venueid bigint NOT NULL, 68 | venuename text 69 | ); 70 | 71 | 72 | CREATE TABLE public.writes ( 73 | paperid bigint, 74 | authorid bigint 75 | ); 76 | 77 | 78 | INSERT INTO public.author (authorid, authorname) VALUES 79 | (1, 'John Smith'), 80 | (2, 'Emily Johnson'), 81 | (3, 'Michael Brown'), 82 | (4, 'Sarah Davis'), 83 | (5, 'David Wilson'), 84 | (6, 'Jennifer Lee'), 85 | (7, 'Robert Moore'), 86 | (8, 'Linda Taylor'), 87 | (9, 'William Anderson'), 88 | (10, 'Karen Martinez') 89 | ; 90 | 91 | INSERT INTO public.cite (citingpaperid, citedpaperid) VALUES 92 | (1, 2), 93 | (2, 3), 94 | (3, 4), 95 | (4, 5), 96 | (5, 1), 97 | (3, 5), 98 | (4, 2), 99 | (1, 4), 100 | (3, 1) 101 | ; 102 | 103 | INSERT INTO public.dataset (datasetid, datasetname) VALUES 104 | (1, 'COVID-19 Research'), 105 | (2, 'Machine Learning Datasets'), 106 | (3, 'Climate Change Data'), 107 | (4, 'Social Media Analysis') 108 | ; 109 | 110 | INSERT INTO public.field (fieldid) VALUES 111 | (1), 112 | (2), 113 | (3), 114 | (4) 115 | ; 116 | 117 | INSERT INTO public.journal (journalid, journalname) VALUES 118 | (1, 'Nature'), 119 | (2, 'Science'), 120 | (3, 'IEEE Transactions on Pattern Analysis and Machine Intelligence'), 121 | (4, 'International Journal of Mental Health') 122 | ; 123 | 124 | INSERT INTO public.keyphrase (keyphraseid, keyphrasename) VALUES 125 | (1, 'Machine Learning'), 126 | (2, 'Climate Change'), 127 | (3, 'Social Media'), 128 | (4, 'COVID-19'), 129 | (5, 'Mental Health') 130 | ; 131 | 132 | INSERT INTO public.paper (paperid, title, venueid, year, numciting, numcitedby, journalid) VALUES 133 | (1, 'A Study on Machine Learning Algorithms', 1, 2020, 2, 2, 3), 134 | (2, 
def clean_glossary(glossary: str) -> list[str]:
    """
    Normalize a glossary string into a list of dash-bulleted lines.

    Blank lines are dropped, a leading number bullet (e.g. "1." or "2 ")
    is stripped from each line, and any line that does not already start
    with a dash gets a "- " prefix.
    """
    if glossary == "":
        return []
    # Keep only the non-empty lines.
    lines = [ln for ln in glossary.split("\n") if ln]
    number_bullet = re.compile(r"^\d+\.?\s?")
    result = []
    for ln in lines:
        ln = number_bullet.sub("", ln)
        if not ln.startswith("-"):
            ln = "- " + ln
        result.append(ln)
    return result
"publication_keyword.pid") 56 | ], 57 | ("publication", "writes"): [("publication.pid", "writes.pid")], 58 | }, 59 | "advising": { 60 | ("area", "course"): [("area.course_id", "course.course_id")], 61 | ("comment_instructor", "instructor"): [ 62 | ("comment_instructor.instructor_id", "instructor.instructor_id") 63 | ], 64 | ("comment_instructor", "student"): [ 65 | ("comment_instructor.student_id", "student.student_id") 66 | ], 67 | ("course", "course_offering"): [ 68 | ("course.course_id", "course_offering.course_id") 69 | ], 70 | ("course", "course_prerequisite"): [ 71 | ("course.course_id", "course_prerequisite.course_id"), 72 | ("course.course_id", "course_prerequisite.pre_course_id"), 73 | ], 74 | ("course", "course_tags_count"): [ 75 | ("course.course_id", "course_tags_count.course_id") 76 | ], 77 | ("course", "program_course"): [ 78 | ("course.course_id", "program_course.course_id") 79 | ], 80 | ("course", "student_record"): [ 81 | ("course.course_id", "student_record.course_id") 82 | ], 83 | ("course_offering", "offering_instructor"): [ 84 | ("course_offering.offering_id", "offering_instructor.offering_id") 85 | ], 86 | ("course_offering", "student_record"): [ 87 | ("course_offering.offering_id", "student_record.offering_id"), 88 | ("course_offering.course_id", "student_record.course_id"), 89 | ], 90 | ("instructor", "offering_instructor"): [ 91 | ("instructor.instructor_id", "offering_instructor.instructor_id") 92 | ], 93 | ("program", "program_course"): [ 94 | ("program.program_id", "program_course.program_id") 95 | ], 96 | ("program", "program_requirement"): [ 97 | ("program.program_id", "program_requirement.program_id") 98 | ], 99 | ("program", "student"): [("program.program_id", "student.program_id")], 100 | ("student", "student_record"): [ 101 | ("student.student_id", "student_record.student_id") 102 | ], 103 | }, 104 | "atis": { 105 | ("airline", "flight"): [("airline.airline_code", "flight.airline_code")], 106 | ("airline", "flight_stop"): [ 107 | 
("airline.airline_code", "flight_stop.departure_airline"), 108 | ("airline.airline_code", "flight_stop.arrival_airline"), 109 | ], 110 | ("airport", "fare"): [ 111 | ("airport.airport_code", "fare.from_airport"), 112 | ("airport.airport_code", "fare.to_airport"), 113 | ], 114 | ("airport", "flight_stop"): [ 115 | ("airport.airport_code", "flight_stop.stop_airport") 116 | ], 117 | ("airport_service", "ground_service"): [ 118 | ("airport_service.city_code", "ground_service.city_code"), 119 | ("airport_service.airport_code", "ground_service.airport_code"), 120 | ], 121 | ("airport", "city"): [ 122 | ("airport.state_code", "city.state_code"), 123 | ("airport.country_name", "city.country_name"), 124 | ("airport.time_zone_code", "city.time_zone_code"), 125 | ], 126 | ("airport", "state"): [ 127 | ("airport.state_code", "state.state_code"), 128 | ], 129 | ("city", "state"): [ 130 | ("city.state_code", "state.state_code"), 131 | ], 132 | ("airport_service", "city"): [ 133 | ("airport_service.city_code", "city.city_code"), 134 | ], 135 | ("ground_service", "city"): [ 136 | ("ground_service.city_code", "city.city_code"), 137 | ], 138 | ("airport", "time_zone"): [ 139 | ("airport.time_zone_code", "time_zone.time_zone_code"), 140 | ], 141 | ("city", "time_zone"): [ 142 | ("city.time_zone_code", "time_zone.time_zone_code"), 143 | ], 144 | ("flight", "flight_fare"): [ 145 | ("flight.flight_id", "flight_fare.flight_id"), 146 | ], 147 | ("flight", "flight_leg"): [ 148 | ("flight.flight_id", "flight_leg.flight_id"), 149 | ], 150 | ("flight", "flight_stop"): [ 151 | ("flight.flight_id", "flight_stop.flight_id"), 152 | ], 153 | ("flight_fare", "flight_leg"): [ 154 | ("flight_fare.flight_id", "flight_leg.flight_id"), 155 | ], 156 | ("flight_fare", "flight_stop"): [ 157 | ("flight_fare.flight_id", "flight_stop.flight_id"), 158 | ], 159 | ("flight_leg", "flight_stop"): [ 160 | ("flight_leg.flight_id", "flight_stop.flight_id"), 161 | ], 162 | ("aircraft", "equipment_sequence"): [ 163 | 
("aircraft.aircraft_code", "equipment_sequence.aircraft_code"), 164 | ], 165 | ("flight", "equipment_sequence"): [ 166 | ( 167 | "flight.aircraft_code_sequence", 168 | "equipment_sequence.aircraft_code_sequence", 169 | ), 170 | ], 171 | ("flight", "food_service"): [ 172 | ("flight.meal_code", "food_service.meal_code"), 173 | ], 174 | }, 175 | "yelp": { 176 | ("business", "tip"): [("business.business_id", "tip.business_id")], 177 | ("business", "review"): [("business.business_id", "review.business_id")], 178 | ("business", "checkin"): [("business.business_id", "checkin.business_id")], 179 | ("business", "neighbourhood"): [ 180 | ("business.business_id", "neighbourhood.business_id") 181 | ], 182 | ("business", "category"): [("business.business_id", "category.business_id")], 183 | ("tip", "users"): [("tip.user_id", "users.user_id")], 184 | ("review", "users"): [("review.user_id", "users.user_id")], 185 | }, 186 | "restaurants": { 187 | ("geographic", "location"): [ 188 | ("geographic.city_name", "location.city_name"), 189 | ], 190 | ("geographic", "restaurant"): [ 191 | ("geographic.city_name", "restaurant.city_name"), 192 | ], 193 | ("location", "restaurant"): [ 194 | ("location.restaurant_id", "restaurant.id"), 195 | ], 196 | }, 197 | "geography": { 198 | ("border_info", "city"): [ 199 | ("border_info.state_name", "city.state_name"), 200 | ("border_info.border", "city.state_name"), 201 | ], 202 | ("border_info", "lake"): [ 203 | ("border_info.state_name", "lake.state_name"), 204 | ("border_info.border", "lake.state_name"), 205 | ], 206 | ("border_info", "state"): [ 207 | ("border_info.state_name", "state.state_name"), 208 | ("border_info.border", "state.state_name"), 209 | ], 210 | ("border_info", "highlow"): [ 211 | ("border_info.state_name", "highlow.state_name"), 212 | ("border_info.border", "highlow.state_name"), 213 | ], 214 | ("border_info", "mountain"): [ 215 | ("border_info.state_name", "mountain.state_name"), 216 | ("border_info.border", 
"mountain.state_name"), 217 | ], 218 | ("city", "lake"): [ 219 | ("city.country_name", "lake.country_name"), 220 | ("city.state_name", "lake.state_name"), 221 | ], 222 | ("city", "river"): [ 223 | ("city.country_name", "river.country_name"), 224 | ], 225 | ("city", "state"): [ 226 | ("city.country_name", "state.country_name"), 227 | ("city.state_name", "state.state_name"), 228 | ], 229 | ("city", "mountain"): [ 230 | ("city.country_name", "mountain.country_name"), 231 | ("city.state_name", "mountain.state_name"), 232 | ], 233 | ("city", "highlow"): [ 234 | ("city.state_name", "highlow.state_name"), 235 | ], 236 | ("highlow", "lake"): [ 237 | ("highlow.state_name", "lake.state_name"), 238 | ], 239 | ("highlow", "state"): [ 240 | ("highlow.state_name", "state.state_name"), 241 | ], 242 | ("highlow", "mountain"): [ 243 | ("highlow.state_name", "mountain.state_name"), 244 | ], 245 | ("lake", "river"): [ 246 | ("lake.country_name", "river.country_name"), 247 | ], 248 | ("lake", "state"): [ 249 | ("lake.country_name", "state.country_name"), 250 | ("lake.state_name", "state.state_name"), 251 | ], 252 | ("lake", "mountain"): [ 253 | ("lake.country_name", "mountain.country_name"), 254 | ("lake.state_name", "mountain.state_name"), 255 | ], 256 | ("river", "state"): [ 257 | ("river.country_name", "state.country_name"), 258 | ], 259 | ("river", "mountain"): [ 260 | ("river.country_name", "mountain.country_name"), 261 | ], 262 | ("state", "mountain"): [ 263 | ("state.country_name", "mountain.country_name"), 264 | ("state.state_name", "mountain.state_name"), 265 | ], 266 | }, 267 | "scholar": { 268 | ("author", "writes"): [ 269 | ("author.authorid", "writes.authorid"), 270 | ], 271 | ("cite", "paper"): [ 272 | ("cite.citingpaperid", "paper.paperid"), 273 | ("cite.citedpaperid", "paper.paperid"), 274 | ], 275 | ("cite", "paperdataset"): [ 276 | ("cite.citingpaperid", "paperdataset.paperid"), 277 | ("cite.citedpaperid", "paperdataset.paperid"), 278 | ], 279 | ("cite", 
"paperfield"): [ 280 | ("cite.citingpaperid", "paperfield.paperid"), 281 | ("cite.citedpaperid", "paperfield.paperid"), 282 | ], 283 | ("cite", "paperkeyphrase"): [ 284 | ("cite.citingpaperid", "paperkeyphrase.paperid"), 285 | ("cite.citedpaperid", "paperkeyphrase.paperid"), 286 | ], 287 | ("cite", "writes"): [ 288 | ("cite.citingpaperid", "writes.paperid"), 289 | ("cite.citedpaperid", "writes.paperid"), 290 | ], 291 | ("dataset", "paperdataset"): [ 292 | ("dataset.datasetid", "paperdataset.datasetid"), 293 | ], 294 | ("field", "paperfield"): [ 295 | ("field.fieldid", "paperfield.fieldid"), 296 | ], 297 | ("journal", "paper"): [ 298 | ("journal.journalid", "paper.journalid"), 299 | ], 300 | ("keyphrase", "paperkeyphrase"): [ 301 | ("keyphrase.keyphraseid", "paperkeyphrase.keyphraseid"), 302 | ], 303 | ("paper", "paperdataset"): [ 304 | ("paper.paperid", "paperdataset.paperid"), 305 | ], 306 | ("paper", "paperfield"): [ 307 | ("paper.paperid", "paperfield.paperid"), 308 | ], 309 | ("paper", "paperkeyphrase"): [ 310 | ("paper.paperid", "paperkeyphrase.paperid"), 311 | ], 312 | ("paper", "writes"): [ 313 | ("paper.paperid", "writes.paperid"), 314 | ], 315 | ("paper", "venue"): [ 316 | ("paper.venueid", "venue.venueid"), 317 | ], 318 | ("paperfield", "paperkeyphrase"): [ 319 | ("paperfield.paperid", "paperkeyphrase.paperid"), 320 | ], 321 | ("paperfield", "writes"): [ 322 | ("paperfield.paperid", "writes.paperid"), 323 | ], 324 | ("paperkeyphrase", "writes"): [ 325 | ("paperkeyphrase.paperid", "writes.paperid"), 326 | ], 327 | }, 328 | "broker": { 329 | ("sbCustomer", "sbTransaction"): [ 330 | ("sbCustomer.sbCustId", "sbTransaction.sbTxCustId") 331 | ], 332 | ("sbTicker", "sbDailyPrice"): [ 333 | ("sbTicker.sbTickerId", "sbDailyPrice.sbDpTickerId") 334 | ], 335 | ("sbTicker", "sbTransaction"): [ 336 | ("sbTicker.sbTickerId", "sbTransaction.sbTxTickerId") 337 | ], 338 | }, 339 | "car_dealership": { 340 | ("cars", "sales"): [("cars.id", "sales.car_id")], 341 | 
("salespersons", "sales"): [("salespersons.id", "sales.salesperson_id")], 342 | ("customers", "sales"): [("customers.id", "sales.customer_id")], 343 | ("cars", "inventory_snapshots"): [("cars.id", "inventory_snapshots.car_id")], 344 | ("sales", "payments_received"): [("sales.id", "payments_received.sale_id")], 345 | }, 346 | "derm_treatment": { 347 | ("patients", "treatments"): [("patients.patient_id", "treatments.patient_id")], 348 | ("doctors", "treatments"): [("doctors.doc_id", "treatments.doc_id")], 349 | ("drugs", "treatments"): [("drugs.drug_id", "treatments.drug_id")], 350 | ("diagnoses", "treatments"): [("diagnoses.diag_id", "treatments.diag_id")], 351 | ("treatments", "outcomes"): [ 352 | ("treatments.treatment_id", "outcomes.treatment_id") 353 | ], 354 | ("treatments", "adverse_events"): [ 355 | ("treatments.treatment_id", "adverse_events.treatment_id") 356 | ], 357 | ("treatments", "concomitant_meds"): [ 358 | ("treatments.treatment_id", "concomitant_meds.treatment_id") 359 | ], 360 | }, 361 | "ewallet": { 362 | ("consumer_div.users", "consumer_div.notifications"): [ 363 | ("consumer_div.users.uid", "consumer_div.notifications.user_id") 364 | ], 365 | ("consumer_div.users", "consumer_div.user_sessions"): [ 366 | ("consumer_div.users.uid", "consumer_div.user_sessions.user_id") 367 | ], 368 | ("consumer_div.users", "consumer_div.user_setting_snapshot"): [ 369 | ("consumer_div.users.uid", "consumer_div.user_setting_snapshot.user_id") 370 | ], 371 | ("consumer_div.users", "consumer_div.wallet_user_balance_daily"): [ 372 | ("consumer_div.users.uid", "consumer_div.wallet_user_balance_daily.user_id") 373 | ], 374 | ("consumer_div.users", "consumer_div.wallet_transactions_daily"): [ 375 | ( 376 | "consumer_div.users.uid", 377 | "consumer_div.wallet_transactions_daily.sender_id", 378 | ), 379 | ( 380 | "consumer_div.users.uid", 381 | "consumer_div.wallet_transactions_daily.receiver_id", 382 | ), 383 | ], 384 | ("consumer_div.merchants", 
"consumer_div.wallet_transactions_daily"): [ 385 | ( 386 | "consumer_div.merchants.mid", 387 | "consumer_div.wallet_transactions_daily.sender_id", 388 | ), 389 | ( 390 | "consumer_div.merchants.mid", 391 | "consumer_div.wallet_transactions_daily.receiver_id", 392 | ), 393 | ], 394 | ("consumer_div.merchants", "consumer_div.coupons"): [ 395 | ("consumer_div.merchants.mid", "consumer_div.coupons.merchant_id") 396 | ], 397 | ("consumer_div.merchants", "consumer_div.wallet_merchant_balance_daily"): [ 398 | ( 399 | "consumer_div.merchants.mid", 400 | "consumer_div.wallet_merchant_balance_daily.merchant_id", 401 | ) 402 | ], 403 | ("consumer_div.coupons", "consumer_div.wallet_transactions_daily"): [ 404 | ( 405 | "consumer_div.coupons.cid", 406 | "consumer_div.wallet_transactions_daily.coupon_id", 407 | ), 408 | ( 409 | "consumer_div.coupons.merchant_id", 410 | "consumer_div.wallet_transactions_daily.sender_id", 411 | ), 412 | ( 413 | "consumer_div.coupons.merchant_id", 414 | "consumer_div.wallet_transactions_daily.receiver_id", 415 | ), 416 | ], 417 | }, 418 | } 419 | -------------------------------------------------------------------------------- /defog_data/yelp/yelp.json: -------------------------------------------------------------------------------- 1 | { 2 | "table_metadata": { 3 | "tip": [ 4 | { 5 | "column_name": "year", 6 | "data_type": "bigint", 7 | "column_description": "Year when the tip was created" 8 | }, 9 | { 10 | "column_name": "tip_id", 11 | "data_type": "bigint", 12 | "column_description": "Unique identifier for the tip" 13 | }, 14 | { 15 | "column_name": "month", 16 | "data_type": "text", 17 | "column_description": "Month when the tip was created. Eg. 'January', 'February', etc." 
18 | }, 19 | { 20 | "column_name": "user_id", 21 | "data_type": "text", 22 | "column_description": "Unique identifier for the user who created the tip" 23 | }, 24 | { 25 | "column_name": "business_id", 26 | "data_type": "text", 27 | "column_description": "Unique identifier for the business where the tip was created." 28 | }, 29 | { 30 | "column_name": "text", 31 | "data_type": "text", 32 | "column_description": "Text content of the tip. All apostrophes use ’ instead of ' to avoid SQL errors." 33 | } 34 | ], 35 | "users": [ 36 | { 37 | "data_type": "bigint", 38 | "column_name": "uid", 39 | "column_description": "Unique identifier for each user" 40 | }, 41 | { 42 | "data_type": "text", 43 | "column_name": "user_id", 44 | "column_description": "Unique user ID assigned by the system" 45 | }, 46 | { 47 | "data_type": "text", 48 | "column_name": "name", 49 | "column_description": "Name of the user" 50 | } 51 | ], 52 | "review": [ 53 | { 54 | "data_type": "real", 55 | "column_name": "rating", 56 | "column_description": "The rating given by the user for the business, on a scale of 1 to 5." 57 | }, 58 | { 59 | "data_type": "bigint", 60 | "column_name": "rid", 61 | "column_description": "The unique identifier for each review." 62 | }, 63 | { 64 | "data_type": "bigint", 65 | "column_name": "year", 66 | "column_description": "The year in which the review was posted." 67 | }, 68 | { 69 | "data_type": "text", 70 | "column_name": "month", 71 | "column_description": "The month in which the review was posted. Eg. 'January', 'February', etc." 72 | }, 73 | { 74 | "data_type": "text", 75 | "column_name": "text", 76 | "column_description": "The text of the review. All apostrophes use ’ instead of ' to avoid SQL errors." 77 | }, 78 | { 79 | "data_type": "text", 80 | "column_name": "business_id", 81 | "column_description": "The unique identifier for the business being reviewed." 
82 | }, 83 | { 84 | "data_type": "text", 85 | "column_name": "user_id", 86 | "column_description": "The unique identifier for the user who posted the review." 87 | } 88 | ], 89 | "checkin": [ 90 | { 91 | "data_type": "bigint", 92 | "column_name": "cid", 93 | "column_description": "Unique identifier for the daily check-in count" 94 | }, 95 | { 96 | "data_type": "bigint", 97 | "column_name": "count", 98 | "column_description": "Total number of check-ins at a business on a given day" 99 | }, 100 | { 101 | "data_type": "text", 102 | "column_name": "business_id", 103 | "column_description": "Unique identifier for the business where the check-in occurred" 104 | }, 105 | { 106 | "data_type": "text", 107 | "column_name": "day", 108 | "column_description": "Day of the week when the check-ins occurred. Eg. 'Monday', 'Tuesday', etc." 109 | } 110 | ], 111 | "business": [ 112 | { 113 | "data_type": "bigint", 114 | "column_name": "review_count", 115 | "column_description": "The number of reviews for the business" 116 | }, 117 | { 118 | "data_type": "bigint", 119 | "column_name": "is_open", 120 | "column_description": "Indicates whether the business is currently open or closed (1 for open, 0 for closed)" 121 | }, 122 | { 123 | "data_type": "bigint", 124 | "column_name": "bid", 125 | "column_description": "The unique identifier for the business" 126 | }, 127 | { 128 | "data_type": "text", 129 | "column_name": "city", 130 | "column_description": "The city where the business is located" 131 | }, 132 | { 133 | "data_type": "text", 134 | "column_name": "latitude", 135 | "column_description": "The latitude of the business location" 136 | }, 137 | { 138 | "data_type": "text", 139 | "column_name": "longitude", 140 | "column_description": "The longitude of the business location" 141 | }, 142 | { 143 | "data_type": "text", 144 | "column_name": "state", 145 | "column_description": "The US state where the business is located, represented by two-letter abbreviations (eg. 
'CA', 'NV', 'NY', etc.)" 146 | }, 147 | { 148 | "data_type": "text", 149 | "column_name": "business_id", 150 | "column_description": "The unique identifier for the business" 151 | }, 152 | { 153 | "data_type": "text", 154 | "column_name": "name", 155 | "column_description": "The name of the business. All apostrophes use ’ instead of ' to avoid SQL errors." 156 | }, 157 | { 158 | "data_type": "text", 159 | "column_name": "full_address", 160 | "column_description": "The full address of the business" 161 | } 162 | ], 163 | "category": [ 164 | { 165 | "data_type": "bigint", 166 | "column_name": "id", 167 | "column_description": "Unique identifier for each category" 168 | }, 169 | { 170 | "data_type": "text", 171 | "column_name": "business_id", 172 | "column_description": "Identifier for the business associated with the category" 173 | }, 174 | { 175 | "data_type": "text", 176 | "column_name": "category_name", 177 | "column_description": "Name of the category. Eg 'Bistro', 'Diner', 'Pizza'" 178 | } 179 | ], 180 | "neighbourhood": [ 181 | { 182 | "data_type": "bigint", 183 | "column_name": "id", 184 | "column_description": "Unique identifier for each neighbourhood" 185 | }, 186 | { 187 | "data_type": "text", 188 | "column_name": "business_id", 189 | "column_description": "Identifier for each business in the neighbourhood" 190 | }, 191 | { 192 | "data_type": "text", 193 | "column_name": "neighbourhood_name", 194 | "column_description": "Name of the neighbourhood where the business is located" 195 | } 196 | ] 197 | }, 198 | "glossary": "" 199 | } 200 | -------------------------------------------------------------------------------- /defog_data/yelp/yelp.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE public.business ( 2 | bid bigint, 3 | business_id text, 4 | name text, 5 | full_address text, 6 | city text, 7 | latitude text, 8 | longitude text, 9 | review_count bigint, 10 | is_open bigint, 11 | state text 12 | ); 13 | 
14 | 15 | CREATE TABLE public.category ( 16 | id bigint, 17 | business_id text, 18 | category_name text 19 | ); 20 | 21 | 22 | CREATE TABLE public.checkin ( 23 | cid bigint, 24 | business_id text, 25 | count bigint, 26 | day text 27 | ); 28 | 29 | 30 | CREATE TABLE public.neighbourhood ( 31 | id bigint, 32 | business_id text, 33 | neighbourhood_name text 34 | ); 35 | 36 | 37 | CREATE TABLE public.review ( 38 | rid bigint, 39 | business_id text, 40 | user_id text, 41 | rating real, 42 | text text, 43 | year bigint, 44 | month text 45 | ); 46 | 47 | 48 | CREATE TABLE public.tip ( 49 | tip_id bigint, 50 | business_id text, 51 | text text, 52 | user_id text, 53 | likes bigint, 54 | year bigint, 55 | month text 56 | ); 57 | 58 | 59 | CREATE TABLE public.users ( 60 | uid bigint, 61 | user_id text, 62 | name text 63 | ); 64 | 65 | 66 | INSERT INTO public.business (bid, business_id, name, full_address, city, latitude, longitude, review_count, is_open, state) VALUES 67 | (1, 'abc123', 'Joe’s Pizza', '123 Main St', 'San Francisco', '37.7749295', '-122.4194155', 3, 0, 'CA'), 68 | (2, 'def456', 'Peter’s Cafe', '456 Elm St', 'New York', '40.712776', '-74.005974', 4, 1, 'NY'), 69 | (3, 'ghi789', 'Anna’s Diner', '789 Oak St', 'Los Angeles', '34.052235', '-118.243683', 5, 0, 'CA'), 70 | (4, 'jkl012', 'Mark’s Bistro', '012 Maple St', 'San Francisco', '37.7749295', '-122.4194155', 4, 1, 'CA'), 71 | (5, 'mno345', 'Lily’s Bakery', '345 Walnut St', 'New York', '40.712776', '-74.005974', 3, 1, 'NY'), 72 | (6, 'xyz123', 'Izza’s Pizza', '83 Main St', 'San Francisco', '37.8749295', '-122.5194155', 2, 1, 'CA'), 73 | (7, 'uvw456', 'Sashays Cafe', '246 Elm St', 'New York', '40.812776', '-74.105974', 2, 1, 'NY') 74 | ; 75 | 76 | INSERT INTO public.category (id, business_id, category_name) VALUES 77 | (1, 'abc123', 'Pizza'), 78 | (2, 'def456', 'Cafe'), 79 | (3, 'ghi789', 'Diner'), 80 | (4, 'jkl012', 'Bistro'), 81 | (5, 'mno345', 'Bakery'), 82 | (1, 'xyz123', 'Pizza'), 83 | (2, 'uvw456', 'Cafe') 
84 | ; 85 | 86 | INSERT INTO public.checkin (cid, business_id, count, day) VALUES 87 | (1, 'abc123', 10, 'Monday'), 88 | (2, 'def456', 20, 'Tuesday'), 89 | (3, 'ghi789', 15, 'Wednesday'), 90 | (4, 'jkl012', 30, 'Thursday'), 91 | (5, 'mno345', 25, 'Friday'), 92 | (6, 'abc123', 13, 'Tuesday'), 93 | (7, 'def456', 14, 'Wednesday'), 94 | (8, 'ghi789', 8, 'Thursday'), 95 | (9, 'jkl012', 21, 'Saturday'), 96 | (10, 'mno345', 24, 'Friday'), 97 | (11, 'xyz123', 10, 'Saturday'), 98 | (12, 'uvw456', 2, 'Monday') 99 | ; 100 | 101 | INSERT INTO public.neighbourhood (id, business_id, neighbourhood_name) VALUES 102 | (1, 'abc123', 'Downtown'), 103 | (2, 'def456', 'Midtown'), 104 | (3, 'ghi789', 'Hollywood'), 105 | (4, 'jkl012', 'Downtown'), 106 | (5, 'mno345', 'Upper East Side'), 107 | (6, 'xyz123', 'Downtown'), 108 | (7, 'uvw456', 'Midtown') 109 | ; 110 | 111 | INSERT INTO public.review (rid, business_id, user_id, rating, text, year, month) VALUES 112 | (1, 'abc123', '1', 4.5, 'Great pizza!', 2021, 'January'), 113 | (2, 'def456', '2', 4.2, 'Delicious food.', 2021, 'February'), 114 | (3, 'ghi789', '3', 3.9, 'Average diner.', 2021, 'March'), 115 | (4, 'jkl012', '4', 4.8, 'Amazing bistro.', 2021, 'April'), 116 | (5, 'mno345', '5', 4.6, 'Yummy bakery.', 2021, 'January'), 117 | (6, 'ghi789', '1', 1.2, 'Horrible staff!', 2021, 'April'), 118 | (7, 'def456', '2', 4.9, 'Second visit. 
I’m loving it.', 2021, 'May'), 119 | (8, 'xyz123', '3', 0.5, 'Hate it', 2021, 'June'), 120 | (9, 'uvw456', '4', 4.0, 'Not bad.', 2021, 'July'), 121 | (10, 'abc123', '5', 4.6, 'Very goody.', 2022, 'January'), 122 | (11, 'def456', '1', 3.0, 'Average', 2022, 'February'), 123 | (12, 'ghi789', '2', 4.0, 'Not bad.', 2022, 'March'), 124 | (13, 'jkl012', '3', 4.5, 'Second time here.', 2022, 'April'), 125 | (14, 'mno345', '4', 4.6, 'Third time here.', 2022, 'May'), 126 | (15, 'xyz123', '5', 3.5, 'Wont come again.', 2022, 'June'), 127 | (16, 'uvw456', '1', 4.0, 'Quite good.', 2022, 'July'), 128 | (17, 'mno345', '2', 4.6, 'Superb.', 2022, 'July'), 129 | (18, 'jkl012', '3', 5.0, 'WOwowow.', 2022, 'August'), 130 | (19, 'jkl012', '4', 4.8, 'Lovin it.', 2022, 'September'), 131 | (20, 'ghi789', '5', 1.5, 'Worst experience ever.', EXTRACT(YEAR FROM CURRENT_DATE - INTERVAL '15 months'), TO_CHAR(CURRENT_DATE - INTERVAL '15 months', 'Month')), 132 | (21, 'abc123', '1', 4.6, 'Very goody.', EXTRACT(YEAR FROM CURRENT_DATE - INTERVAL '9 months'), TO_CHAR(CURRENT_DATE - INTERVAL '9 months', 'Month')), 133 | (22, 'def456', '2', 3.0, 'Average', EXTRACT(YEAR FROM CURRENT_DATE - INTERVAL '8 months'), TO_CHAR(CURRENT_DATE - INTERVAL '8 months', 'Month')), 134 | (23, 'ghi789', '3', 4.0, 'Not bad.', EXTRACT(YEAR FROM CURRENT_DATE - INTERVAL '7 months'), TO_CHAR(CURRENT_DATE - INTERVAL '7 months', 'Month')) 135 | ; 136 | 137 | INSERT INTO public.tip (tip_id, business_id, text, user_id, likes, year, month) VALUES 138 | (1, 'abc123', 'Try their pepperoni pizza!', '1', NULL, 2021, 'January'), 139 | (2, 'def456', 'Their coffee is amazing.', '2', NULL, 2021, 'February'), 140 | (3, 'ghi789', 'The pancakes are delicious.', '3', NULL, 2021, 'March'), 141 | (4, 'jkl012', 'Highly recommend the steak.', '4', NULL, 2021, 'April'), 142 | (5, 'mno345', 'Their pastries are to die for.', '5', NULL, 2021, 'May'), 143 | (6, 'xyz123', 'Don’t waste your money.', '1', NULL, 2021, 'June'), 144 | (7, 'uvw456', 'Not 
bad.', '2', NULL, 2021, 'July'), 145 | (8, 'mno345', 'Get the blueberry pancakes!', '1', NULL, 2022, 'January'), 146 | (9, 'abc123', 'Try their pepperoni pizza!', '1', NULL, 2022, 'January'), 147 | (10, 'def456', 'Their coffee is amazing.', '2', NULL, 2022, 'February'), 148 | (11, 'ghi789', 'The pancakes are delicious.', '3', NULL, 2022, 'March'), 149 | (12, 'jkl012', 'Highly recommend the steak.', '4', NULL, 2022, 'April'), 150 | (13, 'mno345', 'Their pastries are to die for.', '5', NULL, 2022, 'May'), 151 | (14, 'xyz123', 'Don’t waste your money.', '1', NULL, 2022, 'June'), 152 | (15, 'uvw456', 'So-so.', '2', NULL, 2022, 'July'), 153 | (16, 'mno345', 'Second time having blueberry pancakes!', '1', NULL, 2022, 'July'), 154 | (17, 'jkl012', 'Great happy hour deals.', '5', NULL, 2022, 'August'), 155 | (18, 'jkl012', 'Ask for extra sauce.', '3', NULL, 2022, 'September'), 156 | (19, 'ghi789', 'Friendly staff.', '4', NULL, 2022, 'October'), 157 | (20, 'def456', 'Tasty lattes.', '4', NULL, 2022, 'November'), 158 | (21, 'abc123', 'Fresh ingredients.', '2', NULL, 2022, 'December') 159 | ; 160 | 161 | INSERT INTO public.users (uid, user_id, name) VALUES 162 | (1, '1', 'John Doe'), 163 | (2, '2', 'Jane Smith'), 164 | (3, '3', 'David Johnson'), 165 | (4, '4', 'Sarah Williams'), 166 | (5, '5', 'Michael Brown') 167 | ; 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/defog-ai/defog-data/856295d8f0aa8a0b0fb71b9623e86f363469797a/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="defog-data", 5 | version="0.1.2", 6 | description="Static SQL and JSON files containing the data we use for 
evaluations", 7 | author="Defog", 8 | author_email="support@defog.ai", 9 | packages=find_packages(), 10 | package_data={ 11 | "": [ 12 | "academic/*", 13 | "advising/*", 14 | "atis/*", 15 | "geography/*", 16 | "restaurants/*", 17 | "scholar/*", 18 | "yelp/*", 19 | ] 20 | }, 21 | include_package_data=True, 22 | ) 23 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | # get arguments 4 | # if $@ is empty, set it to all of our db's 5 | if [ -z "$@" ]; then 6 | set -- academic advising atis broker car_dealership derm_treatment ewallet geography restaurants scholar yelp 7 | fi 8 | # $@ is all arguments passed to the script 9 | echo "Databases to init: $@" 10 | 11 | # get each folder name in data/export 12 | for db_name in "$@"; do 13 | echo "dropping and recreating database ${db_name}" 14 | # drop and recreate database 15 | PGPASSWORD="${DBPASSWORD:-postgres}" psql -U "${DBUSER:-postgres}" -h "${DBHOST:-localhost}" -p "${DBPORT:-5432}" -c "DROP DATABASE IF EXISTS ${db_name};" 16 | PGPASSWORD="${DBPASSWORD:-postgres}" psql -U "${DBUSER:-postgres}" -h "${DBHOST:-localhost}" -p "${DBPORT:-5432}" -c "CREATE DATABASE ${db_name};" 17 | echo "done dropping and recreating database ${db_name}" 18 | db_path="defog_data/${db_name}/${db_name}.sql" 19 | echo "importing ${db_path} into database ${db_name}" 20 | PGPASSWORD="${DBPASSWORD:-postgres}" psql -U "${DBUSER:-postgres}" -h "${DBHOST:-localhost}" -p "${DBPORT:-5432}" -d "${db_name}" -f "${db_path}" 21 | done 22 | -------------------------------------------------------------------------------- /setup_snowflake.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | # get arguments 4 | # if there are no arguments, set them to a default list 5 | if [ $# -eq 0 ]; then 6 | set -- academic advising atis geography restaurants scholar yelp broker 
car_dealership derm_treatment ewallet 7 | fi 8 | echo "Databases to init: $@" 9 | 10 | for db_name in "$@"; do 11 | echo "dropping and recreating database ${db_name}" 12 | # drop and recreate database 13 | snowsql -q "DROP DATABASE IF EXISTS ${db_name}; CREATE DATABASE ${db_name};" -o exit_on_error=true 14 | echo "done dropping and recreating database ${db_name}" 15 | db_path="defog_data/${db_name}/${db_name}.sql" 16 | echo "importing ${db_path} into database ${db_name}" 17 | snowsql -d "${db_name}" -s public -f "${db_path}" 18 | done -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from defog_data.metadata import get_db, dbs 4 | 5 | 6 | class TestDB(unittest.TestCase): 7 | def test_load_all_in_diff_dir(self): 8 | # get current directory 9 | test_dir = os.getcwd() 10 | # cd to /tmp and attempt to load a db 11 | os.chdir("/tmp") 12 | all_db_names = [ 13 | "academic", 14 | "advising", 15 | "atis", 16 | "broker", 17 | "car_dealership", 18 | "derm_treatment", 19 | "ewallet", 20 | "geography", 21 | "restaurants", 22 | "scholar", 23 | "yelp", 24 | ] 25 | for db_name in all_db_names: 26 | db = get_db(db_name) 27 | db_schema = db["table_metadata"] 28 | assert len(db_schema) > 0 29 | assert "glossary" in db 30 | os.chdir(test_dir) 31 | 32 | def dbs_exist(self): 33 | assert len(dbs) == 11 34 | 35 | # check that all the tables exist in each db 36 | def test_academic(self): 37 | db_name = "academic" 38 | db_schema = get_db(db_name)["table_metadata"] 39 | expected_tables = [ 40 | "cite", 41 | "author", 42 | "domain", 43 | "writes", 44 | "journal", 45 | "keyword", 46 | "conference", 47 | "publication", 48 | "organization", 49 | "domain_author", 50 | "domain_journal", 51 | "domain_keyword", 52 | "domain_conference", 53 | "domain_publication", 54 | "publication_keyword", 55 | ] 56 | 
self.assertEqual(list(db_schema.keys()), expected_tables) 57 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 58 | self.assertEqual(num_columns, 42) 59 | 60 | def test_advising(self): 61 | db_name = "advising" 62 | db_schema = get_db(db_name)["table_metadata"] 63 | expected_tables = [ 64 | "area", 65 | "course", 66 | "program", 67 | "student", 68 | "semester", 69 | "instructor", 70 | "program_course", 71 | "student_record", 72 | "course_offering", 73 | "course_tags_count", 74 | "comment_instructor", 75 | "course_prerequisite", 76 | "offering_instructor", 77 | "program_requirement", 78 | ] 79 | self.assertEqual(list(db_schema.keys()), expected_tables) 80 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 81 | self.assertEqual(num_columns, 109) 82 | 83 | def test_atis(self): 84 | db_name = "atis" 85 | db_schema = get_db(db_name)["table_metadata"] 86 | expected_tables = [ 87 | "city", 88 | "days", 89 | "fare", 90 | "month", 91 | "state", 92 | "flight", 93 | "airline", 94 | "airport", 95 | "aircraft", 96 | "time_zone", 97 | "fare_basis", 98 | "flight_leg", 99 | "flight_fare", 100 | "flight_stop", 101 | "restriction", 102 | "dual_carrier", 103 | "food_service", 104 | "time_interval", 105 | "ground_service", 106 | "airport_service", 107 | "class_of_service", 108 | "code_description", 109 | "compartment_class", 110 | "equipment_sequence", 111 | ] 112 | self.assertEqual(list(db_schema.keys()), expected_tables) 113 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 114 | self.assertEqual(num_columns, 127) 115 | 116 | def test_broker(self): 117 | db_name = "broker" 118 | db_schema = get_db(db_name)["table_metadata"] 119 | expected_tables = [ 120 | "sbCustomer", 121 | "sbTicker", 122 | "sbDailyPrice", 123 | "sbTransaction", 124 | ] 125 | self.assertEqual(list(db_schema.keys()), expected_tables) 126 | glossary = get_db(db_name)["glossary"] 127 | expected_glossary = """- sbTicker can be joined to sbDailyPrice on sbTickerId 
128 | - sbCustomer can be joined to sbTransaction on sbCustId 129 | - sbTicker can be joined to sbTransaction on sbTickerId 130 | - ADV (Average Daily Volume) for a ticker = AVG(sbDpVolume) from sbDailyPrice table for that ticker 131 | - ATH (All Time High) price for a ticker = MAX(sbDpHigh) from sbDailyPrice table for that ticker 132 | - ATP (Average Transaction Price) for a customer = SUM(sbTxAmount)/SUM(sbTxShares) from sbTransaction table for that customer 133 | - NCT (Net Commission Total) = SUM(sbTxCommission) from sbTransaction table""" 134 | self.assertEqual(glossary, expected_glossary) 135 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 136 | self.assertEqual(num_columns, 43) 137 | 138 | def test_car_dealership(self): 139 | db_name = "car_dealership" 140 | db_schema = get_db(db_name)["table_metadata"] 141 | expected_tables = [ 142 | "cars", 143 | "salespersons", 144 | "customers", 145 | "sales", 146 | "inventory_snapshots", 147 | "payments_received", 148 | "payments_made", 149 | ] 150 | self.assertEqual(list(db_schema.keys()), expected_tables) 151 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 152 | self.assertEqual(num_columns, 55) 153 | 154 | def test_derm_treatment(self): 155 | db_name = "derm_treatment" 156 | db_schema = get_db(db_name)["table_metadata"] 157 | expected_tables = [ 158 | "doctors", 159 | "patients", 160 | "drugs", 161 | "diagnoses", 162 | "treatments", 163 | "outcomes", 164 | "adverse_events", 165 | "concomitant_meds", 166 | ] 167 | self.assertEqual(list(db_schema.keys()), expected_tables) 168 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 169 | self.assertEqual(num_columns, 81) 170 | 171 | def test_ewallet(self): 172 | db_name = "ewallet" 173 | db_schema = get_db(db_name)["table_metadata"] 174 | expected_tables = [ 175 | "consumer_div.users", 176 | "consumer_div.merchants", 177 | "consumer_div.coupons", 178 | "consumer_div.wallet_transactions_daily", 179 | 
"consumer_div.wallet_user_balance_daily", 180 | "consumer_div.wallet_merchant_balance_daily", 181 | "consumer_div.notifications", 182 | "consumer_div.user_sessions", 183 | "consumer_div.user_setting_snapshot", 184 | ] 185 | self.assertEqual(list(db_schema.keys()), expected_tables) 186 | glossary = get_db(db_name)["glossary"] 187 | expected_glossary = """- sender_id and receiver_id can be joined with either users.uid or merchants.mid depending on the sender_type/receiver_type 188 | - if a user applied a coupon to a purchase, there will be 2 rows in wallet_transactions_daily: 189 | - 1st row where coupon_id is NULL, amount = purchase value 190 | - 2nd row where coupon_id is NOT NULL, amount = coupon value applied 191 | - the sender and receiver id will be the same for both rows, but they will have different txid's 192 | - when using coupons.code, wallet_transactions_daily.gateway_name, filter case insensitively 193 | - Total Transaction Volume (TTV) = SUM(wallet_transactions_daily.amount) 194 | - Total Coupon Discount Redeemed (TCDR) = SUM(wallet_transactions_daily.amount) WHERE coupon_id IS NOT NULL 195 | - Session Density = COUNT(user_sessions.user_id) / COUNT(DISTINCT user_sessions.user_id) 196 | - Active Merchants Percentage (APM) = COUNT(DISTINCT CASE WHEN sender_type = 1 THEN wallet_transactions_daily.sender_id WHEN receiver_type = 1 THEN wallet_transactions_daily.receiver_id ELSE NULL END) / COUNT(DISTINCT merchants.mid)""" 197 | print(glossary) 198 | self.assertEqual(glossary, expected_glossary) 199 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 200 | self.assertEqual(num_columns, 96) 201 | 202 | def test_geography(self): 203 | db_name = "geography" 204 | db_schema = get_db(db_name)["table_metadata"] 205 | expected_tables = [ 206 | "city", 207 | "lake", 208 | "river", 209 | "state", 210 | "highlow", 211 | "mountain", 212 | "border_info", 213 | ] 214 | self.assertEqual(list(db_schema.keys()), expected_tables) 215 | num_columns = 
sum([len(db_schema[table]) for table in db_schema]) 216 | self.assertEqual(num_columns, 29) 217 | 218 | def test_restaurants(self): 219 | db_name = "restaurants" 220 | db_schema = get_db(db_name)["table_metadata"] 221 | expected_tables = ["location", "geographic", "restaurant"] 222 | self.assertEqual(list(db_schema.keys()), expected_tables) 223 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 224 | self.assertEqual(num_columns, 12) 225 | 226 | def test_scholar(self): 227 | db_name = "scholar" 228 | db_schema = get_db(db_name)["table_metadata"] 229 | expected_tables = [ 230 | "cite", 231 | "field", 232 | "paper", 233 | "venue", 234 | "author", 235 | "writes", 236 | "dataset", 237 | "journal", 238 | "keyphrase", 239 | "paperfield", 240 | "paperdataset", 241 | "paperkeyphrase", 242 | ] 243 | self.assertEqual(list(db_schema.keys()), expected_tables) 244 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 245 | self.assertEqual(num_columns, 28) 246 | 247 | def test_yelp(self): 248 | db_name = "yelp" 249 | db_schema = get_db(db_name)["table_metadata"] 250 | expected_tables = [ 251 | "tip", 252 | "users", 253 | "review", 254 | "checkin", 255 | "business", 256 | "category", 257 | "neighbourhood", 258 | ] 259 | self.assertEqual(list(db_schema.keys()), expected_tables) 260 | num_columns = sum([len(db_schema[table]) for table in db_schema]) 261 | self.assertEqual(num_columns, 36) 262 | 263 | 264 | if __name__ == "__main__": 265 | unittest.main() 266 | -------------------------------------------------------------------------------- /translate_ddl_dialect.py: -------------------------------------------------------------------------------- 1 | # This script reads the DDL statements from the Postgres sql files and translates them to the specified dialect. 2 | # It then creates the databases and tables from the translated DDL statements. 3 | # Finally, it queries the database to check that values were insert into tables. 
4 | 5 | import os 6 | import time 7 | from tqdm import tqdm 8 | import argparse 9 | from utils_dialects import ( 10 | create_bq_db, 11 | create_mysql_db, 12 | create_sqlite_db, 13 | create_tsql_db, 14 | test_query_db, 15 | conv_ddl_to_dialect, 16 | ) 17 | 18 | # List of databases to create 19 | db_names = [ 20 | "academic", 21 | "advising", 22 | "atis", 23 | "broker", 24 | "car_dealership", 25 | "derm_treatment", 26 | "ewallet", 27 | "geography", 28 | "restaurants", 29 | "scholar", 30 | "yelp", 31 | ] 32 | bigquery_proj = os.getenv("BIGQUERY_PROJ") 33 | 34 | # For testing that values were inserted into tables, format: (db_name, table_name) 35 | test_queries = [ 36 | ("academic", "writes"), 37 | ("advising", "student_record"), 38 | ("atis", "time_zone"), 39 | ("broker", "sbTransaction"), 40 | ("car_dealership", "payments_made"), 41 | ("derm_treatment", "concomitant_meds"), 42 | ("ewallet", "consumer_div.user_setting_snapshot"), 43 | ("geography", "state"), 44 | ("restaurants", "restaurant"), 45 | ("scholar", "writes"), 46 | ("yelp", "users"), 47 | ] 48 | 49 | 50 | # Run the main function 51 | def translate(dialects): 52 | for dialect in tqdm(dialects): 53 | print(f"Translating DDL to {dialect} dialect...") 54 | for db_name in tqdm(db_names): 55 | conv_ddl_to_dialect(db_name, dialect) 56 | if dialect == "bigquery": 57 | create_bq_db(bigquery_proj, db_name) 58 | time.sleep(10) 59 | elif dialect == "mysql": 60 | create_mysql_db(db_name) 61 | elif dialect == "sqlite": 62 | create_sqlite_db(db_name) 63 | elif dialect == "tsql": 64 | create_tsql_db(db_name) 65 | tries = 0 66 | while tries < 20: 67 | try: 68 | test_query_db(db_name, dialect, test_queries) 69 | break 70 | except Exception as e: 71 | if "not found" in str(e): 72 | # print(f"Table not found. 
Retrying...") 73 | tries += 1 74 | continue 75 | else: 76 | break 77 | 78 | 79 | if __name__ == "__main__": 80 | dialects = [ 81 | "bigquery", 82 | "mysql", 83 | "sqlite", 84 | "tsql", 85 | ] # Supported dialects: bigquery, mysql, sqlite, tsql 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument( 88 | "--dialects", 89 | nargs="+", 90 | default=dialects, 91 | help="List of dialects to translate the DDL statements to", 92 | ) 93 | args = parser.parse_args() 94 | if args.dialects: 95 | translate(args.dialects) 96 | else: 97 | translate(dialects) 98 | --------------------------------------------------------------------------------