├── .dockerignore ├── MANIFEST.in ├── .github ├── titleLint.yml ├── CODEOWNERS ├── workflows │ ├── license.yml │ ├── pypipublish.yml │ └── pull_request.yml ├── stale.yml └── PULL_REQUEST_TEMPLATE.md ├── example ├── sample_data │ ├── sample_tags.csv │ ├── sample_schema_description.csv │ ├── sample_source.csv │ ├── sample_table_last_updated.csv │ ├── sample_table_owner.csv │ ├── sample_dashboard_owner.csv │ ├── sample_dashboard_last_modified.csv │ ├── sample_dashboard_usage.csv │ ├── sample_application.csv │ ├── sample_dashboard_table.csv │ ├── sample_dashboard_query.csv │ ├── sample_watermark.csv │ ├── sample_dashboard_last_execution.csv │ ├── sample_table.csv │ ├── sample_user.csv │ ├── sample_table_programmatic_source.csv │ ├── sample_col.csv │ ├── sample_table_column_stats.csv │ ├── sample_column_usage.csv │ └── sample_dashboard_base.csv └── __init__.py ├── tests ├── __init__.py └── unit │ ├── __init__.py │ ├── task │ └── __init__.py │ ├── callback │ ├── __init__.py │ └── test_call_back.py │ ├── extractor │ ├── __init__.py │ ├── user │ │ ├── __init__.py │ │ └── bamboohr │ │ │ ├── __init__.py │ │ │ └── test_bamboohr_user_extractor.py │ ├── dashboard │ │ ├── __init__.py │ │ ├── redash │ │ │ └── __init__.py │ │ ├── tableau │ │ │ ├── __init__.py │ │ │ ├── test_tableau_dashboard_query_extractor.py │ │ │ ├── test_tableau_dashboard_last_modified_extractor.py │ │ │ └── test_tableau_dashboard_extractor.py │ │ └── mode_analytics │ │ │ ├── __init__.py │ │ │ └── batch │ │ │ ├── __init__.py │ │ │ └── test_mode_dashboard_charts_batch_extractor.py │ ├── restapi │ │ ├── __init__.py │ │ └── test_rest_api_extractor.py │ ├── test_neo4j_es_last_updated_extractor.py │ ├── test_csv_extractor.py │ ├── test_generic_extractor.py │ └── test_kafka_source_extractor.py │ ├── loader │ ├── __init__.py │ └── test_generic_loader.py │ ├── models │ ├── __init__.py │ ├── schema │ │ └── __init__.py │ ├── dashboard │ │ ├── __init__.py │ │ ├── test_dashboard_owner.py │ │ ├── test_dashboard_query.py │ │ ├── test_dashboard_last_modified.py │ │ └── test_dashboard_usage.py │ ├── test_neo4j_es_last_updated.py │ ├── test_metric_elasticsearch_document.py │ ├── test_table_column_usage.py │ ├── test_user_elasticsearch_document.py │ ├── test_dashboard_elasticsearch_document.py │ ├── test_table_elasticsearch_document.py │ ├── test_table_lineage.py │ ├── test_table_stats.py │ ├── test_table_source.py │ └── test_table_last_updated.py │ ├── publisher │ ├── __init__.py │ └── test_publisher.py │ ├── rest_api │ ├── __init__.py │ ├── mode_analytics │ │ └── __init__.py │ └── test_rest_api_failure_handlers.py │ ├── usage │ ├── __init__.py │ └── presto │ │ └── __init__.py │ ├── filesystem │ ├── __init__.py │ └── test_filesystem.py │ ├── resources │ ├── fs_neo4j_csv_loader │ │ ├── movies │ │ │ ├── nodes │ │ │ │ ├── Movie_0.csv │ │ │ │ ├── City_0.csv │ │ │ │ └── Actor_0.csv │ │ │ └── relationships │ │ │ │ ├── test_Movie_Actor_ACTOR.csv │ │ │ │ └── test_Movie_City_FILMED_AT.csv │ │ └── people │ │ │ └── nodes │ │ │ ├── Person_0.csv │ │ │ └── Person_1.csv │ ├── csv_publisher │ │ ├── nodes │ │ │ ├── test_table.csv │ │ │ └── test_column.csv │ │ └── relations │ │ │ └── test_edge_short.csv │ └── extractor │ │ └── user │ │ └── bamboohr │ │ └── testdata.xml │ └── transformer │ ├── __init__.py │ ├── test_template_variable_substitution_transformer.py │ ├── test_dict_to_model_transformer.py │ ├── test_remove_field_transformer.py │ ├── test_timestamp_string_to_epoch_transformer.py │ ├── test_chained_transformer.py │ ├── test_regex_str_replace_transformer.py │ 
├── test_bigquery_usage_transformer.py │ └── test_table_tag_transformer.py ├── databuilder ├── job │ ├── __init__.py │ └── base_job.py ├── callback │ ├── __init__.py │ └── call_back.py ├── loader │ ├── __init__.py │ ├── base_loader.py │ ├── generic_loader.py │ ├── file_system_csv_loader.py │ └── file_system_elasticsearch_json_loader.py ├── models │ ├── __init__.py │ ├── schema │ │ ├── __init__.py │ │ ├── schema_constant.py │ │ └── schema.py │ ├── usage │ │ ├── __init__.py │ │ └── usage_constants.py │ ├── cluster │ │ ├── __init__.py │ │ └── cluster_constants.py │ ├── dashboard │ │ ├── __init__.py │ │ └── dashboard_owner.py │ ├── timestamp │ │ ├── __init__.py │ │ └── timestamp_constants.py │ ├── owner_constants.py │ ├── graph_node.py │ ├── graph_relationship.py │ ├── presto_query_logs.py │ ├── elasticsearch_document.py │ ├── metric_elasticsearch_document.py │ ├── user_elasticsearch_document.py │ ├── dashboard_elasticsearch_document.py │ ├── neo4j_es_last_updated.py │ └── table_elasticsearch_document.py ├── rest_api │ ├── __init__.py │ ├── mode_analytics │ │ ├── __init__.py │ │ └── mode_paginated_rest_api_query.py │ ├── rest_api_failure_handlers.py │ └── base_rest_api_query.py ├── task │ ├── __init__.py │ ├── base_task.py │ └── task.py ├── utils │ ├── __init__.py │ └── closer.py ├── extractor │ ├── __init__.py │ ├── restapi │ │ ├── __init__.py │ │ └── rest_api_extractor.py │ ├── user │ │ ├── __init__.py │ │ └── bamboohr │ │ │ ├── __init__.py │ │ │ └── bamboohr_user_extractor.py │ ├── dashboard │ │ ├── __init__.py │ │ ├── redash │ │ │ └── __init__.py │ │ ├── tableau │ │ │ ├── __init__.py │ │ │ └── tableau_dashboard_constants.py │ │ └── mode_analytics │ │ │ ├── __init__.py │ │ │ ├── batch │ │ │ └── __init__.py │ │ │ ├── mode_dashboard_constants.py │ │ │ ├── mode_dashboard_usage_extractor.py │ │ │ ├── mode_dashboard_last_successful_executions_extractor.py │ │ │ └── mode_dashboard_last_modified_timestamp_extractor.py │ ├── table_metadata_constants.py │ ├── base_extractor.py │ ├── generic_extractor.py │ ├── neo4j_es_last_updated_extractor.py │ ├── postgres_metadata_extractor.py │ ├── sql_alchemy_extractor.py │ ├── db_api_extractor.py │ ├── redshift_metadata_extractor.py │ └── glue_extractor.py ├── filesystem │ ├── __init__.py │ └── metadata.py ├── publisher │ ├── __init__.py │ └── base_publisher.py ├── serializers │ ├── __init__.py │ └── neo4_serializer.py ├── transformer │ ├── __init__.py │ ├── remove_field_transformer.py │ ├── dict_to_model.py │ ├── generic_transformer.py │ ├── table_tag_transformer.py │ ├── template_variable_substitution_transformer.py │ ├── timestamp_string_to_epoch.py │ ├── regex_str_replace_transformer.py │ ├── bigquery_usage_transformer.py │ └── base_transformer.py └── __init__.py ├── NOTICE ├── docs └── assets │ ├── AmundsenDataBuilder.png │ └── dashboard_graph_modeling.png ├── .dependabot └── config.yml ├── CODE_OF_CONDUCT.md ├── .gitignore ├── Makefile ├── setup.cfg ├── requirements.txt └── setup.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /.github/titleLint.yml: -------------------------------------------------------------------------------- 1 | regex: (build|ci|docs|feat|fix|perf|refactor|style|test|chore|other): .* 
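(A minimal sketch, not part of the repository: the titleLint.yml rule above gates pull-request titles on a conventional-commit-style prefix. It assumes Python's re module, and the sample titles are made up.)

import re

# Pattern copied from .github/titleLint.yml above.
TITLE_PATTERN = re.compile(r'(build|ci|docs|feat|fix|perf|refactor|style|test|chore|other): .*')

assert TITLE_PATTERN.match('feat: add a Redshift metadata extractor')  # accepted
assert TITLE_PATTERN.match('docs: expand dashboard ingestion guide')   # accepted
assert TITLE_PATTERN.match('added some stuff') is None                 # rejected: no type prefix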
2 | -------------------------------------------------------------------------------- /example/sample_data/sample_tags.csv: -------------------------------------------------------------------------------- 1 | name,tag_type 2 | pii,default 3 | high_quality,default 4 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /example/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/job/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/task/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | amundsendatabuilder 2 | Copyright 2018-2019 Lyft Inc. 3 | 4 | This product includes software developed at Lyft Inc. 5 | -------------------------------------------------------------------------------- /databuilder/callback/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/loader/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/rest_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/task/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/callback/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/loader/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/publisher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/rest_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/usage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/schema/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/usage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/publisher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/serializers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/user/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/models/schema/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/nodes/Movie_0.csv: -------------------------------------------------------------------------------- 1 | "name","KEY","LABEL" 2 | "Top Gun","movie://Top Gun","Movie" 3 | -------------------------------------------------------------------------------- /tests/unit/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/usage/presto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/restapi/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/user/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/timestamp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /docs/assets/AmundsenDataBuilder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dropbox/amundsendatabuilder/master/docs/assets/AmundsenDataBuilder.png -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/restapi/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/user/bamboohr/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/rest_api/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/user/bamboohr/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/rest_api/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/redash/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/tableau/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /docs/assets/dashboard_graph_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dropbox/amundsendatabuilder/master/docs/assets/dashboard_graph_modeling.png -------------------------------------------------------------------------------- /example/sample_data/sample_schema_description.csv: -------------------------------------------------------------------------------- 1 | schema_key,schema,description 2 | hive://gold.test_schema,test_schema,"test schema description" -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/redash/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/people/nodes/Person_0.csv: -------------------------------------------------------------------------------- 1 | "name","job","KEY","LABEL" 2 | "Taylor","Engineer","person://Taylor","Person" 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/people/nodes/Person_1.csv: -------------------------------------------------------------------------------- 1 | "name","pet","KEY","LABEL" 2 | "Griffin","Lion","person://Griffin","Person" 3 | -------------------------------------------------------------------------------- /.dependabot/config.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | update_configs: 3 | - package_manager: "python" 4 | directory: "/" 5 | update_schedule: "monthly" 6 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/mode_analytics/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/nodes/City_0.csv: -------------------------------------------------------------------------------- 1 | "name","KEY","LABEL" 2 | "Top Gun","city://San Diego","City" 3 | "Top Gun","city://Oakland","City" 4 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/nodes/Actor_0.csv: -------------------------------------------------------------------------------- 1 | "name","KEY","LABEL" 2 | "Top Gun","actor://Tom Cruise","Actor" 3 | "Top Gun","actor://Meg Ryan","Actor" 4 | -------------------------------------------------------------------------------- /example/sample_data/sample_source.csv: -------------------------------------------------------------------------------- 1 | db_name,cluster,schema,table_name,source,source_type 2 | hive,gold,test_schema,test_table1,"https://github.com/amundsen-io/amundsen/",github 3 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_last_updated.csv: -------------------------------------------------------------------------------- 1 | cluster,db,schema,table_name,last_updated_time_epoch 2 | gold,hive,test_schema,test_table1,1570230473 3 | gold,dynamo,test_schema,test_table2,1070230473 4 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_owner.csv: -------------------------------------------------------------------------------- 1 | db_name,schema,cluster,table_name,owners 2 | hive,test_schema,gold,test_table1,"roald.amundsen@example.org,chrisc@example.org" 3 | dynamo,test_schema,gold,test_table2, 4 | -------------------------------------------------------------------------------- /tests/unit/resources/csv_publisher/nodes/test_table.csv: -------------------------------------------------------------------------------- 1 | "KEY","name","LABEL" 2 | "presto://gold.test_schema1/test_table1","test_table1","Table" 3 | "presto://gold.test_schema1/test_table2","test_table2","Table" 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | This project is governed by [Amundsen's code of conduct](https://github.com/amundsen-io/amundsen/blob/master/CODE_OF_CONDUCT.md). 2 | All contributors and participants agree to abide by its terms. 3 | -------------------------------------------------------------------------------- /databuilder/models/owner_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | OWNER_RELATION_TYPE = 'OWNER' 6 | OWNER_OF_OBJECT_RELATION_TYPE = 'OWNER_OF' 7 | -------------------------------------------------------------------------------- /databuilder/extractor/table_metadata_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # String for partition column badge 5 | PARTITION_BADGE = 'partition column' 6 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_owner.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,email 2 | mode,gold,test_group_id_1,test_dashboard_id_1,roald.amundsen@example.org 3 | mode,gold,test_group_id_2,test_dashboard_id_2,buzz@example.org -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_last_modified.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,last_modified_timestamp 2 | mode,gold,test_group_id_1,test_dashboard_id_1,1592351454 3 | mode,gold,test_group_id_2,test_dashboard_id_2,1592311423 -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_usage.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,view_count,email 2 | mode,gold,test_group_id_1,test_dashboard_id_1,100,roald.amundsen@example.org 3 | mode,gold,test_group_id_2,test_dashboard_id_2,2000,chrisc@example.org -------------------------------------------------------------------------------- /databuilder/models/usage/usage_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | READ_RELATION_TYPE = 'READ' 5 | READ_REVERSE_RELATION_TYPE = 'READ_BY' 6 | 7 | READ_RELATION_COUNT_PROPERTY = 'read_count' 8 | -------------------------------------------------------------------------------- /tests/unit/resources/csv_publisher/nodes/test_column.csv: -------------------------------------------------------------------------------- 1 | "KEY","name","order_pos:UNQUOTED","type","LABEL" 2 | "presto://gold.test_schema1/test_table1/test_id1","test_id1",1,"bigint","Column" 3 | "presto://gold.test_schema1/test_table1/test_id2","test_id2",2,"bigint","Column" 4 | -------------------------------------------------------------------------------- /example/sample_data/sample_application.csv: -------------------------------------------------------------------------------- 1 | task_id,dag_id,exec_date,application_url_template,db_name,schema,table_name,cluster 2 | hive.test_schema.test_table1,event_test,"2018-05-31T00:00:00","https://airflow_host.net/admin/airflow/tree?dag_id={dag_id}",hive,test_schema,test_table1,gold 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | *.pyo 4 | *.pyt 5 | *.pytc 6 | *.egg-info 7 | .*.swp 8 | .DS_Store 9 | build/ 10 | dist/ 11 | venv/ 12 | venv3/ 13 | .python-version 14 | .cache/ 15 | .env 16 | .idea/ 17 | .vscode/ 18 | .coverage 19 | .mypy_cache 20 | .pytest_cache 21 | **/coverage.xml 22 | **/htmlcov/** 23 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_table.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,table_ids 2 | 
mode,gold,test_group_id_1,test_dashboard_id_1,"hive://gold.test_schema/test_table1" 3 | mode,gold,test_group_id_2,test_dashboard_id_2,"hive://gold.test_schema/test_view1,hive://gold.test_schema/test_table3" 4 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/relationships/test_Movie_Actor_ACTOR.csv: -------------------------------------------------------------------------------- 1 | "END_KEY","START_LABEL","END_LABEL","START_KEY","TYPE","REVERSE_TYPE" 2 | "actor://Tom Cruise","Movie","Actor","movie://Top Gun","ACTOR","ACTED_IN" 3 | "actor://Meg Ryan","Movie","Actor","movie://Top Gun","ACTOR","ACTED_IN" 4 | -------------------------------------------------------------------------------- /databuilder/models/cluster/cluster_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | CLUSTER_NODE_LABEL = 'Cluster' 5 | 6 | CLUSTER_RELATION_TYPE = 'CLUSTER' 7 | CLUSTER_REVERSE_RELATION_TYPE = 'CLUSTER_OF' 8 | 9 | CLUSTER_NAME_PROP_KEY = 'name' 10 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/relationships/test_Movie_City_FILMED_AT.csv: -------------------------------------------------------------------------------- 1 | "END_KEY","START_LABEL","END_LABEL","START_KEY","TYPE","REVERSE_TYPE" 2 | "city://San Diego","Movie","City","city://Top Gun","FILMED_AT","APPEARS_IN" 3 | "city://Oakland","Movie","City","city://Top Gun","FILMED_AT","APPEARS_IN" 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | find . -name \*.pyc -delete 3 | find . -name __pycache__ -delete 4 | rm -rf dist/ 5 | 6 | .PHONY: test_unit 7 | test_unit: 8 | python3 -bb -m pytest tests 9 | 10 | lint: 11 | flake8 . 12 | 13 | .PHONY: mypy 14 | mypy: 15 | mypy . 16 | 17 | .PHONY: test 18 | test: test_unit lint mypy 19 | 20 | -------------------------------------------------------------------------------- /databuilder/models/graph_node.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from collections import namedtuple 5 | 6 | GraphNode = namedtuple( 7 | 'GraphNode', 8 | [ 9 | 'key', 10 | 'label', 11 | 'attributes' 12 | ] 13 | ) 14 | -------------------------------------------------------------------------------- /databuilder/models/schema/schema_constant.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | SCHEMA_NODE_LABEL = 'Schema' 5 | 6 | SCHEMA_NAME_ATTR = 'name' 7 | 8 | SCHEMA_RELATION_TYPE = 'SCHEMA' 9 | SCHEMA_REVERSE_RELATION_TYPE = 'SCHEMA_OF' 10 | 11 | DATABASE_SCHEMA_KEY_FORMAT = '{db}://{cluster}.{schema}' 12 | -------------------------------------------------------------------------------- /tests/unit/resources/csv_publisher/relations/test_edge_short.csv: -------------------------------------------------------------------------------- 1 | "START_LABEL","START_KEY","END_LABEL","END_KEY","TYPE","REVERSE_TYPE" 2 | "Table","presto://gold.test_schema1/test_table1","Column","presto://gold.test_schema1/test_table1/test_id1","COLUMN","BELONG_TO_TABLE" 3 | "Table","presto://gold.test_schema1/test_table1","Column","presto://gold.test_schema1/test_table1/test_id2","COLUMN","BELONG_TO_TABLE" 4 | -------------------------------------------------------------------------------- /databuilder/models/graph_relationship.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from collections import namedtuple 5 | 6 | GraphRelationship = namedtuple( 7 | 'GraphRelationship', 8 | [ 9 | 'start_label', 10 | 'end_label', 11 | 'start_key', 12 | 'end_key', 13 | 'type', 14 | 'reverse_type', 15 | 'attributes' 16 | ] 17 | ) 18 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_query.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,query_name,query_id,url,query_text 2 | mode,gold,test_group_id_1,test_dashboard_id_1,first query,query_1,http://mode.test_group_id_1.com/test_dashboard_id_1/query/query_1,SELECT * FROM foo.bar 3 | mode,gold,test_group_id_2,test_dashboard_id_2,second query,query_2,http://mode.test_group_id_2.com/test_dashboard_id_2/query/query_2,SELECT * FROM bar.foo JOIN foo.bar USING (baz) -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | ORGANIZATION = 'organization' 5 | MODE_ACCESS_TOKEN = 'mode_user_token' 6 | MODE_PASSWORD_TOKEN = 'mode_password_token' 7 | 8 | # this token is needed to access the batch discovery endpoint 9 | # e.g. https://mode.com/developer/discovery-api/introduction/ 10 | MODE_BEARER_TOKEN = 'mode_bearer_token' 11 | -------------------------------------------------------------------------------- /example/sample_data/sample_watermark.csv: -------------------------------------------------------------------------------- 1 | create_time,database,schema,table_name,part_name,part_type,cluster 2 | 2019-10-01T12:13:14,hive,test_schema,test_table1,col3=2017-04-22/col4=0,low_watermark,gold 3 | 2019-10-01T12:13:14,hive,test_schema,test_table1,col3=2019-09-30/col4=11,high_watermark,gold 4 | 2019-10-01T12:13:14,dynamo,test_schema,test_table2,col3=2018-01-01,low_watermark,gold 5 | 2019-10-01T12:13:14,dynamo,test_schema,test_table2,col3=2019-10-01,high_watermark,gold 6 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_last_execution.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,execution_id,execution_timestamp,execution_state 2 | mode,gold,test_group_id_1,test_dashboard_id_1,_last_successful_execution,1592351193,success 3 | mode,gold,test_group_id_2,test_dashboard_id_2,_last_successful_execution,1592351210,success 4 | mode,gold,test_group_id_1,test_dashboard_id_1,_last_execution,1593351193,fail 5 | mode,gold,test_group_id_2,test_dashboard_id_2,_last_execution,1594351210,success -------------------------------------------------------------------------------- /example/sample_data/sample_table.csv: -------------------------------------------------------------------------------- 1 | database,cluster,schema,name,description,tags,is_view,description_source 2 | hive,gold,test_schema,test_table1,"1st test table","tag1,tag2,pii,high_quality",false, 3 | dynamo,gold,test_schema,test_table2,"2nd test table","high_quality,recommended",false, 4 | hive,gold,test_schema,test_view1,"1st test view","tag1",true, 5 | hive,gold,test_schema,test_table3,"3rd test","needs_documentation",false, 6 | hive,gold,test_schema,"test's_table4","4th test","needs_documentation",false, 7 | -------------------------------------------------------------------------------- /example/sample_data/sample_user.csv: -------------------------------------------------------------------------------- 1 | email,first_name,last_name,full_name,github_username,team_name,employee_type,manager_email,slack_id,role_name 2 | roald.amundsen@example.org,Roald,Amundsen,"Roald Amundsen",amundsen-io,"Team Amundsen",sailor,"phboss@example.org",ramundzn,swe 3 | chrisc@example.org,Christopher,Columbus,"Christopher Columbus",ChristopherColumbusFAKE,"Team Amundsen",sailor,"phboss@example.org",chrisc,swe 4 | buzz@example.org,Buzz,Aldrin,"Buzz Aldrin",BuzzAldrinFAKE,"Team Amundsen",astronaut,"phboss@example.org",buzz,swe 5 | -------------------------------------------------------------------------------- /databuilder/models/presto_query_logs.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | class PrestoQueryLogs: 6 | """ 7 | Presto query logs model. 8 | SQL result has one row per Presto query.
9 | """ 10 | 11 | def __init__(self, 12 | user: str, 13 | query_text: str, 14 | occurred_at: str 15 | ) -> None: 16 | self.user = user 17 | self.query_text = query_text 18 | self.occurred_at = occurred_at 19 | -------------------------------------------------------------------------------- /databuilder/models/timestamp/timestamp_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from enum import Enum 5 | 6 | NODE_LABEL = 'Timestamp' 7 | 8 | TIMESTAMP_PROPERTY = 'timestamp' 9 | TIMESTAMP_NAME_PROPERTY = 'name' 10 | # This is deprecated property as it's not generic for the Timestamp 11 | DEPRECATED_TIMESTAMP_PROPERTY = 'last_updated_timestamp' 12 | 13 | 14 | LASTUPDATED_RELATION_TYPE = 'LAST_UPDATED_AT' 15 | LASTUPDATED_REVERSE_RELATION_TYPE = 'LAST_UPDATED_TIME_OF' 16 | 17 | 18 | class TimestampName(Enum): 19 | last_updated_timestamp = 1 20 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_programmatic_source.csv: -------------------------------------------------------------------------------- 1 | database,cluster,schema,name,description,tags,description_source 2 | hive,gold,test_schema,test_table1,"**Size**: 50T 3 | 4 | **Monthly Cost**: $5000","expensive","s3_crawler" 5 | dynamo,gold,test_schema,test_table2,"**Size**: 1T 6 | 7 | **Monthly Cost**: $50","cheap","s3_crawler" 8 | hive,gold,test_schema,test_table1,"### Quality Report: 9 | --- 10 | Ipsus enom. Ipsus enom ipsus lorenum. 11 | --- 12 | [![Build Status](https://api.travis-ci.com/amundsen-io/amundsendatabuilder.svg?branch=master)](https://travis-ci.com/amundsen-io/amundsendatabuilder)","low_quality","quality_service" 13 | -------------------------------------------------------------------------------- /databuilder/loader/base_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | from typing import Any 10 | 11 | 12 | class Loader(Scoped): 13 | """ 14 | A loader loads to the destination or to the staging area 15 | """ 16 | @abc.abstractmethod 17 | def init(self, conf: ConfigTree) -> None: 18 | pass 19 | 20 | @abc.abstractmethod 21 | def load(self, record: Any) -> None: 22 | pass 23 | 24 | def get_scope(self) -> str: 25 | return 'loader' 26 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Codeowners file by GitHub 2 | # Reference: https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 3 | # Each line is a file pattern followed by one or more owners. 4 | # Order is important; the last matching pattern takes the most 5 | # precedence. 6 | 7 | # These owners will be the default owners for everything in 8 | # the repo. Unless a later match takes precedence, 9 | # @amundsen-io/amundsen-committerswill be requested for 10 | # review when someone opens a pull request. 
11 | * @amundsen-io/amundsen-committers 12 | 13 | *.py @feng-tao @jinhyukchang @allisonsuarez @dikshathakur3119 -------------------------------------------------------------------------------- /databuilder/models/elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | from abc import ABCMeta 6 | 7 | 8 | class ElasticsearchDocument: 9 | """ 10 | Base class for ElasticsearchDocument. 11 | Each resource-specific ES document will be a subclass 12 | """ 13 | __metaclass__ = ABCMeta 14 | 15 | def to_json(self) -> str: 16 | """ 17 | Convert object to JSON 18 | :return: 19 | """ 20 | obj_dict = {k: v for k, v in sorted(self.__dict__.items())} 21 | data = json.dumps(obj_dict) + "\n" 22 | return data 23 | -------------------------------------------------------------------------------- /databuilder/task/base_task.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | 10 | 11 | class Task(Scoped): 12 | """ 13 | An abstract task that runs a unit of work 14 | """ 15 | @abc.abstractmethod 16 | def init(self, conf: ConfigTree) -> None: 17 | pass 18 | 19 | @abc.abstractmethod 20 | def run(self) -> None: 21 | """ 22 | Runs a task 23 | :return: 24 | """ 25 | pass 26 | 27 | def get_scope(self) -> str: 28 | return 'task' 29 | -------------------------------------------------------------------------------- /databuilder/filesystem/metadata.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from datetime import datetime 5 | 6 | 7 | class FileMetadata(object): 8 | 9 | def __init__(self, 10 | path: str, 11 | last_updated: datetime, 12 | size: int 13 | ) -> None: 14 | self.path = path 15 | self.last_updated = last_updated 16 | self.size = size 17 | 18 | def __repr__(self) -> str: 19 | return """FileMetadata(path={!r}, last_updated={!r}, size={!r})""" \ 20 | .format(self.path, self.last_updated, self.size) 21 | -------------------------------------------------------------------------------- /.github/workflows/license.yml: -------------------------------------------------------------------------------- 1 | name: license 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Golang 18 | uses: actions/setup-go@v2 19 | - name: Install addlicense 20 | run: | 21 | export PATH=${PATH}:`go env GOPATH`/bin 22 | go get -v -u github.com/google/addlicense 23 | - name: Check license 24 | run: | 25 | export PATH=${PATH}:`go env GOPATH`/bin 26 | addlicense -check -l mit -c "Amundsen" $(find $PWD -type f -name '*.py') -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | format = pylint 3 | exclude = .svc,CVS,.bzr,.hg,.git,__pycache__,venv,build,databuilder/sql_parser/usage/presto/antlr_generated 4 | max-complexity = 10 5 | max-line-length = 120 6 | ignore = NONE 7 | 8 | [pep8] 9 | max-line-length = 120 10 | 11 | [tool:pytest] 12 | addopts = -rs --cov=databuilder --cov-fail-under=70 --cov-report=term-missing:skip-covered --cov-report=xml --cov-report=html -vvv 13 | 14 | [coverage:run] 15 | branch = True 16 | omit = */antlr_generated/* 17 | 18 | [coverage:xml] 19 | output = build/coverage.xml 20 | 21 | [coverage:html] 22 | directory = build/coverage_html 23 | 24 | [mypy] 25 | python_version = 3.6 26 | disallow_untyped_defs = True 27 | ignore_missing_imports = True 28 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/tableau/tableau_dashboard_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | API_VERSION = 'api_version' 5 | API_BASE_URL = 'api_base_url' 6 | TABLEAU_BASE_URL = 'tableau_base_url' 7 | SITE_NAME = 'site_name' 8 | TABLEAU_ACCESS_TOKEN_NAME = 'tableau_personal_access_token_name' 9 | TABLEAU_ACCESS_TOKEN_SECRET = 'tableau_personal_access_token_secret' 10 | EXCLUDED_PROJECTS = 'excluded_projects' 11 | EXTERNAL_CLUSTER_NAME = 'external_cluster_name' 12 | EXTERNAL_SCHEMA_NAME = 'external_schema_name' 13 | EXTERNAL_TABLE_TYPES = 'external_table_types' 14 | CLUSTER = 'cluster' 15 | DATABASE = 'database' 16 | VERIFY_REQUEST = 'verify_request' 17 | -------------------------------------------------------------------------------- /databuilder/extractor/base_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any 8 | 9 | from databuilder import Scoped 10 | 11 | 12 | class Extractor(Scoped): 13 | """ 14 | An extractor extracts records 15 | """ 16 | 17 | @abc.abstractmethod 18 | def init(self, conf: ConfigTree) -> None: 19 | pass 20 | 21 | @abc.abstractmethod 22 | def extract(self) -> Any: 23 | """ 24 | :return: Provides a record or None if no more to extract 25 | """ 26 | return None 27 | 28 | def get_scope(self) -> str: 29 | return 'extractor' 30 | -------------------------------------------------------------------------------- /databuilder/job/base_job.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | from databuilder.utils.closer import Closer 10 | 11 | 12 | class Job(Scoped): 13 | closer = Closer() 14 | 15 | """ 16 | A Databuilder job that represents a single unit of work. 17 | """ 18 | @abc.abstractmethod 19 | def init(self, conf: ConfigTree) -> None: 20 | pass 21 | 22 | @abc.abstractmethod 23 | def launch(self) -> None: 24 | """ 25 | Launch a job 26 | :return: None 27 | """ 28 | pass 29 | 30 | def get_scope(self) -> str: 31 | return 'job' 32 | -------------------------------------------------------------------------------- /databuilder/models/metric_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List 5 | 6 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 7 | 8 | 9 | class MetricESDocument(ElasticsearchDocument): 10 | """ 11 | Schema for the Search index document 12 | """ 13 | 14 | def __init__(self, 15 | name: str, 16 | description: str, 17 | type: str, 18 | dashboards: List, 19 | tags: List, 20 | ) -> None: 21 | self.name = name 22 | self.description = description 23 | self.type = type 24 | self.dashboards = dashboards 25 | self.tags = tags 26 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 14 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 21 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - keep fresh 8 | # Label to use when marking an issue as stale 9 | staleLabel: stale 10 | # Comment to post when marking an issue as stale. Set to `false` to disable 11 | markComment: > 12 | This issue has been automatically marked as stale because it has not had 13 | recent activity. It will be closed if no further activity occurs. 14 | # Comment to post when closing a stale issue. Set to `false` to disable 15 | closeComment: > 16 | This issue has been automatically closed for inactivity. If you still wish to 17 | make these changes, please open a new pull request or reopen this one.
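(A minimal sketch, not part of the repository, showing a concrete implementation of the Extractor interface from databuilder/extractor/base_extractor.py above; the class name and records are invented.)

from typing import Any, Iterator

from pyhocon import ConfigTree

from databuilder.extractor.base_extractor import Extractor


class InMemoryExtractor(Extractor):
    """Hypothetical extractor that replays a fixed list of records."""

    def init(self, conf: ConfigTree) -> None:
        # A real extractor would read connection settings from conf here.
        self._records: Iterator[Any] = iter(['record_1', 'record_2'])

    def extract(self) -> Any:
        # Returning None signals that there is nothing more to extract.
        return next(self._records, None)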
18 | -------------------------------------------------------------------------------- /.github/workflows/pypipublish.yml: -------------------------------------------------------------------------------- 1 | 2 | name: Build and Deploy 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - '*' 9 | jobs: 10 | build-and-publish-python-module: 11 | name: Build and publish python module to pypi 12 | runs-on: ubuntu-18.04 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v1 16 | - name: Setup python 3.6 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: 3.6 20 | - name: Add wheel dependency 21 | run: pip install wheel 22 | - name: Generate dist 23 | run: python setup.py sdist bdist_wheel 24 | - name: Publish to PyPI 25 | if: startsWith(github.event.ref, 'refs/tags') 26 | uses: pypa/gh-action-pypi-publish@master 27 | with: 28 | user: __token__ 29 | password: ${{ secrets.pypi_password }} 30 | -------------------------------------------------------------------------------- /example/sample_data/sample_col.csv: -------------------------------------------------------------------------------- 1 | name,description,col_type,sort_order,database,cluster,schema,table_name 2 | col1,"col1 description","string",1,hive,gold,test_schema,test_table1 3 | col2,"col2 description","string",2,hive,gold,test_schema,test_table1 4 | col3,"col3 description","string",3,hive,gold,test_schema,test_table1 5 | col4,"col4 description","string",4,hive,gold,test_schema,test_table1 6 | col5,"col5 description","float",5,hive,gold,test_schema,test_table1 7 | col1,"col1 description","string",1,dynamo,gold,test_schema,test_table2 8 | col2,"col2 description","string",2,dynamo,gold,test_schema,test_table2 9 | col3,"col3 description","string",3,dynamo,gold,test_schema,test_table2 10 | col4,"col4 description","int",4,dynamo,gold,test_schema,test_table2 11 | col1,"view col description","int",1,hive,gold,test_schema,test_view1 12 | col1,"col1 description","int",1,hive,gold,test_schema,test_table3 13 | -------------------------------------------------------------------------------- /tests/unit/rest_api/test_rest_api_failure_handlers.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.rest_api.rest_api_failure_handlers import HttpFailureSkipOnStatus 7 | from mock import MagicMock 8 | 9 | 10 | class TestHttpFailureSkipOnStatus(unittest.TestCase): 11 | 12 | def testSkip(self) -> None: 13 | failure_handler = HttpFailureSkipOnStatus([404, 400]) 14 | 15 | exception = MagicMock() 16 | exception.response.status_code = 404 17 | self.assertTrue(failure_handler.can_skip_failure(exception=exception)) 18 | 19 | exception.response.status_code = 400 20 | self.assertTrue(failure_handler.can_skip_failure(exception=exception)) 21 | 22 | exception.response.status_code = 500 23 | self.assertFalse(failure_handler.can_skip_failure(exception=exception)) 24 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_column_stats.csv: -------------------------------------------------------------------------------- 1 | cluster,db,schema,table_name,col_name,stat_name,stat_val,start_epoch,end_epoch 2 | gold,hive,test_schema,test_table1,col1,"distinct values","8",1432300762,1562300762 3 | gold,hive,test_schema,test_table1,col1,"min","""aardvark""",1432300762,1562300762 4 | gold,hive,test_schema,test_table1,col1,"max","""zebra""",1432300762,1562300762 5 | gold,hive,test_schema,test_table1,col1,"num nulls","""500320""",1432300762,1562300762 6 | gold,hive,test_schema,test_table1,col1,"verified","""230430""",1432300762,1562300762 7 | gold,hive,test_schema,test_table1,col5,"average","""5.0""",1532300762,1572300762 8 | gold,hive,test_schema,test_table1,col5,"max","""500.0""",1534300762,1572300762 9 | gold,hive,test_schema,test_table1,col5,"min","""-500.0""",1534300762,1572300762 10 | gold,dynamo,test_schema,test_table2,col4,"median","""250""",1534300762,1572300762 11 | gold,dynamo,test_schema,test_table2,col4,"average","""400""",1534300762,1572300762 -------------------------------------------------------------------------------- /databuilder/transformer/remove_field_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import Any, Dict 6 | 7 | from pyhocon import ConfigTree 8 | 9 | from databuilder.transformer.base_transformer import Transformer 10 | 11 | FIELD_NAMES = 'field_names' # field name to be removed 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | class RemoveFieldTransformer(Transformer): 17 | """ 18 | Remove field in Dict by specifying list of fields (keys). 19 | 20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | self._field_names = conf.get_list(FIELD_NAMES) 24 | 25 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 26 | 27 | for k in self._field_names: 28 | if k in record: 29 | del record[k] 30 | 31 | return record 32 | 33 | def get_scope(self) -> str: 34 | return 'transformer.remove_field' 35 | -------------------------------------------------------------------------------- /databuilder/transformer/dict_to_model.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | model_class = conf.get_string(MODEL_CLASS) 24 | module_name, class_name = model_class.rsplit(".", 1) 25 | mod = importlib.import_module(module_name) 26 | self._model_class = getattr(mod, class_name) 27 | 28 | def transform(self, record: Dict[str, Any]) -> Any: 29 | # Returns a model instance, not a Dict, hence the Any return type. 30 | return self._model_class(**record) 31 | 32 | def get_scope(self) -> str: 33 | return 'transformer.dict_to_model' 34 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_template_variable_substitution_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.template_variable_substitution_transformer import \ 9 | TemplateVariableSubstitutionTransformer, FIELD_NAME, TEMPLATE 10 | 11 | 12 | class TestTemplateVariableSubstitutionTransformer(unittest.TestCase): 13 | 14 | def test_conversion(self) -> None: 15 | 16 | transformer = TemplateVariableSubstitutionTransformer() 17 | config = ConfigFactory.from_dict({ 18 | FIELD_NAME: 'baz', 19 | TEMPLATE: 'Hello {foo}' 20 | }) 21 | transformer.init(conf=config) 22 | 23 | actual = transformer.transform({'foo': 'bar'}) 24 | expected = { 25 | 'foo': 'bar', 26 | 'baz': 'Hello bar' 27 | } 28 | self.assertDictEqual(expected, actual) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /example/sample_data/sample_column_usage.csv: -------------------------------------------------------------------------------- 1 | database,cluster,schema,table_name,column_name,user_email,read_count 2 | hive,gold,test_schema,test_table1,col1,roald.amundsen@example.org,500 3 | hive,gold,test_schema,test_table1,col1,aoald0@example.org,100 4 | hive,gold,test_schema,test_table1,col1,boald1@example.org,100 5 | hive,gold,test_schema,test_table1,col1,coald2@example.org,100 6 | hive,gold,test_schema,test_table1,col1,doald3@example.org,100 7 | hive,gold,test_schema,test_table1,col1,eoald4@example.org,100 8 | hive,gold,test_schema,test_table1,col1,foald5@example.org,100 9 | hive,gold,test_schema,test_table1,col1,goald6@example.org,100 10 | hive,gold,test_schema,test_table1,col1,hoald7@example.org,100 11 | hive,gold,test_schema,test_table1,col1,ioald8@example.org,10 12 | hive,gold,test_schema,test_table1,col1,joald9@example.org,10 13 | hive,gold,test_schema,test_table1,col1,koald9@example.org,10 14 | hive,gold,test_schema,test_table2,col1,soald9@example.org,10 15 | hive,gold,test_schema,test_table2,col1,toald9@example.org,10 16 | dynamo,gold,test_schema,test_table2,col1,chrisc@example.org,500 17 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_base.csv: -------------------------------------------------------------------------------- 1 |
product,cluster,dashboard_group,dashboard_group_id,dashboard_group_description,dashboard_group_url,dashboard_name,dashboard_id,description,created_timestamp,dashboard_url 2 | mode,gold,test group1,test_group_id_1,test group description 1,http://mode.test_group_id_1.com,test dashboard,test_dashboard_id_1,test dashboard description,1592333799,http://mode.test_group_id_1.com/test_dashboard_id_1 3 | mode,gold,test group1,test_group_id_1,test group description 1_2,http://mode.test_group_id_1.com,test dashboard,test_dashboard_id_1_2,test dashboard description 1_2,1592332799,http://mode.test_group_id_1.com/test_dashboard_id_1_2 4 | mode,gold,test group2,test_group_id_2,test group description 2,http://mode.test_group_id_2.com,test dashboard,test_dashboard_id_2,test dashboard description,1592133799,http://mode.test_group_id_2.com/test_dashboard_id_2 5 | superset,gold,test group3,test_group_id_3,test group description 1,http://mode.test_group_id_3.com,test dashboard,test_dashboard_id_3,test dashboard description,1591333799,http://mode.test_group_id_3.com/test_dashboard_id_3 6 | -------------------------------------------------------------------------------- /databuilder/rest_api/rest_api_failure_handlers.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from typing import Iterable 7 | 8 | 9 | class BaseFailureHandler(object, metaclass=abc.ABCMeta): 10 | 11 | @abc.abstractmethod 12 | def can_skip_failure(self, 13 | exception: Exception, 14 | ) -> bool: 15 | pass 16 | 17 | 18 | class HttpFailureSkipOnStatus(BaseFailureHandler): 19 | 20 | def __init__(self, 21 | status_codes_to_skip: Iterable[int], 22 | ) -> None: 23 | self._status_codes_to_skip = {v for v in status_codes_to_skip} 24 | 25 | def can_skip_failure(self, 26 | exception: Exception, 27 | ) -> bool: 28 | try: 29 | status_code: int = getattr(getattr(exception, 'response'), 'status_code') 30 | return status_code in self._status_codes_to_skip 31 | except AttributeError: 32 | pass 33 | 34 | return False 35 | -------------------------------------------------------------------------------- /tests/unit/loader/test_generic_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from pyhocon import ConfigFactory 8 | 9 | from databuilder.loader.generic_loader import GenericLoader, CALLBACK_FUNCTION 10 | 11 | 12 | class TestGenericLoader(unittest.TestCase): 13 | 14 | def test_loading(self) -> None: 15 | 16 | loader = GenericLoader() 17 | callback_func = MagicMock() 18 | loader.init(conf=ConfigFactory.from_dict({ 19 | CALLBACK_FUNCTION: callback_func 20 | })) 21 | 22 | loader.load({'foo': 'bar'}) 23 | loader.close() 24 | 25 | callback_func.assert_called_once() 26 | 27 | def test_none_loading(self) -> None: 28 | 29 | loader = GenericLoader() 30 | callback_func = MagicMock() 31 | loader.init(conf=ConfigFactory.from_dict({ 32 | CALLBACK_FUNCTION: callback_func 33 | })) 34 | 35 | loader.load(None) 36 | loader.close() 37 | 38 | callback_func.assert_not_called() 39 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | on: pull_request 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-18.04 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v1 11 | - name: Setup python 3.6 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.6 15 | # NOTE: assumed step; as originally written, this job checked out code and set up Python but ran no checks. 16 | - name: Run pre-commit 17 | run: pip install pre-commit && pre-commit run --all-files 18 | test-unit: 19 | runs-on: ubuntu-18.04 20 | strategy: 21 | matrix: 22 | python-version: ['3.6.x', '3.7.x'] 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v1 26 | - name: Setup python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v1 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install dependencies 31 | run: pip3 install -r requirements.txt && pip3 install .[all] && pip3 install codecov 32 | - name: Run python unit tests 33 | run: make test 34 | -------------------------------------------------------------------------------- /databuilder/transformer/generic_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any, Dict 8 | 9 | from databuilder.transformer.base_transformer import Transformer 10 | 11 | CALLBACK_FUNCTION = 'callback_function' 12 | FIELD_NAME = 'field_name' 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | 17 | class GenericTransformer(Transformer): 18 | """ 19 | A generic transformer that applies a user-supplied callback function to the specified field of each record.
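Example (an illustrative sketch; the uppercasing callback and the 'name' field are hypothetical):

    from pyhocon import ConfigFactory
    transformer = GenericTransformer()
    transformer.init(conf=ConfigFactory.from_dict({
        'callback_function': lambda v: v.upper(),
        'field_name': 'name',
    }))
    transformer.transform({'name': 'foo'})  # -> {'name': 'FOO'}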
20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | self._callback_function = conf.get(CALLBACK_FUNCTION) 24 | self._field_name = conf.get_string(FIELD_NAME) 25 | 26 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 27 | 28 | for k, v in record.items(): 29 | if k == self._field_name: 30 | new_val = self._callback_function(v) 31 | record[k] = new_val 32 | return record 33 | 34 | def get_scope(self) -> str: 35 | return 'transformer.generic' 36 | -------------------------------------------------------------------------------- /databuilder/transformer/table_tag_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from pyhocon import ConfigFactory, ConfigTree 5 | from typing import Any 6 | 7 | from databuilder.transformer.base_transformer import Transformer 8 | from databuilder.models.table_metadata import TableMetadata 9 | 10 | 11 | class TableTagTransformer(Transformer): 12 | """Simple transformer that adds tags to all table nodes produced as part of a job.""" 13 | # Config 14 | TAGS = 'tags' 15 | DEFAULT_CONFIG = ConfigFactory.from_dict({TAGS: None}) 16 | 17 | def init(self, conf: ConfigTree) -> None: 18 | conf = conf.with_fallback(TableTagTransformer.DEFAULT_CONFIG) 19 | tags = conf.get_string(TableTagTransformer.TAGS) 20 | 21 | self.tags = TableMetadata.format_tags(tags) 22 | 23 | def transform(self, record: Any) -> Any: 24 | if isinstance(record, TableMetadata): 25 | if record.tags: 26 | record.tags += self.tags 27 | else: 28 | record.tags = self.tags 29 | return record 30 | 31 | def get_scope(self) -> str: 32 | return 'transformer.table_tag' 33 | -------------------------------------------------------------------------------- /databuilder/transformer/template_variable_substitution_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any, Dict 8 | 9 | from databuilder.transformer.base_transformer import Transformer 10 | 11 | TEMPLATE = 'template' 12 | FIELD_NAME = 'field_name' # field name to UPSERT 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | 17 | class TemplateVariableSubstitutionTransformer(Transformer): 18 | """ 19 | Add/Replace field in Dict by string.format based on given template and provide record Dict as a template parameter 20 | https://docs.python.org/3.4/library/string.html#string.Formatter.format 21 | 22 | """ 23 | 24 | def init(self, conf: ConfigTree) -> None: 25 | 26 | self._template = conf.get_string(TEMPLATE) 27 | self._field_name = conf.get_string(FIELD_NAME) 28 | 29 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 30 | 31 | val = self._template.format(**record) 32 | record[self._field_name] = val 33 | return record 34 | 35 | def get_scope(self) -> str: 36 | return 'transformer.template_variable_substitution' 37 | -------------------------------------------------------------------------------- /tests/unit/extractor/test_neo4j_es_last_updated_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from mock import patch 5 | from typing import Any 6 | import unittest 7 | 8 | from pyhocon import ConfigFactory 9 | 10 | from databuilder import Scoped 11 | from databuilder.extractor.neo4j_es_last_updated_extractor import Neo4jEsLastUpdatedExtractor 12 | 13 | 14 | class TestNeo4jEsLastUpdatedExtractor(unittest.TestCase): 15 | 16 | def setUp(self) -> None: 17 | config_dict = { 18 | 'extractor.neo4j_es_last_updated.model_class': 19 | 'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated', 20 | } 21 | self.conf = ConfigFactory.from_dict(config_dict) 22 | 23 | @patch('time.time') 24 | def test_extraction_with_model_class(self, mock_time: Any) -> None: 25 | """ 26 | Test Extraction using model class 27 | """ 28 | mock_time.return_value = 10000000 29 | extractor = Neo4jEsLastUpdatedExtractor() 30 | extractor.init(Scoped.get_scoped_conf(conf=self.conf, 31 | scope=extractor.get_scope())) 32 | 33 | result = extractor.extract() 34 | self.assertEqual(result.timestamp, 10000000) 35 | -------------------------------------------------------------------------------- /tests/unit/publisher/test_publisher.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from pyhocon import ConfigTree 8 | 9 | from databuilder.publisher.base_publisher import Publisher, NoopPublisher 10 | 11 | 12 | class TestPublisher(unittest.TestCase): 13 | 14 | def testCallback(self) -> None: 15 | publisher = NoopPublisher() 16 | callback = MagicMock() 17 | publisher.register_call_back(callback) 18 | publisher.publish() 19 | 20 | self.assertTrue(callback.on_success.called) 21 | 22 | def testFailureCallback(self) -> None: 23 | publisher = FailedPublisher() 24 | callback = MagicMock() 25 | publisher.register_call_back(callback) 26 | 27 | try: 28 | publisher.publish() 29 | except Exception: 30 | pass 31 | 32 | self.assertTrue(callback.on_failure.called) 33 | 34 | 35 | class FailedPublisher(Publisher): 36 | def __init__(self) -> None: 37 | super(FailedPublisher, self).__init__() 38 | 39 | def init(self, conf: ConfigTree) -> None: 40 | pass 41 | 42 | def publish_impl(self) -> None: 43 | raise Exception('Bomb') 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /tests/unit/extractor/test_csv_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.csv_extractor import CsvExtractor 10 | 11 | 12 | class TestCsvExtractor(unittest.TestCase): 13 | 14 | def setUp(self) -> None: 15 | config_dict = { 16 | 'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_table.csv', 17 | 'extractor.csv.model_class': 'databuilder.models.table_metadata.TableMetadata', 18 | } 19 | self.conf = ConfigFactory.from_dict(config_dict) 20 | 21 | def test_extraction_with_model_class(self) -> None: 22 | """ 23 | Test Extraction using model class 24 | """ 25 | extractor = CsvExtractor() 26 | extractor.init(Scoped.get_scoped_conf(conf=self.conf, 27 | scope=extractor.get_scope())) 28 | 29 | result = extractor.extract() 30 | self.assertEqual(result.name, 'test_table1') 31 | self.assertEqual(result.description._text, '1st test table') 32 | self.assertEqual(result.database, 'hive') 33 | self.assertEqual(result.cluster, 'gold') 34 | self.assertEqual(result.schema, 'test_schema') 35 | -------------------------------------------------------------------------------- /databuilder/loader/generic_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Optional, Any 8 | 9 | from databuilder.loader.base_loader import Loader 10 | 11 | LOGGER = logging.getLogger(__name__) 12 | 13 | CALLBACK_FUNCTION = 'callback_function' 14 | 15 | 16 | def log_call_back(record: Optional[Any]) -> None: 17 | """ 18 | A sample callback function. Implement any function that follows this signature to fit your needs. 19 | :param record: 20 | :return: 21 | """ 22 | LOGGER.info('record: {}'.format(record)) 23 | 24 | 25 | class GenericLoader(Loader): 26 | """ 27 | Loader class that calls back a user-provided function with each record
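Example (an illustrative sketch; my_record is hypothetical):

    from pyhocon import ConfigFactory
    loader = GenericLoader()
    loader.init(conf=ConfigFactory.from_dict({'callback_function': log_call_back}))
    loader.load(my_record)  # invokes log_call_back(my_record); None records are skipped
    loader.close()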
28 | """ 29 | 30 | def init(self, conf: ConfigTree) -> None: 31 | """ 32 | Initialize the callback function from conf 33 | :param conf: 34 | """ 35 | self.conf = conf 36 | self._callback_func = self.conf.get(CALLBACK_FUNCTION, log_call_back) 37 | 38 | def load(self, record: Optional[Any]) -> None: 39 | """ 40 | Pass the record to the callback function 41 | :param record: 42 | :return: 43 | """ 44 | if not record: 45 | return 46 | 47 | self._callback_func(record) 48 | 49 | def close(self) -> None: 50 | pass 51 | 52 | def get_scope(self) -> str: 53 | return "loader.generic" 54 | -------------------------------------------------------------------------------- /tests/unit/models/test_neo4j_es_last_updated.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | from databuilder.models.neo4j_es_last_updated import Neo4jESLastUpdated 6 | 7 | from databuilder.models.graph_serializable import NODE_KEY, \ 8 | NODE_LABEL 9 | from databuilder.serializers import neo4_serializer 10 | 11 | 12 | class TestNeo4jESLastUpdated(unittest.TestCase): 13 | 14 | def setUp(self) -> None: 15 | super(TestNeo4jESLastUpdated, self).setUp() 16 | self.neo4j_es_last_updated = Neo4jESLastUpdated(timestamp=100) 17 | 18 | self.expected_node_result = { 19 | NODE_KEY: 'amundsen_updated_timestamp', 20 | NODE_LABEL: 'Updatedtimestamp', 21 | 'latest_timestmap:UNQUOTED': 100, 22 | } 23 | 24 | def test_create_nodes(self) -> None: 25 | nodes = self.neo4j_es_last_updated.create_nodes() 26 | self.assertEqual(len(nodes), 1) 27 | serialized_node = neo4_serializer.serialize_node(nodes[0]) 28 | self.assertEqual(serialized_node, self.expected_node_result) 29 | 30 | def test_create_next_node(self) -> None: 31 | next_node = self.neo4j_es_last_updated.create_next_node() 32 | self.assertEqual(neo4_serializer.serialize_node(next_node), self.expected_node_result) 33 | 34 | def test_create_next_relation(self) -> None: 35 | self.assertIs(self.neo4j_es_last_updated.create_next_relation(), None) -------------------------------------------------------------------------------- /tests/unit/transformer/test_dict_to_model_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS 9 | from databuilder.models.dashboard.dashboard_execution import DashboardExecution 10 | 11 | 12 | class TestDictToModel(unittest.TestCase): 13 | 14 | def test_conversion(self) -> None: 15 | 16 | transformer = DictToModel() 17 | config = ConfigFactory.from_dict({ 18 | MODEL_CLASS: 'databuilder.models.dashboard.dashboard_execution.DashboardExecution', 19 | }) 20 | transformer.init(conf=config) 21 | 22 | actual = transformer.transform( 23 | { 24 | 'dashboard_group_id': 'foo', 25 | 'dashboard_id': 'bar', 26 | 'execution_timestamp': 123456789, 27 | 'execution_state': 'succeed', 28 | 'product': 'mode', 29 | 'cluster': 'gold' 30 | } 31 | ) 32 | 33 | self.assertTrue(isinstance(actual, DashboardExecution)) 34 | self.assertEqual(actual.__repr__(), DashboardExecution( 35 | dashboard_group_id='foo', 36 | dashboard_id='bar', 37 | execution_timestamp=123456789, 38 | execution_state='succeed', 39 | product='mode', 40 | cluster='gold' 41 | ).__repr__()) 42 | -------------------------------------------------------------------------------- /databuilder/rest_api/base_rest_api_query.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | import logging 6 | 7 | from typing import Iterable, Any, Dict, Iterator 8 | 9 | LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class BaseRestApiQuery(object, metaclass=abc.ABCMeta): 13 | 14 | @abc.abstractmethod 15 | def execute(self) -> Iterator[Dict[str, Any]]: 16 | """ 17 | Provides an iterator of records, so that results can be streamed. 18 | :return: 19 | """ 20 | 21 | return iter([dict()]) 22 | 23 | 24 | class RestApiQuerySeed(BaseRestApiQuery): 25 | """ 26 | A seed RestApiQuery.
27 | 28 | RestApiQuery is using decorator pattern where it needs to have a seed to begin with. RestApiQuerySeed is for 29 | RestApiQuery to start with. 30 | 31 | Example: see ModeDashboardExtractor._build_restapi_query 32 | """ 33 | 34 | def __init__(self, 35 | seed_record: Iterable[Dict[str, Any]] 36 | ) -> None: 37 | self._seed_record = seed_record 38 | 39 | def execute(self) -> Iterator[Dict[str, Any]]: 40 | return iter(self._seed_record) 41 | 42 | 43 | class EmptyRestApiQuerySeed(RestApiQuerySeed): 44 | """ 45 | Sometimes there simply isn't a record to seed with. 46 | """ 47 | 48 | def __init__(self) -> None: 49 | super(EmptyRestApiQuerySeed, self).__init__([{'empty_rest_api_query_seed': 1}]) 50 | -------------------------------------------------------------------------------- /databuilder/models/user_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 5 | 6 | 7 | class UserESDocument(ElasticsearchDocument): 8 | """ 9 | Schema for the Search index document for user 10 | """ 11 | 12 | def __init__(self, 13 | email: str, 14 | first_name: str, 15 | last_name: str, 16 | full_name: str, 17 | github_username: str, 18 | team_name: str, 19 | employee_type: str, 20 | manager_email: str, 21 | slack_id: str, 22 | role_name: str, 23 | is_active: bool, 24 | total_read: int, 25 | total_own: int, 26 | total_follow: int, 27 | ) -> None: 28 | self.email = email 29 | self.first_name = first_name 30 | self.last_name = last_name 31 | self.full_name = full_name 32 | self.github_username = github_username 33 | self.team_name = team_name 34 | self.employee_type = employee_type 35 | self.manager_email = manager_email 36 | self.slack_id = slack_id 37 | self.role_name = role_name 38 | self.is_active = is_active 39 | self.total_read = total_read 40 | self.total_own = total_own 41 | self.total_follow = total_follow 42 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_remove_field_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.remove_field_transformer import RemoveFieldTransformer, FIELD_NAMES 9 | 10 | 11 | class TestRemoveFieldTransformer(unittest.TestCase): 12 | 13 | def test_conversion(self) -> None: 14 | 15 | transformer = RemoveFieldTransformer() 16 | config = ConfigFactory.from_dict({ 17 | FIELD_NAMES: ['foo', 'bar'], 18 | }) 19 | transformer.init(conf=config) 20 | 21 | actual = transformer.transform({ 22 | 'foo': 'foo_val', 23 | 'bar': 'bar_val', 24 | 'baz': 'baz_val', 25 | }) 26 | expected = { 27 | 'baz': 'baz_val' 28 | } 29 | self.assertDictEqual(expected, actual) 30 | 31 | def test_conversion_missing_field(self) -> None: 32 | 33 | transformer = RemoveFieldTransformer() 34 | config = ConfigFactory.from_dict({ 35 | FIELD_NAMES: ['foo', 'bar'], 36 | }) 37 | transformer.init(conf=config) 38 | 39 | actual = transformer.transform({ 40 | 'foo': 'foo_val', 41 | 'baz': 'baz_val', 42 | 'john': 'doe', 43 | }) 44 | expected = { 45 | 'baz': 'baz_val', 46 | 'john': 'doe' 47 | } 48 | self.assertDictEqual(expected, actual) 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /tests/unit/models/test_metric_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.metric_elasticsearch_document import MetricESDocument 8 | 9 | 10 | class TestMetricElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | 17 | test_obj = MetricESDocument(name='test_metric_name', 18 | description='test_metric_description', 19 | type='test_metric_type', 20 | dashboards=['test_dashboard_1', 'test_dashboard_2'], 21 | tags=['test_metric_group']) 22 | 23 | expected_document_dict = {"name": "test_metric_name", 24 | "description": "test_metric_description", 25 | "type": "test_metric_type", 26 | "dashboards": ['test_dashboard_1', 'test_dashboard_2'], 27 | "tags": ['test_metric_group'] 28 | } 29 | 30 | result = test_obj.to_json() 31 | results = result.split("\n") 32 | 33 | # verify two new line characters in result 34 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 35 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 36 | -------------------------------------------------------------------------------- /tests/unit/extractor/user/bamboohr/test_bamboohr_user_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import unittest 6 | 7 | import os 8 | 9 | import responses 10 | from pyhocon import ConfigFactory 11 | 12 | from databuilder.models.user import User 13 | from databuilder.extractor.user.bamboohr.bamboohr_user_extractor import BamboohrUserExtractor 14 | 15 | 16 | class TestBamboohrUserExtractor(unittest.TestCase): 17 | @responses.activate 18 | def test_parse_testdata(self) -> None: 19 | bhr = BamboohrUserExtractor() 20 | bhr.init(ConfigFactory.from_dict({'api_key': 'api_key', 'subdomain': 'amundsen'})) 21 | 22 | testdata_xml = os.path.join( 23 | os.path.dirname(os.path.realpath(__file__)), 24 | '../../../resources/extractor/user/bamboohr/testdata.xml' 25 | ) 26 | 27 | with io.open(testdata_xml) as testdata: 28 | responses.add(responses.GET, bhr._employee_directory_uri(), body=testdata.read()) 29 | 30 | expected = User( 31 | email='roald@amundsen.io', 32 | first_name='Roald', 33 | last_name='Amundsen', 34 | name='Roald Amundsen', 35 | team_name='508 Corporate Marketing', 36 | role_name='Antarctic Explorer', 37 | ) 38 | 39 | actual_users = list(bhr._get_extract_iter()) 40 | 41 | self.assertEqual(1, len(actual_users)) 42 | self.assertEqual(repr(expected), repr(actual_users[0])) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # The modular source code checker: pep8, pyflakes and co 2 | # License: MIT 3 | # Upstream url: http://bitbucket.org/tarek/flake8 4 | flake8==3.5.0 5 | 6 | # A flake8 plugin that helps you write tidier imports. 7 | # License: ISCL 8 | # Upstream url: https://pypi.python.org/pypi/flake8-tidy-imports 9 | flake8-tidy-imports>=1.1.0,<2.0 10 | 11 | # A mature full-featured Python testing tool. 12 | # License: MIT 13 | # Upstream url: http://pytest.org/ 14 | pytest>=3.6.0,<4.0 15 | 16 | # Pytest plugin for measuring coverage. 17 | # License: MIT 18 | # Upstream url: https://github.com/pytest-dev/pytest-cov 19 | pytest-cov>=2.5.1,<=2.9 20 | 21 | # Rolling backport of unittest.mock for all Pythons 22 | # License: BSD 23 | # Upstream url: https://mock.readthedocs.io/en/latest/ 24 | mock>=2.0.0,<3.0 25 | 26 | # Thin-wrapper around the mock package for easier use with py.test. 27 | # License: MIT 28 | # Upstream url: https://pypi.python.org/pypi/pytest-mock 29 | pytest-mock>=1.1,<2.0 30 | 31 | # Python client for ElasticSearch 32 | # License: Apache Software License 33 | # Upstream url: https://pypi.org/project/elasticsearch/ 34 | elasticsearch>=6.2.0,<7.0 35 | 36 | atomicwrites==1.1.5 37 | more-itertools==4.2.0 38 | pluggy>=0.6.0 39 | py==1.5.3 40 | pyhocon==0.3.42 41 | pyparsing==2.2.0 42 | six>=1.11.0,<2.0.0 43 | sqlalchemy>=1.3.0,<2.0 44 | wheel==0.31.1 45 | neo4j-driver==1.7.2 46 | neotime==1.7.1 47 | mypy==0.782 48 | pytz==2018.4 49 | statsd==3.2.1 50 | retrying==1.3.3 51 | unicodecsv==0.14.1,<1.0 52 | 53 | httplib2>=0.18.0 54 | unidecode 55 | Jinja2>=2.10.0,<2.12 56 | pandas>=0.21.0,<1.2.0 57 | 58 | requests==2.23.0,<3.0 59 | responses==0.10.6 60 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_timestamp_string_to_epoch_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME, TIMESTAMP_FORMAT 9 | 10 | 11 | class TestTimestampStrToEpoch(unittest.TestCase): 12 | 13 | def test_conversion(self) -> None: 14 | 15 | transformer = TimestampStringToEpoch() 16 | config = ConfigFactory.from_dict({ 17 | FIELD_NAME: 'foo', 18 | }) 19 | transformer.init(conf=config) 20 | 21 | actual = transformer.transform({'foo': '2020-02-19T19:52:33.1Z'}) 22 | self.assertDictEqual({'foo': 1582141953}, actual) 23 | 24 | def test_conversion_with_format(self) -> None: 25 | 26 | transformer = TimestampStringToEpoch() 27 | config = ConfigFactory.from_dict({ 28 | FIELD_NAME: 'foo', 29 | TIMESTAMP_FORMAT: '%Y-%m-%dT%H:%M:%SZ' 30 | }) 31 | transformer.init(conf=config) 32 | 33 | actual = transformer.transform({'foo': '2020-02-19T19:52:33Z'}) 34 | self.assertDictEqual({'foo': 1582141953}, actual) 35 | 36 | def test_invalid_timestamp(self) -> None: 37 | transformer = TimestampStringToEpoch() 38 | config = ConfigFactory.from_dict({ 39 | FIELD_NAME: 'foo', 40 | }) 41 | transformer.init(conf=config) 42 | actual = transformer.transform({'foo': '165de33266d4'}) 43 | self.assertEqual(actual['foo'], 0) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /databuilder/transformer/timestamp_string_to_epoch.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from datetime import datetime 6 | 7 | from pyhocon import ConfigFactory 8 | from pyhocon import ConfigTree 9 | from typing import Any, Dict 10 | 11 | from databuilder.transformer.base_transformer import Transformer 12 | 13 | TIMESTAMP_FORMAT = 'timestamp_format' 14 | FIELD_NAME = 'field_name' 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | 18 | DEFAULT_CONFIG = ConfigFactory.from_dict({TIMESTAMP_FORMAT: '%Y-%m-%dT%H:%M:%S.%fZ'}) 19 | 20 | 21 | class TimestampStringToEpoch(Transformer): 22 | """ 23 | Transforms string timestamp into epoch 24 | """ 25 | 26 | def init(self, conf: ConfigTree) -> None: 27 | self._conf = conf.with_fallback(DEFAULT_CONFIG) 28 | self._timestamp_format = self._conf.get_string(TIMESTAMP_FORMAT) 29 | self._field_name = self._conf.get_string(FIELD_NAME) 30 | 31 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 32 | timestamp_str = record.get(self._field_name, '') 33 | 34 | if not timestamp_str: 35 | return record 36 | 37 | try: 38 | utc_dt = datetime.strptime(timestamp_str, self._timestamp_format) 39 | except ValueError: 40 | # if the timestamp_str doesn't match format, no conversion, return initial result 41 | record[self._field_name] = 0 42 | return record 43 | 44 | record[self._field_name] = int((utc_dt - datetime(1970, 1, 1)).total_seconds()) 45 | return record 46 | 47 | def get_scope(self) -> str: 48 | return 'transformer.timestamp_str_to_epoch' 49 | -------------------------------------------------------------------------------- /databuilder/extractor/generic_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | from typing import Iterable, Any 6 | 7 | from pyhocon import ConfigTree 8 | 9 | from databuilder.extractor.base_extractor import Extractor 10 | 11 | 12 | class GenericExtractor(Extractor): 13 | """ 14 | Extractor that emits arbitrary user-provided values.
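Example (an illustrative sketch, mirroring the unit test for this extractor further below):

    from pyhocon import ConfigFactory
    extractor = GenericExtractor()
    extractor.init(conf=ConfigFactory.from_dict({
        'extraction_items': [{'foo': 1}, {'bar': 2}],
    }))
    extractor.extract()  # -> {'foo': 1}
    extractor.extract()  # -> {'bar': 2}
    extractor.extract()  # -> None once exhausted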
15 | """ 16 | EXTRACTION_ITEMS = 'extraction_items' 17 | 18 | def init(self, conf: ConfigTree) -> None: 19 | """ 20 | Receives a list of dictionaries which is used for extraction 21 | :param conf: 22 | :return: 23 | """ 24 | self.conf = conf 25 | self.values: Iterable[Any] = conf.get(GenericExtractor.EXTRACTION_ITEMS) 26 | 27 | model_class = conf.get('model_class', None) 28 | if model_class: 29 | module_name, class_name = model_class.rsplit(".", 1) 30 | mod = importlib.import_module(module_name) 31 | self.model_class = getattr(mod, class_name) 32 | results = [self.model_class(**result) 33 | for result in self.values] 34 | 35 | self._iter = iter(results) 36 | else: 37 | self._iter = iter(self.values) 38 | 39 | def extract(self) -> Any: 40 | """ 41 | Fetch one result, converting it to {model_class} if specified, before 42 | returning. 43 | :return: 44 | """ 45 | try: 46 | result = next(self._iter) 47 | return result 48 | except StopIteration: 49 | return None 50 | 51 | def get_scope(self) -> str: 52 | return 'extractor.generic' 53 | -------------------------------------------------------------------------------- /databuilder/transformer/regex_str_replace_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from pyhocon import ConfigTree 6 | from typing import Any 7 | 8 | from databuilder.transformer.base_transformer import Transformer 9 | 10 | 11 | LOGGER = logging.getLogger(__name__) 12 | 13 | 14 | # Config keys 15 | REGEX_REPLACE_TUPLE_LIST = 'regex_replace_tuple_list' 16 | ATTRIBUTE_NAME = 'attribute_name' 17 | 18 | 19 | class RegexStrReplaceTransformer(Transformer): 20 | """ 21 | Generic string replacement transformer. Despite the REGEX in the name, search 22 | strings are matched literally via str.replace, not as regular expressions. 23 | User can pass a list of (search string, replacement) tuples. 24 | Any non-string values will be ignored. 25 | """ 26 | 27 | def init(self, conf: ConfigTree) -> None: 28 | self._regex_replace_tuples = conf.get_list(REGEX_REPLACE_TUPLE_LIST) 29 | self._attribute_name = conf.get_string(ATTRIBUTE_NAME) 30 | 31 | def transform(self, record: Any) -> Any: 32 | 33 | if isinstance(record, dict): 34 | val = record.get(self._attribute_name) 35 | else: 36 | val = getattr(record, self._attribute_name) 37 | 38 | if val is None or not isinstance(val, str): 39 | return record 40 | 41 | for regex_replace_tuple in self._regex_replace_tuples: 42 | val = val.replace(regex_replace_tuple[0], regex_replace_tuple[1]) 43 | 44 | if isinstance(record, dict): 45 | record[self._attribute_name] = val 46 | else: 47 | setattr(record, self._attribute_name, val) 48 | 49 | return record 50 | 51 | def get_scope(self) -> str: 52 | return 'transformer.regex_str_replace' 53 | -------------------------------------------------------------------------------- /tests/unit/resources/extractor/user/bamboohr/testdata.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <directory> <!-- field id attributes assumed from the standard BambooHR employee directory schema --> 3 | <fieldset> 4 | <field id="displayName">Display name</field> 5 | <field id="firstName">First name</field> 6 | <field id="lastName">Last name</field> 7 | <field id="preferredName">Preferred name</field> 8 | <field id="gender">Gender</field> 9 | <field id="jobTitle">Job title</field> 10 | <field id="workPhone">Work Phone</field> 11 | <field id="mobilePhone">Mobile Phone</field> 12 | <field id="workEmail">Work Email</field> 13 | <field id="department">Department</field> 14 | <field id="location">Location</field> 15 | <field id="workPhoneExtension">Work Ext.</field> 16 | <field id="photoUploaded">Employee photo</field> 17 | <field id="photoUrl">Photo URL</field> 18 | <field id="canUploadPhoto">Can Upload Photo</field> 19 | </fieldset> 20 | <employees> 21 | <employee> 22 | <field id="displayName">Roald Amundsen</field> 23 | <field id="firstName">Roald</field> 24 | <field id="lastName">Amundsen</field> 25 | <field id="preferredName"></field> 26 | <field id="gender">Male</field> 27 | <field id="jobTitle">Antarctic Explorer</field> 28 | <field id="workPhone"></field> 29 | <field id="mobilePhone"></field> 30 | <field id="workEmail">roald@amundsen.io</field> 31 | <field id="department">508 Corporate Marketing</field> 32 | <field id="location">Norway</field> 33 | <field id="workPhoneExtension"></field> 34 | <field id="photoUploaded">true</field> 35 | <field id="photoUrl">https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Amundsen_in_fur_skins.jpg/440px-Amundsen_in_fur_skins.jpg</field> 36 | <field id="canUploadPhoto">no</field> 37 | </employee> 38 | </employees> 39 | </directory>
40 | -------------------------------------------------------------------------------- /databuilder/extractor/neo4j_es_last_updated_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | import time 6 | from typing import Any 7 | 8 | from pyhocon import ConfigTree 9 | 10 | from databuilder.extractor.generic_extractor import GenericExtractor 11 | 12 | 13 | class Neo4jEsLastUpdatedExtractor(GenericExtractor): 14 | """ 15 | Extractor to extract last updated timestamp for neo4j and Es 16 | """ 17 | 18 | def init(self, conf: ConfigTree) -> None: 19 | """ 20 | Receives a list of dictionaries which is used for extraction 21 | :param conf: 22 | :return: 23 | """ 24 | self.conf = conf 25 | 26 | model_class = conf.get('model_class', None) 27 | if model_class: 28 | module_name, class_name = model_class.rsplit(".", 1) 29 | mod = importlib.import_module(module_name) 30 | self.model_class = getattr(mod, class_name) 31 | last_updated_timestamp = int(time.time()) 32 | result = {'timestamp': last_updated_timestamp} 33 | results = [self.model_class(**result)] 34 | self._iter = iter(results) 35 | else: 36 | raise RuntimeError('model class needs to be provided!') 37 | 38 | def extract(self) -> Any: 39 | """ 40 | Fetch one sql result row, convert to {model_class} if specified before 41 | returning. 42 | :return: 43 | """ 44 | try: 45 | result = next(self._iter) 46 | return result 47 | except StopIteration: 48 | return None 49 | 50 | def get_scope(self) -> str: 51 | return 'extractor.neo4j_es_last_updated' 52 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | ### Summary of Changes 18 | 19 | _Include a summary of changes then remove this line_ 20 | 21 | ### Tests 22 | 23 | _What tests did you add or modify and why? If no tests were added or modified, explain why. Remove this line_ 24 | 25 | ### Documentation 26 | 27 | _What documentation did you add or modify and why? Add any relevant links then remove this line_ 28 | 29 | ### CheckList 30 | 31 | Make sure you have checked **all** steps below to ensure a timely review. 32 | 33 | - [ ] PR title addresses the issue accurately and concisely. Example: "Updates the version of Flask to v1.0.2" 34 | - In case you are adding a dependency, check if the license complies with the [ASF 3rd Party License Policy](https://www.apache.org/legal/resolved.html#category-x). 35 | - [ ] PR includes a summary of changes. 36 | - [ ] PR adds unit tests, updates existing unit tests, **OR** documents why no test additions or modifications are needed. 37 | - [ ] In case of new functionality, my PR adds documentation that describes how to use it. 38 | - All the public functions and the classes in the PR contain docstrings that explain what it does 39 | - [ ] PR passes `make test` 40 | -------------------------------------------------------------------------------- /databuilder/transformer/bigquery_usage_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from pyhocon import ConfigTree 5 | from typing import Optional, Tuple 6 | 7 | from databuilder.transformer.base_transformer import Transformer 8 | from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage 9 | from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple 10 | 11 | 12 | class BigqueryUsageTransformer(Transformer): 13 | 14 | def init(self, conf: ConfigTree) -> None: 15 | """ 16 | Transformer to convert TableColumnUsageTuple data to bigquery usage data 17 | which can be uploaded to Neo4j 18 | """ 19 | self.conf = conf 20 | 21 | def transform(self, record: Tuple[TableColumnUsageTuple, int]) -> Optional[TableColumnUsage]: 22 | if not record: 23 | return None 24 | 25 | (key, count) = record 26 | 27 | if not isinstance(key, TableColumnUsageTuple): 28 | raise Exception("BigqueryUsageTransformer expects record of type TableColumnUsageTuple") 29 | 30 | col_readers = [] 31 | col_readers.append(ColumnReader(database=key.database, 32 | cluster=key.cluster, 33 | schema=key.schema, 34 | table=key.table, 35 | column=key.column, 36 | user_email=key.email, 37 | read_count=count)) 38 | 39 | return TableColumnUsage(col_readers=col_readers) 40 | 41 | def get_scope(self) -> str: 42 | return 'transformer.bigquery_usage' 43 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_owner.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.models.dashboard.dashboard_owner import DashboardOwner 7 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ 8 | RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE 9 | from databuilder.serializers import neo4_serializer 10 | 11 | 12 | class TestDashboardOwner(unittest.TestCase): 13 | 14 | def test_dashboard_owner_nodes(self) -> None: 15 | dashboard_owner = DashboardOwner(email='foo@bar.com', cluster='cluster_id', product='product_id', 16 | dashboard_id='dashboard_id', dashboard_group_id='dashboard_group_id') 17 | 18 | actual = dashboard_owner.create_next_node() 19 | self.assertIsNone(actual) 20 | 21 | def test_dashboard_owner_relations(self) -> None: 22 | dashboard_owner = DashboardOwner(email='foo@bar.com', cluster='cluster_id', product='product_id', 23 | dashboard_id='dashboard_id', dashboard_group_id='dashboard_group_id') 24 | 25 | actual = dashboard_owner.create_next_relation() 26 | actual_serialized = neo4_serializer.serialize_relationship(actual) 27 | expected = {RELATION_END_KEY: 'foo@bar.com', RELATION_START_LABEL: 'Dashboard', RELATION_END_LABEL: 'User', 28 | RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id', 29 | RELATION_TYPE: 'OWNER', 30 | RELATION_REVERSE_TYPE: 'OWNER_OF'} 31 | assert actual is not None 32 | self.assertDictEqual(actual_serialized, expected) 33 | -------------------------------------------------------------------------------- /tests/unit/filesystem/test_filesystem.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | from datetime import datetime 6 | 7 | from mock import MagicMock 8 | from pyhocon import ConfigFactory 9 | from pytz import UTC 10 | 11 | from databuilder.filesystem.filesystem import FileSystem 12 | from databuilder.filesystem.metadata import FileMetadata 13 | 14 | 15 | class TestFileSystem(unittest.TestCase): 16 | 17 | def test_is_file(self) -> None: 18 | dask_fs = MagicMock() 19 | dask_fs.ls = MagicMock(return_value=['/foo/bar']) 20 | 21 | fs = FileSystem() 22 | conf = ConfigFactory.from_dict({FileSystem.DASK_FILE_SYSTEM: dask_fs}) 23 | fs.init(conf=conf) 24 | 25 | self.assertTrue(fs.is_file('/foo/bar')) 26 | 27 | dask_fs.ls = MagicMock(return_value=['bar', 'baz']) 28 | 29 | fs = FileSystem() 30 | conf = ConfigFactory.from_dict({FileSystem.DASK_FILE_SYSTEM: dask_fs}) 31 | fs.init(conf=conf) 32 | 33 | self.assertFalse(fs.is_file('foo')) 34 | 35 | def test_info(self) -> None: 36 | dask_fs = MagicMock() 37 | dask_fs.info = MagicMock(return_value={'LastModified': datetime(2018, 8, 14, 4, 12, 3, tzinfo=UTC), 38 | 'Size': 15093}) 39 | fs = FileSystem() 40 | conf = ConfigFactory.from_dict({FileSystem.DASK_FILE_SYSTEM: dask_fs}) 41 | fs.init(conf=conf) 42 | metadata = fs.info('/foo/bar') 43 | 44 | expected = FileMetadata(path='/foo/bar', last_updated=datetime(2018, 8, 14, 4, 12, 3, tzinfo=UTC), size=15093) 45 | 46 | self.assertEqual(metadata.__repr__(), expected.__repr__()) 47 | 48 | 49 | if __name__ == '__main__': 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /databuilder/loader/file_system_csv_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import csv 5 | import logging 6 | 7 | from pyhocon import ConfigTree 8 | from typing import Any 9 | 10 | from databuilder.loader.base_loader import Loader 11 | 12 | 13 | class FileSystemCSVLoader(Loader): 14 | """ 15 | Loader class to write csv files to Local FileSystem 16 | """ 17 | 18 | def init(self, conf: ConfigTree) -> None: 19 | """ 20 | Initialize file handlers from conf 21 | :param conf: 22 | """ 23 | self.conf = conf 24 | self.file_path = self.conf.get_string('file_path') 25 | self.file_mode = self.conf.get_string('mode', 'w') 26 | 27 | self.file_handler = open(self.file_path, self.file_mode) 28 | 29 | def load(self, record: Any) -> None: 30 | """ 31 | Write record object as csv to file 32 | :param record: 33 | :return: 34 | """ 35 | if not record: 36 | return 37 | 38 | if not hasattr(self, 'writer'): 39 | self.writer = csv.DictWriter(self.file_handler, 40 | fieldnames=vars(record).keys()) 41 | self.writer.writeheader() 42 | 43 | self.writer.writerow(vars(record)) 44 | self.file_handler.flush() 45 | 46 | def close(self) -> None: 47 | """ 48 | Close file handlers 49 | :return: 50 | """ 51 | try: 52 | if self.file_handler: 53 | self.file_handler.close() 54 | except Exception as e: 55 | logging.warning("Failed trying to close a file handler! %s", 56 | str(e)) 57 | 58 | def get_scope(self) -> str: 59 | return "loader.filesystem.csv" 60 | -------------------------------------------------------------------------------- /databuilder/utils/closer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import atexit 5 | 6 | from typing import Callable, List 7 | 8 | 9 | class Closer(object): 10 | """ 11 | A Closer class responsible for collecting closeable callables and 12 | closing them as a group. Every registered callable is guaranteed to be 13 | called, and only the last failure that occurs is propagated back. 14 | 15 | Registered callables are closed in LIFO order, as closeable 16 | instances can depend on each other.
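Example (an illustrative sketch; extractor and loader are hypothetical closeables):

    closer = Closer()
    closer.register(extractor.close)  # registered first, closed last
    closer.register(loader.close)     # registered last, closed first
    closer.close()                    # closes loader, then extractor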
17 | """ 18 | 19 | def __init__(self) -> None: 20 | self._stack: List[Callable] = [] 21 | atexit.register(self.close) 22 | 23 | def register(self, close_callable: Callable) -> None: 24 | """ 25 | Register a closeable callable. 26 | :param close_callable: 27 | :return: None 28 | """ 29 | if not callable(close_callable): 30 | raise RuntimeError('Only a callable can be registered: {}'.format( 31 | close_callable)) 32 | 33 | self._stack.append(close_callable) 34 | 35 | def close(self) -> None: 36 | """ 37 | Execute all closeable callables in LIFO order. 38 | All registered callables are guaranteed to be executed. If there 39 | are multiple failures, only the last one will be propagated. 40 | :return: 41 | """ 42 | if not self._stack: 43 | return 44 | 45 | last_exception = None 46 | while len(self._stack): 47 | try: 48 | close_callable = self._stack.pop() 49 | close_callable() 50 | except Exception as e: 51 | last_exception = e 52 | 53 | if last_exception: 54 | raise last_exception 55 | -------------------------------------------------------------------------------- /databuilder/models/dashboard_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List, Optional, Union 5 | 6 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 7 | 8 | 9 | class DashboardESDocument(ElasticsearchDocument): 10 | """ 11 | Schema for the dashboard ES document 12 | """ 13 | 14 | def __init__(self, 15 | group_name: str, 16 | name: str, 17 | description: Union[str, None], 18 | total_usage: int, 19 | product: Optional[str] = '', 20 | cluster: Optional[str] = '', 21 | group_description: Optional[str] = None, 22 | query_names: Union[List[str], None] = None, 23 | chart_names: Optional[List[str]] = None, 24 | group_url: Optional[str] = None, 25 | url: Optional[str] = None, 26 | uri: Optional[str] = None, 27 | last_successful_run_timestamp: Optional[int] = None, 28 | tags: Optional[List[str]] = None, 29 | badges: Optional[List[str]] = None, 30 | ) -> None: 31 | self.group_name = group_name 32 | self.name = name 33 | self.description = description 34 | self.cluster = cluster 35 | self.product = product 36 | self.group_url = group_url 37 | self.url = url 38 | self.uri = uri 39 | self.last_successful_run_timestamp = last_successful_run_timestamp 40 | self.total_usage = total_usage 41 | self.group_description = group_description 42 | self.query_names = query_names 43 | self.chart_names = chart_names 44 | self.tags = tags 45 | self.badges = badges 46 | -------------------------------------------------------------------------------- /tests/unit/extractor/test_generic_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.generic_extractor import GenericExtractor 10 | 11 | 12 | class TestGenericExtractor(unittest.TestCase): 13 | 14 | def test_extraction_with_model_class(self) -> None: 15 | """ 16 | Test Extraction using model class 17 | """ 18 | config_dict = { 19 | 'extractor.generic.extraction_items': [{'timestamp': 10000000}], 20 | 'extractor.generic.model_class': 21 | 'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated', 22 | } 23 | conf = ConfigFactory.from_dict(config_dict) 24 | 25 | extractor = GenericExtractor() 26 | self.conf = ConfigFactory.from_dict(config_dict) 27 | extractor.init(Scoped.get_scoped_conf(conf=conf, 28 | scope=extractor.get_scope())) 29 | 30 | result = extractor.extract() 31 | self.assertEqual(result.timestamp, 10000000) 32 | 33 | def test_extraction_without_model_class(self) -> None: 34 | """ 35 | Test extraction without a model class 36 | """ 37 | config_dict = { 38 | 'extractor.generic.extraction_items': [{'foo': 1}, {'bar': 2}], 39 | } 40 | conf = ConfigFactory.from_dict(config_dict) 41 | 42 | extractor = GenericExtractor() 43 | self.conf = ConfigFactory.from_dict(config_dict) 44 | extractor.init(Scoped.get_scoped_conf(conf=conf, 45 | scope=extractor.get_scope())) 46 | 47 | self.assertEqual(extractor.extract(), {'foo': 1}) 48 | self.assertEqual(extractor.extract(), {'bar': 2}) 49 | -------------------------------------------------------------------------------- /databuilder/callback/call_back.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | import logging 6 | 7 | from typing import List, Optional 8 | 9 | LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class Callback(object, metaclass=abc.ABCMeta): 13 | """ 14 | A callback interface that is expected to fire "on_success" if the operation succeeds, else "on_failure" if 15 | the operation fails. 16 | """ 17 | 18 | @abc.abstractmethod 19 | def on_success(self) -> None: 20 | """ 21 | A call back method that will be called when operation is successful 22 | :return: None 23 | """ 24 | pass 25 | 26 | @abc.abstractmethod 27 | def on_failure(self) -> None: 28 | """ 29 | A call back method that will be called when operation failed 30 | :return: None 31 | """ 32 | pass 33 | 34 | 35 | def notify_callbacks(callbacks: List[Callback], is_success: bool) -> None: 36 | """ 37 | A utility method that notifies callbacks. If any callback fails, it still goes through all the callbacks 38 | and raises the last exception it experienced.
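Example (an illustrative sketch; the callback instances are hypothetical):

    notify_callbacks([slack_callback, metrics_callback], is_success=True)
    # calls on_success() on each callback; with is_success=False it calls on_failure()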
39 | 40 | :param callbacks: 41 | :param is_success: 42 | :return: 43 | """ 44 | 45 | if not callbacks: 46 | LOGGER.info('No callbacks to notify') 47 | return 48 | 49 | LOGGER.info('Notifying callbacks') 50 | 51 | last_exception: Optional[Exception] = None 52 | for callback in callbacks: 53 | try: 54 | if is_success: 55 | callback.on_success() 56 | else: 57 | callback.on_failure() 58 | except Exception as e: 59 | LOGGER.exception('Failed while notifying callback') 60 | last_exception = e 61 | 62 | if last_exception: 63 | raise last_exception 64 | -------------------------------------------------------------------------------- /databuilder/extractor/postgres_metadata_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from pyhocon import ConfigFactory, ConfigTree # noqa: F401 5 | from typing import Iterator, Union, Dict, Any # noqa: F401 6 | 7 | from databuilder.extractor.base_postgres_metadata_extractor import BasePostgresMetadataExtractor 8 | 9 | 10 | class PostgresMetadataExtractor(BasePostgresMetadataExtractor): 11 | """ 12 | Extracts Postgres table and column metadata from underlying meta store database using SQLAlchemyExtractor 13 | """ 14 | 15 | def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): 16 | # type: (bool, str) -> str 17 | if use_catalog_as_cluster_name: 18 | cluster_source = "c.table_catalog" 19 | else: 20 | cluster_source = "'{}'".format(self._cluster) 21 | 22 | return """ 23 | SELECT 24 | {cluster_source} as cluster, c.table_schema as schema, c.table_name as name, pgtd.description as description 25 | ,c.column_name as col_name, c.data_type as col_type 26 | , pgcd.description as col_description, ordinal_position as col_sort_order 27 | FROM INFORMATION_SCHEMA.COLUMNS c 28 | INNER JOIN 29 | pg_catalog.pg_statio_all_tables as st on c.table_schema=st.schemaname and c.table_name=st.relname 30 | LEFT JOIN 31 | pg_catalog.pg_description pgcd on pgcd.objoid=st.relid and pgcd.objsubid=c.ordinal_position 32 | LEFT JOIN 33 | pg_catalog.pg_description pgtd on pgtd.objoid=st.relid and pgtd.objsubid=0 34 | {where_clause_suffix} 35 | ORDER by cluster, schema, name, col_sort_order ; 36 | """.format( 37 | cluster_source=cluster_source, 38 | where_clause_suffix=where_clause_suffix, 39 | ) 40 | 41 | def get_scope(self): 42 | # type: () -> str 43 | return 'extractor.postgres_metadata' 44 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_chained_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from pyhocon import ConfigFactory 8 | 9 | from databuilder.transformer.base_transformer import ChainedTransformer 10 | 11 | 12 | class TestChainedTransformer(unittest.TestCase): 13 | 14 | def test_init_not_called(self) -> None: 15 | 16 | mock_transformer1 = MagicMock() 17 | mock_transformer2 = MagicMock() 18 | chained_transformer = ChainedTransformer(transformers=[mock_transformer1, mock_transformer2]) 19 | 20 | config = ConfigFactory.from_dict({}) 21 | chained_transformer.init(conf=config) 22 | 23 | chained_transformer.transform( 24 | { 25 | 'foo': 'bar' 26 | } 27 | ) 28 | 29 | mock_transformer1.init.assert_not_called() 30 | mock_transformer1.transform.assert_called_once() 31 | mock_transformer2.init.assert_not_called() 32 | mock_transformer2.transform.assert_called_once() 33 | 34 | def test_init_called(self) -> None: 35 | 36 | mock_transformer1 = MagicMock() 37 | mock_transformer1.get_scope.return_value = 'foo' 38 | mock_transformer2 = MagicMock() 39 | mock_transformer2.get_scope.return_value = 'bar' 40 | chained_transformer = ChainedTransformer(transformers=[mock_transformer1, mock_transformer2], 41 | is_init_transformers=True) 42 | 43 | config = ConfigFactory.from_dict({}) 44 | chained_transformer.init(conf=config) 45 | 46 | chained_transformer.transform( 47 | { 48 | 'foo': 'bar' 49 | } 50 | ) 51 | 52 | mock_transformer1.init.assert_called_once() 53 | mock_transformer1.transform.assert_called_once() 54 | mock_transformer2.init.assert_called_once() 55 | mock_transformer2.transform.assert_called_once() 56 | -------------------------------------------------------------------------------- /databuilder/transformer/base_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any, Iterable, Optional 8 | 9 | from databuilder import Scoped 10 | 11 | 12 | class Transformer(Scoped): 13 | """ 14 | A transformer transforms a record 15 | """ 16 | @abc.abstractmethod 17 | def init(self, conf: ConfigTree) -> None: 18 | pass 19 | 20 | @abc.abstractmethod 21 | def transform(self, record: Any) -> Any: 22 | pass 23 | 24 | 25 | class NoopTransformer(Transformer): 26 | """ 27 | A no-op transformer 28 | """ 29 | 30 | def init(self, conf: ConfigTree) -> None: 31 | pass 32 | 33 | def transform(self, record: Any) -> Any: 34 | return record 35 | 36 | def get_scope(self) -> str: 37 | return '' 38 | 39 | 40 | class ChainedTransformer(Transformer): 41 | """ 42 | A chained transformer that iterates transformers and transforms a record 43 | """ 44 | 45 | def __init__(self, 46 | transformers: Iterable[Transformer], 47 | is_init_transformers: Optional[bool] = False) -> None: 48 | self.transformers = transformers 49 | self.is_init_transformers = is_init_transformers 50 | 51 | def init(self, conf: ConfigTree) -> None: 52 | if self.is_init_transformers: 53 | for transformer in self.transformers: 54 | transformer.init(Scoped.get_scoped_conf(conf, transformer.get_scope())) 55 | 56 | def transform(self, record: Any) -> Any: 57 | for t in self.transformers: 58 | record = t.transform(record) 59 | # Check filtered record 60 | if not record: 61 | return None 62 | 63 | return record 64 | 65 | def get_scope(self) -> str: 66 | return 'transformer.chained' 67 | 68 | def close(self) -> None: 69 | for t in self.transformers: 70 | t.close() 71 | -------------------------------------------------------------------------------- /databuilder/models/neo4j_es_last_updated.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List, Union 5 | 6 | from databuilder.models.graph_serializable import GraphSerializable 7 | from databuilder.models.graph_relationship import GraphRelationship 8 | from databuilder.models.graph_node import GraphNode 9 | 10 | 11 | class Neo4jESLastUpdated(GraphSerializable): 12 | """ 13 | Data model to keep track of the last updated timestamp for 14 | neo4j and es. 15 | """ 16 | 17 | LABEL = 'Updatedtimestamp' 18 | KEY = 'amundsen_updated_timestamp' 19 | LATEST_TIMESTAMP = 'latest_timestmap' 20 | 21 | def __init__(self, 22 | timestamp: int, 23 | ) -> None: 24 | """ 25 | :param timestamp: epoch for the latest updated timestamp for neo4j and es 26 | """ 27 | self.timestamp = timestamp 28 | self._node_iter = iter(self.create_nodes()) 29 | self._rel_iter = iter(self.create_relation()) 30 | 31 | def create_next_node(self) -> Union[GraphNode, None]: 32 | """ 33 | Will create an orphan node for the last updated timestamp. 34 | """ 35 | try: 36 | return next(self._node_iter) 37 | except StopIteration: 38 | return None 39 | 40 | def create_nodes(self) -> List[GraphNode]: 41 | """ 42 | Create a list of Neo4j node records.
43 | """ 44 | node = GraphNode( 45 | key=Neo4jESLastUpdated.KEY, 46 | label=Neo4jESLastUpdated.LABEL, 47 | attributes={ 48 | Neo4jESLastUpdated.LATEST_TIMESTAMP: self.timestamp 49 | } 50 | ) 51 | return [node] 52 | 53 | def create_next_relation(self) -> Union[GraphRelationship, None]: 54 | try: 55 | return next(self._rel_iter) 56 | except StopIteration: 57 | return None 58 | 59 | def create_relation(self) -> List[GraphRelationship]: 60 | return [] 61 | -------------------------------------------------------------------------------- /tests/unit/extractor/restapi/test_rest_api_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.extractor.restapi.rest_api_extractor import RestAPIExtractor, REST_API_QUERY, MODEL_CLASS, \ 9 | STATIC_RECORD_DICT 10 | from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata 11 | from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed 12 | 13 | 14 | class TestRestAPIExtractor(unittest.TestCase): 15 | 16 | def test_static_data(self) -> None: 17 | 18 | conf = ConfigFactory.from_dict( 19 | { 20 | REST_API_QUERY: RestApiQuerySeed(seed_record=[{'foo': 'bar'}]), 21 | STATIC_RECORD_DICT: {'john': 'doe'} 22 | } 23 | ) 24 | extractor = RestAPIExtractor() 25 | extractor.init(conf=conf) 26 | 27 | record = extractor.extract() 28 | expected = {'foo': 'bar', 'john': 'doe'} 29 | 30 | self.assertDictEqual(expected, record) 31 | 32 | def test_model_construction(self) -> None: 33 | conf = ConfigFactory.from_dict( 34 | { 35 | REST_API_QUERY: RestApiQuerySeed( 36 | seed_record=[{'dashboard_group': 'foo', 37 | 'dashboard_name': 'bar', 38 | 'description': 'john', 39 | 'dashboard_group_description': 'doe'}]), 40 | MODEL_CLASS: 'databuilder.models.dashboard.dashboard_metadata.DashboardMetadata', 41 | } 42 | ) 43 | extractor = RestAPIExtractor() 44 | extractor.init(conf=conf) 45 | 46 | record = extractor.extract() 47 | expected = DashboardMetadata(dashboard_group='foo', dashboard_name='bar', description='john', 48 | dashboard_group_description='doe') 49 | 50 | self.assertEqual(expected.__repr__(), record.__repr__()) 51 | -------------------------------------------------------------------------------- /tests/unit/callback/test_call_back.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from typing import List 8 | 9 | from databuilder.callback.call_back import Callback, notify_callbacks 10 | 11 | 12 | class TestCallBack(unittest.TestCase): 13 | 14 | def test_success_notify(self) -> None: 15 | callback1 = MagicMock() 16 | callback2 = MagicMock() 17 | callbacks: List[Callback] = [callback1, callback2] 18 | 19 | notify_callbacks(callbacks, is_success=True) 20 | 21 | self.assertTrue(callback1.on_success.called) 22 | self.assertTrue(not callback1.on_failure.called) 23 | self.assertTrue(callback2.on_success.called) 24 | self.assertTrue(not callback2.on_failure.called) 25 | 26 | def test_failure_notify(self) -> None: 27 | callback1 = MagicMock() 28 | callback2 = MagicMock() 29 | callbacks: List[Callback] = [callback1, callback2] 30 | 31 | notify_callbacks(callbacks, is_success=False) 32 | 33 | self.assertTrue(not callback1.on_success.called) 34 | self.assertTrue(callback1.on_failure.called) 35 | self.assertTrue(not callback2.on_success.called) 36 | self.assertTrue(callback2.on_failure.called) 37 | 38 | def test_notify_failure(self) -> None: 39 | callback1 = MagicMock() 40 | callback2 = MagicMock() 41 | callback2.on_success.side_effect = Exception('Boom') 42 | callback3 = MagicMock() 43 | callbacks: List[Callback] = [callback1, callback2, callback3] 44 | 45 | with self.assertRaises(Exception): 46 | notify_callbacks(callbacks, is_success=True) 47 | 48 | self.assertTrue(callback1.on_success.called) 49 | self.assertTrue(callback2.on_success.called) 50 | self.assertTrue(callback3.on_success.called) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /databuilder/models/table_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List, Optional 5 | 6 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 7 | 8 | 9 | class TableESDocument(ElasticsearchDocument): 10 | """ 11 | Schema for the Search index document 12 | """ 13 | 14 | def __init__(self, 15 | database: str, 16 | cluster: str, 17 | schema: str, 18 | name: str, 19 | key: str, 20 | description: str, 21 | last_updated_timestamp: Optional[int], 22 | column_names: List[str], 23 | column_descriptions: List[str], 24 | total_usage: int, 25 | unique_usage: int, 26 | tags: List[str], 27 | badges: Optional[List[str]] = None, 28 | display_name: Optional[str] = None, 29 | schema_description: Optional[str] = None, 30 | programmatic_descriptions: List[str] = [], 31 | ) -> None: 32 | self.database = database 33 | self.cluster = cluster 34 | self.schema = schema 35 | self.name = name 36 | self.display_name = display_name if display_name else '{schema}.{table}'.format(schema=schema, table=name) 37 | self.key = key 38 | self.description = description 39 | # todo: use last_updated_timestamp to match the record in metadata 40 | self.last_updated_timestamp = int(last_updated_timestamp) if last_updated_timestamp else None 41 | self.column_names = column_names 42 | self.column_descriptions = column_descriptions 43 | self.total_usage = total_usage 44 | self.unique_usage = unique_usage 45 | # todo: will include tag_type once we have better understanding from UI flow.
46 | self.tags = tags 47 | self.badges = badges 48 | self.schema_description = schema_description 49 | self.programmatic_descriptions = programmatic_descriptions 50 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_regex_str_replace_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | from typing import Any 8 | 9 | from databuilder.transformer.regex_str_replace_transformer import RegexStrReplaceTransformer, \ 10 | REGEX_REPLACE_TUPLE_LIST, ATTRIBUTE_NAME 11 | 12 | 13 | class TestRegexReplacement(unittest.TestCase): 14 | 15 | def test(self) -> None: 16 | transformer = self._default_test_transformer() 17 | 18 | foo = Foo('abc') 19 | actual = transformer.transform(foo) 20 | 21 | self.assertEqual('bba', actual.val) 22 | 23 | def test_numeric_val(self) -> None: 24 | transformer = self._default_test_transformer() 25 | 26 | foo = Foo(6) 27 | actual = transformer.transform(foo) 28 | 29 | self.assertEqual(6, actual.val) 30 | 31 | def test_none_val(self) -> None: 32 | transformer = self._default_test_transformer() 33 | 34 | foo = Foo(None) 35 | actual = transformer.transform(foo) 36 | 37 | self.assertEqual(None, actual.val) 38 | 39 | def _default_test_transformer(self) -> RegexStrReplaceTransformer: 40 | config = ConfigFactory.from_dict({ 41 | REGEX_REPLACE_TUPLE_LIST: [('a', 'b'), ('c', 'a')], 42 | ATTRIBUTE_NAME: 'val' 43 | }) 44 | 45 | transformer = RegexStrReplaceTransformer() 46 | transformer.init(config) 47 | 48 | return transformer 49 | 50 | def test_dict_replace(self) -> None: 51 | config = ConfigFactory.from_dict({ 52 | REGEX_REPLACE_TUPLE_LIST: [('\\', '\\\\')], 53 | ATTRIBUTE_NAME: 'val' 54 | }) 55 | 56 | transformer = RegexStrReplaceTransformer() 57 | transformer.init(config) 58 | 59 | d = {'val': '\\'} 60 | 61 | actual = transformer.transform(d) 62 | 63 | self.assertEqual({'val': '\\\\'}, actual) 64 | 65 | 66 | class Foo(object): 67 | def __init__(self, val: Any) -> None: 68 | self.val = val 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_column_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage 7 | from typing import no_type_check 8 | from databuilder.serializers import neo4_serializer 9 | 10 | 11 | class TestTableColumnUsage(unittest.TestCase): 12 | 13 | @no_type_check # mypy is somehow complaining on assignment on expected dict. 
14 | def test_serialize(self) -> None: 15 | 16 | col_readers = [ColumnReader(database='db', cluster='gold', schema='scm', table='foo', column='*', 17 | user_email='john@example.com')] 18 | table_col_usage = TableColumnUsage(col_readers=col_readers) 19 | 20 | node_row = table_col_usage.next_node() 21 | actual = [] 22 | while node_row: 23 | 24 | actual.append(neo4_serializer.serialize_node(node_row)) 25 | node_row = table_col_usage.next_node() 26 | 27 | expected = [{'first_name': '', 28 | 'last_name': '', 29 | 'full_name': '', 30 | 'employee_type': '', 31 | 'is_active:UNQUOTED': True, 32 | 'updated_at:UNQUOTED': 0, 33 | 'LABEL': 'User', 34 | 'slack_id': '', 35 | 'KEY': 'john@example.com', 36 | 'github_username': '', 37 | 'team_name': '', 38 | 'email': 'john@example.com', 39 | 'role_name': ''}] 40 | self.assertEqual(expected, actual) 41 | 42 | rel_row = table_col_usage.next_relation() 43 | actual = [] 44 | while rel_row: 45 | actual.append(neo4_serializer.serialize_relationship(rel_row)) 46 | rel_row = table_col_usage.next_relation() 47 | 48 | expected = [{'read_count:UNQUOTED': 1, 'END_KEY': 'john@example.com', 'START_LABEL': 'Table', 49 | 'END_LABEL': 'User', 'START_KEY': 'db://gold.scm/foo', 'TYPE': 'READ_BY', 'REVERSE_TYPE': 'READ'}] 50 | self.assertEqual(expected, actual) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /databuilder/loader/file_system_elasticsearch_json_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder.loader.base_loader import Loader 9 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 10 | 11 | 12 | class FSElasticsearchJSONLoader(Loader): 13 | """ 14 | Loader class to produce Elasticsearch bulk load file to Local FileSystem 15 | """ 16 | FILE_PATH_CONFIG_KEY = 'file_path' 17 | FILE_MODE_CONFIG_KEY = 'mode' 18 | 19 | def init(self, conf: ConfigTree) -> None: 20 | """ 21 | 22 | :param conf: 23 | :return: 24 | """ 25 | self.conf = conf 26 | self.file_path = self.conf.get_string(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY) 27 | self.file_mode = self.conf.get_string(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY, 'w') 28 | 29 | file_dir = self.file_path.rsplit('/', 1)[0] 30 | self._ensure_directory_exists(file_dir) 31 | self.file_handler = open(self.file_path, self.file_mode) 32 | 33 | def _ensure_directory_exists(self, path: str) -> None: 34 | """ 35 | Check to ensure file directory exists; create the directories otherwise 36 | :param path: 37 | :return: None 38 | """ 39 | if os.path.exists(path): 40 | return # nothing to do here 41 | 42 | os.makedirs(path) 43 | 44 | def load(self, record: ElasticsearchDocument) -> None: 45 | """ 46 | Write a record in json format to file 47 | :param record: 48 | :return: 49 | """ 50 | if not record: 51 | return 52 | 53 | if not isinstance(record, ElasticsearchDocument): 54 | raise Exception("Record not of type 'ElasticsearchDocument'!") 55 | 56 | self.file_handler.write(record.to_json()) 57 | self.file_handler.flush() 58 | 59 | def close(self) -> None: 60 | """ 61 | close the file handler 62 | :return: 63 | """ 64 | if self.file_handler: 65 | self.file_handler.close() 66 | 67 | def get_scope(self) -> str: 68 | return 'loader.filesystem.elasticsearch' 69 | 
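For reference, a minimal usage sketch of FSElasticsearchJSONLoader wired by hand (not part of the repo; the output path and document values below are hypothetical, and TableESDocument is the document model shown earlier in this dump):

from pyhocon import ConfigFactory

from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
from databuilder.models.table_elasticsearch_document import TableESDocument

loader = FSElasticsearchJSONLoader()
# 'file_path' and 'mode' map to FILE_PATH_CONFIG_KEY and FILE_MODE_CONFIG_KEY above
loader.init(ConfigFactory.from_dict({
    'file_path': '/tmp/amundsen/search_data.json',  # hypothetical output location
    'mode': 'w',
}))

# Each load() call appends one newline-terminated JSON document to the file.
doc = TableESDocument(database='hive', cluster='gold', schema='core', name='orders',
                      key='hive://gold.core/orders', description='Orders fact table',
                      last_updated_timestamp=None, column_names=['id'],
                      column_descriptions=['primary key'], total_usage=0, unique_usage=0,
                      tags=[])
loader.load(doc)
loader.close()

When the loader is driven by DefaultTask instead of called directly, the same keys are supplied under its 'loader.filesystem.elasticsearch' scope.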
-------------------------------------------------------------------------------- /databuilder/serializers/neo4_serializer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import Dict, Any, Optional 5 | 6 | from databuilder.models.graph_relationship import GraphRelationship 7 | from databuilder.models.graph_node import GraphNode 8 | from databuilder.models.graph_serializable import ( 9 | NODE_LABEL, 10 | NODE_KEY, 11 | RELATION_END_KEY, 12 | RELATION_END_LABEL, 13 | RELATION_REVERSE_TYPE, 14 | RELATION_START_KEY, 15 | RELATION_START_LABEL, 16 | RELATION_TYPE 17 | ) 18 | from databuilder.publisher.neo4j_csv_publisher import UNQUOTED_SUFFIX 19 | 20 | 21 | def serialize_node(node: Optional[GraphNode]) -> Dict[str, Any]: 22 | if node is None: 23 | return {} 24 | 25 | node_dict = { 26 | NODE_LABEL: node.label, 27 | NODE_KEY: node.key 28 | } 29 | for key, value in node.attributes.items(): 30 | key_suffix = _get_neo4j_suffix_value(value) 31 | formatted_key = "{key}{suffix}".format( 32 | key=key, 33 | suffix=key_suffix 34 | ) 35 | node_dict[formatted_key] = value 36 | return node_dict 37 | 38 | 39 | def serialize_relationship(relationship: Optional[GraphRelationship]) -> Dict[str, Any]: 40 | if relationship is None: 41 | return {} 42 | 43 | relationship_dict = { 44 | RELATION_START_KEY: relationship.start_key, 45 | RELATION_START_LABEL: relationship.start_label, 46 | RELATION_END_KEY: relationship.end_key, 47 | RELATION_END_LABEL: relationship.end_label, 48 | RELATION_TYPE: relationship.type, 49 | RELATION_REVERSE_TYPE: relationship.reverse_type, 50 | } 51 | for key, value in relationship.attributes.items(): 52 | key_suffix = _get_neo4j_suffix_value(value) 53 | formatted_key = "{key}{suffix}".format( 54 | key=key, 55 | suffix=key_suffix 56 | ) 57 | relationship_dict[formatted_key] = value 58 | 59 | return relationship_dict 60 | 61 | 62 | def _get_neo4j_suffix_value(value: Any) -> str: 63 | if isinstance(value, int): 64 | return UNQUOTED_SUFFIX 65 | 66 | if isinstance(value, bool): 67 | return UNQUOTED_SUFFIX 68 | 69 | return '' 70 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_query.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.models.dashboard.dashboard_query import DashboardQuery 7 | from databuilder.models.graph_serializable import NODE_KEY, \ 8 | NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ 9 | RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE 10 | from databuilder.serializers import neo4_serializer 11 | 12 | 13 | class TestDashboardQuery(unittest.TestCase): 14 | 15 | def test_create_nodes(self) -> None: 16 | 17 | dashboard_query = DashboardQuery(dashboard_group_id='dg_id', 18 | dashboard_id='d_id', 19 | query_id='q_id', 20 | query_name='q_name', 21 | url='http://foo.bar/query/baz', 22 | query_text='SELECT * FROM foo.bar') 23 | 24 | actual = dashboard_query.create_next_node() 25 | actual_serialized = neo4_serializer.serialize_node(actual) 26 | expected = {'url': 'http://foo.bar/query/baz', 'name': 'q_name', 'id': 'q_id', 27 | 'query_text': 'SELECT * FROM foo.bar', 28 | NODE_KEY: '_dashboard://gold.dg_id/d_id/query/q_id', 29 | NODE_LABEL: DashboardQuery.DASHBOARD_QUERY_LABEL} 30 | 31 | self.assertEqual(expected, actual_serialized) 32 | 33 | def test_create_relation(self) -> None: 34 | dashboard_query = DashboardQuery(dashboard_group_id='dg_id', 35 | dashboard_id='d_id', 36 | query_id='q_id', 37 | query_name='q_name') 38 | 39 | actual = dashboard_query.create_next_relation() 40 | actual_serialized = neo4_serializer.serialize_relationship(actual) 41 | expected = {RELATION_END_KEY: '_dashboard://gold.dg_id/d_id/query/q_id', RELATION_START_LABEL: 'Dashboard', 42 | RELATION_END_LABEL: DashboardQuery.DASHBOARD_QUERY_LABEL, 43 | RELATION_START_KEY: '_dashboard://gold.dg_id/d_id', RELATION_TYPE: 'HAS_QUERY', 44 | RELATION_REVERSE_TYPE: 'QUERY_OF'} 45 | 46 | self.assertEqual(expected, actual_serialized) 47 | -------------------------------------------------------------------------------- /databuilder/extractor/restapi/rest_api_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import importlib 6 | from typing import Any, Iterator, Dict, Optional 7 | 8 | from pyhocon import ConfigTree 9 | 10 | from databuilder.extractor.base_extractor import Extractor 11 | from databuilder.rest_api.base_rest_api_query import BaseRestApiQuery 12 | 13 | 14 | REST_API_QUERY = 'restapi_query' 15 | MODEL_CLASS = 'model_class' 16 | 17 | # Static record that will be added into extracted record 18 | # For example, DashboardMetadata requires product name (static name) of Dashboard and REST api does not provide 19 | # it. and you can add {'product': 'mode'} so that it will be included in the record. 20 | STATIC_RECORD_DICT = 'static_record_dict' 21 | 22 | LOGGER = logging.getLogger(__name__) 23 | 24 | 25 | class RestAPIExtractor(Extractor): 26 | """ 27 | An Extractor that calls one or more REST API to extract the data. 28 | This extractor almost entirely depends on RestApiQuery. 
29 | """ 30 | 31 | def init(self, conf: ConfigTree) -> None: 32 | 33 | self._restapi_query: BaseRestApiQuery = conf.get(REST_API_QUERY) 34 | self._iterator: Optional[Iterator[Dict[str, Any]]] = None 35 | self._static_dict = conf.get(STATIC_RECORD_DICT, dict()) 36 | LOGGER.info('static record: {}'.format(self._static_dict)) 37 | 38 | model_class = conf.get(MODEL_CLASS, None) 39 | if model_class: 40 | module_name, class_name = model_class.rsplit(".", 1) 41 | mod = importlib.import_module(module_name) 42 | self.model_class = getattr(mod, class_name) 43 | 44 | def extract(self) -> Any: 45 | """ 46 | Fetch one result row from RestApiQuery, convert to {model_class} if specified before 47 | returning. 48 | :return: 49 | """ 50 | 51 | if not self._iterator: 52 | self._iterator = self._restapi_query.execute() 53 | 54 | try: 55 | record = next(self._iterator) 56 | except StopIteration: 57 | return None 58 | 59 | if self._static_dict: 60 | record.update(self._static_dict) 61 | 62 | if hasattr(self, 'model_class'): 63 | return self.model_class(**record) 64 | 65 | return record 66 | 67 | def get_scope(self) -> str: 68 | 69 | return 'extractor.restapi' 70 | -------------------------------------------------------------------------------- /databuilder/extractor/user/bamboohr/bamboohr_user_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | from pyhocon import ConfigTree 6 | import requests 7 | from requests.auth import HTTPBasicAuth 8 | from typing import Iterator, Optional 9 | from xml.etree import ElementTree 10 | 11 | from databuilder.extractor.base_extractor import Extractor 12 | from databuilder.models.user import User 13 | 14 | 15 | class BamboohrUserExtractor(Extractor): 16 | API_KEY = 'api_key' 17 | SUBDOMAIN = 'subdomain' 18 | 19 | def init(self, conf: ConfigTree) -> None: 20 | self._extract_iter: Optional[Iterator] = None 21 | self._extract_iter = None 22 | 23 | self._api_key = conf.get_string(BamboohrUserExtractor.API_KEY) 24 | self._subdomain = conf.get_string(BamboohrUserExtractor.SUBDOMAIN) 25 | 26 | def extract(self) -> Optional[User]: 27 | if not self._extract_iter: 28 | self._extract_iter = self._get_extract_iter() 29 | try: 30 | return next(self._extract_iter) 31 | except StopIteration: 32 | return None 33 | 34 | def _employee_directory_uri(self) -> str: 35 | return 'https://api.bamboohr.com/api/gateway.php/{subdomain}/v1/employees/directory'.format( 36 | subdomain=self._subdomain 37 | ) 38 | 39 | def _get_extract_iter(self) -> Iterator[User]: 40 | response = requests.get( 41 | self._employee_directory_uri(), auth=HTTPBasicAuth(self._api_key, 'x') 42 | ) 43 | 44 | root = ElementTree.fromstring(response.content) 45 | 46 | for user in root.findall('./employees/employee'): 47 | 48 | def get_field(name: str) -> str: 49 | field = user.find('./field[@id=\'{name}\']'.format(name=name)) 50 | if field is not None and field.text is not None: 51 | return field.text 52 | else: 53 | return '' 54 | 55 | yield User( 56 | email=get_field('workEmail'), 57 | first_name=get_field('firstName'), 58 | last_name=get_field('lastName'), 59 | name=get_field('displayName'), 60 | team_name=get_field('department'), 61 | role_name=get_field('jobTitle'), 62 | ) 63 | 64 | def get_scope(self) -> str: 65 | return 'extractor.bamboohr_user' 66 | -------------------------------------------------------------------------------- 
/tests/unit/extractor/test_kafka_source_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from mock import patch, MagicMock 6 | import unittest 7 | 8 | from pyhocon import ConfigFactory 9 | 10 | from databuilder import Scoped 11 | from databuilder.extractor.kafka_source_extractor import KafkaSourceExtractor 12 | 13 | 14 | class TestKafkaSourceExtractor(unittest.TestCase): 15 | def setUp(self) -> None: 16 | logging.basicConfig(level=logging.INFO) 17 | config_dict = { 18 | 'extractor.kafka_source.consumer_config': {'"group.id"': 'consumer-group', 19 | '"enable.auto.commit"': False}, 20 | 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.RAW_VALUE_TRANSFORMER): 21 | 'databuilder.transformer.base_transformer.NoopTransformer', 22 | 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.TOPIC_NAME_LIST): ['test-topic'], 23 | 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.CONSUMER_TOTAL_TIMEOUT_SEC): 1, 24 | 25 | } 26 | self.conf = ConfigFactory.from_dict(config_dict) 27 | 28 | def test_consume_success(self) -> None: 29 | kafka_extractor = KafkaSourceExtractor() 30 | kafka_extractor.init(Scoped.get_scoped_conf(conf=self.conf, 31 | scope=kafka_extractor.get_scope())) 32 | 33 | with patch.object(kafka_extractor, 'consumer') as mock_consumer: 34 | 35 | mock_poll = MagicMock() 36 | mock_poll.error.return_value = False 37 | # only return once 38 | mock_poll.value.side_effect = ['msg'] 39 | mock_consumer.poll.return_value = mock_poll 40 | 41 | records = kafka_extractor.consume() 42 | self.assertEqual(len(records), 1) 43 | 44 | def test_consume_fail(self) -> None: 45 | kafka_extractor = KafkaSourceExtractor() 46 | kafka_extractor.init(Scoped.get_scoped_conf(conf=self.conf, 47 | scope=kafka_extractor.get_scope())) 48 | 49 | with patch.object(kafka_extractor, 'consumer') as mock_consumer: 50 | mock_poll = MagicMock() 51 | mock_poll.error.return_value = True 52 | mock_consumer.poll.return_value = mock_poll 53 | 54 | records = kafka_extractor.consume() 55 | self.assertEqual(len(records), 0) 56 | -------------------------------------------------------------------------------- /tests/unit/models/test_user_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.user_elasticsearch_document import UserESDocument 8 | 9 | 10 | class TestUserElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | test_obj = UserESDocument(email='test@email.com', 17 | first_name='test_firstname', 18 | last_name='test_lastname', 19 | full_name='full_name', 20 | github_username='github_user', 21 | team_name='team', 22 | employee_type='fte', 23 | manager_email='test_manager', 24 | slack_id='test_slack', 25 | role_name='role_name', 26 | is_active=True, 27 | total_read=2, 28 | total_own=3, 29 | total_follow=1) 30 | 31 | expected_document_dict = {"first_name": "test_firstname", 32 | "last_name": "test_lastname", 33 | "full_name": "full_name", 34 | "team_name": "team", 35 | "total_follow": 1, 36 | "total_read": 2, 37 | "is_active": True, 38 | "total_own": 3, 39 | "slack_id": 'test_slack', 40 | "role_name": 'role_name', 41 | "manager_email": "test_manager", 42 | 'github_username': "github_user", 43 | "employee_type": 'fte', 44 | "email": "test@email.com", 45 | } 46 | 47 | result = test_obj.to_json() 48 | results = result.split("\n") 49 | 50 | # verify two new line characters in result 51 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 52 | 53 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 54 | -------------------------------------------------------------------------------- /databuilder/publisher/base_publisher.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | from typing import List 8 | 9 | from databuilder import Scoped 10 | from databuilder.callback import call_back 11 | from databuilder.callback.call_back import Callback 12 | 13 | 14 | class Publisher(Scoped): 15 | """ 16 | A Publisher writes a dataset (not individual records) in an atomic 17 | manner, if possible 18 | (either success or failure, with no partial state). 19 | Use case: if you want to use the Neo4j import utility or the LOAD CSV 20 | utility, which take a CSV file to load the database, you first need to 21 | create that CSV file. A CSV file holds a number of records, and a 22 | loader can write multiple records to it. Once the loader finishes 23 | writing, you have a complete CSV file, ready to publish to Neo4j. 24 | The Publisher can take the location of the CSV file and push it to Neo4j. 25 | 26 | """ 27 | 28 | def __init__(self) -> None: 29 | self.call_backs: List[Callback] = [] 30 | 31 | @abc.abstractmethod 32 | def init(self, conf: ConfigTree) -> None: 33 | pass 34 | 35 | def publish(self) -> None: 36 | try: 37 | self.publish_impl() 38 | except Exception: 39 | call_back.notify_callbacks(self.call_backs, is_success=False) 40 | raise 41 | call_back.notify_callbacks(self.call_backs, is_success=True) 42 | 43 | @abc.abstractmethod 44 | def publish_impl(self) -> None: 45 | """ 46 | An implementation of the publish method.
A subclass of Publisher is expected to implement the publish logic by overriding 47 | this method 48 | :return: None 49 | """ 50 | pass 51 | 52 | def register_call_back(self, callback: Callback) -> None: 53 | """ 54 | Register any callback that needs to be notified when the publisher either successfully publishes 55 | or fails to publish 56 | :param callback: 57 | :return: None 58 | """ 59 | self.call_backs.append(callback) 60 | 61 | def get_scope(self) -> str: 62 | return 'publisher' 63 | 64 | 65 | class NoopPublisher(Publisher): 66 | def __init__(self) -> None: 67 | super(NoopPublisher, self).__init__() 68 | 69 | def init(self, conf: ConfigTree) -> None: 70 | pass 71 | 72 | def publish_impl(self) -> None: 73 | pass 74 | 75 | def get_scope(self) -> str: 76 | return 'publisher.noop' 77 | -------------------------------------------------------------------------------- /databuilder/extractor/sql_alchemy_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | from sqlalchemy import create_engine 6 | 7 | from pyhocon import ConfigTree 8 | from typing import Any 9 | 10 | from databuilder.extractor.base_extractor import Extractor 11 | 12 | 13 | class SQLAlchemyExtractor(Extractor): 14 | # Config keys 15 | CONN_STRING = 'conn_string' 16 | EXTRACT_SQL = 'extract_sql' 17 | """ 18 | An Extractor that extracts records via SQLAlchemy. Any database that supports SQLAlchemy can use this extractor 19 | """ 20 | 21 | def init(self, conf: ConfigTree) -> None: 22 | """ 23 | Establish connections and import data model class if provided 24 | :param conf: 25 | """ 26 | self.conf = conf 27 | self.conn_string = conf.get_string(SQLAlchemyExtractor.CONN_STRING) 28 | self.connection = self._get_connection() 29 | 30 | self.extract_sql = conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL) 31 | 32 | model_class = conf.get('model_class', None) 33 | if model_class: 34 | module_name, class_name = model_class.rsplit(".", 1) 35 | mod = importlib.import_module(module_name) 36 | self.model_class = getattr(mod, class_name) 37 | self._execute_query() 38 | 39 | def _get_connection(self) -> Any: 40 | """ 41 | Create a SQLAlchemy connection to Database 42 | """ 43 | engine = create_engine(self.conn_string) 44 | conn = engine.connect() 45 | return conn 46 | 47 | def _execute_query(self) -> None: 48 | """ 49 | Create an iterator to execute sql. 50 | """ 51 | if not hasattr(self, 'results'): 52 | self.results = self.connection.execute(self.extract_sql) 53 | 54 | if hasattr(self, 'model_class'): 55 | results = [self.model_class(**result) 56 | for result in self.results] 57 | else: 58 | results = self.results 59 | self.iter = iter(results) 60 | 61 | def extract(self) -> Any: 62 | """ 63 | Yield the sql result one at a time. 64 | Convert the result to a model if a model_class is provided 65 | """ 66 | try: 67 | return next(self.iter) 68 | except StopIteration: 69 | return None 70 | 71 | def get_scope(self) -> str: 72 | return 'extractor.sqlalchemy' 73 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from setuptools import setup, find_packages 5 | 6 | 7 | __version__ = '4.0.3' 8 | 9 | 10 | requirements = [ 11 | "neo4j-driver>=1.7.2,<4.0", 12 | "pytz>=2018.4", 13 | "statsd>=3.2.1", 14 | "retrying>=1.3.3", 15 | "requests>=2.23.0,<3.0", 16 | "elasticsearch>=6.2.0,<7.0", 17 | "pyhocon>=0.3.42", 18 | "unidecode", 19 | "Jinja2>=2.10.0,<2.12", 20 | "pandas>=0.21.0,<1.2.0" 21 | ] 22 | 23 | kafka = ['confluent-kafka==1.0.0'] 24 | 25 | cassandra = ['cassandra-driver==3.20.1'] 26 | 27 | glue = ['boto3==1.10.1'] 28 | 29 | snowflake = [ 30 | 'snowflake-connector-python', 31 | 'snowflake-sqlalchemy' 32 | ] 33 | 34 | athena = ['PyAthena[SQLAlchemy]>=1.0.0'] 35 | 36 | # Python API client for google 37 | # License: Apache Software License 38 | # Upstream url: https://github.com/googleapis/google-api-python-client 39 | bigquery = [ 40 | 'google-api-python-client>=1.6.0, <2.0.0dev', 41 | 'google-auth-httplib2>=0.0.1', 42 | 'google-auth>=1.0.0, <2.0.0dev' 43 | ] 44 | 45 | jsonpath = ['jsonpath_rw==1.4.0'] 46 | 47 | db2 = [ 48 | 'ibm_db==3.0.1', 49 | 'ibm-db-sa-py3==0.3.1-1' 50 | ] 51 | 52 | dremio = [ 53 | 'pyodbc==4.0.30' 54 | ] 55 | 56 | druid = [ 57 | 'pydruid' 58 | ] 59 | 60 | spark = [ 61 | 'pyspark == 3.0.1' 62 | ] 63 | 64 | all_deps = requirements + kafka + cassandra + glue + snowflake + athena + \ 65 | bigquery + jsonpath + db2 + dremio + druid + spark 66 | 67 | setup( 68 | name='amundsen-databuilder', 69 | version=__version__, 70 | description='Amundsen Data builder', 71 | url='https://www.github.com/amundsen-io/amundsendatabuilder', 72 | maintainer='Amundsen TSC', 73 | maintainer_email='amundsen-tsc@lists.lfai.foundation', 74 | packages=find_packages(exclude=['tests*']), 75 | dependency_links=[], 76 | install_requires=requirements, 77 | python_requires='>=3.6', 78 | extras_require={ 79 | 'all': all_deps, 80 | 'kafka': kafka, # To use with Kafka source extractor 81 | 'cassandra': cassandra, 82 | 'glue': glue, 83 | 'snowflake': snowflake, 84 | 'athena': athena, 85 | 'bigquery': bigquery, 86 | 'jsonpath': jsonpath, 87 | 'db2': db2, 88 | 'dremio': dremio, 89 | 'druid': druid, 90 | 'delta-lake': spark 91 | }, 92 | classifiers=[ 93 | 'Programming Language :: Python :: 3.6', 94 | 'Programming Language :: Python :: 3.7', 95 | ], 96 | ) 97 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_usage_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any 8 | 9 | from databuilder.extractor.base_extractor import Extractor 10 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils 11 | from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery 12 | from databuilder.rest_api.rest_api_query import RestApiQuery 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | 17 | class ModeDashboardUsageExtractor(Extractor): 18 | """ 19 | An Extractor that extracts a Mode dashboard's accumulated view count 20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | self._conf = conf 24 | 25 | restapi_query = self._build_restapi_query() 26 | self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(restapi_query=restapi_query, 27 | conf=self._conf) 28 | 29 | def extract(self) -> Any: 30 | return self._extractor.extract() 31 | 32 | def get_scope(self) -> str: 33 | return 'extractor.mode_dashboard_usage' 34 | 35 | def _build_restapi_query(self) -> RestApiQuery: 36 | """ 37 | Build the REST API query. To get Mode Dashboard usage, it needs to call two APIs (the spaces API and the 38 | reports API) and join the results. 39 | :return: A RestApiQuery that provides Mode Dashboard metadata 40 | """ 41 | 42 | # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace 43 | reports_url_template = 'https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports' 44 | 45 | spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf) 46 | params = ModeDashboardUtils.get_auth_params(conf=self._conf) 47 | 48 | # Reports 49 | # JSONPath expression: it goes into the array located at _embedded.reports and then extracts token 50 | # and view_count 51 | json_path = '_embedded.reports[*].[token,view_count]' 52 | field_names = ['dashboard_id', 'accumulated_view_count'] 53 | reports_query = ModePaginatedRestApiQuery(query_to_join=spaces_query, url=reports_url_template, params=params, 54 | json_path=json_path, field_names=field_names, skip_no_result=True) 55 | return reports_query 56 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_bigquery_usage_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.bigquery_usage_transformer import BigqueryUsageTransformer 9 | from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple 10 | from databuilder.models.table_column_usage import TableColumnUsage 11 | 12 | 13 | class TestBigQueryUsageTransform(unittest.TestCase): 14 | 15 | DATABASE = 'bigquery' 16 | CLUSTER = 'your-project-here' 17 | DATASET = 'dataset' 18 | TABLE = 'table' 19 | COLUMN = '*' 20 | EMAIL = 'your-user-here@test.com' 21 | READ_COUNT = 305 22 | 23 | def test_transform_function(self) -> None: 24 | config = ConfigFactory.from_dict({}) 25 | 26 | transformer = BigqueryUsageTransformer() 27 | transformer.init(config) 28 | 29 | key = TableColumnUsageTuple(database=TestBigQueryUsageTransform.DATABASE, 30 | cluster=TestBigQueryUsageTransform.CLUSTER, 31 | schema=TestBigQueryUsageTransform.DATASET, 32 | table=TestBigQueryUsageTransform.TABLE, 33 | column=TestBigQueryUsageTransform.COLUMN, 34 | email=TestBigQueryUsageTransform.EMAIL) 35 | 36 | t1 = (key, TestBigQueryUsageTransform.READ_COUNT) 37 | xformed = transformer.transform(t1) 38 | 39 | assert xformed is not None 40 | self.assertIsInstance(xformed, TableColumnUsage) 41 | col_readers = list(xformed.col_readers) 42 | self.assertEqual(len(col_readers), 1) 43 | col_reader = col_readers[0] 44 | self.assertEqual(col_reader.cluster, TestBigQueryUsageTransform.CLUSTER) 45 | self.assertEqual(col_reader.database, TestBigQueryUsageTransform.DATABASE) 46 | self.assertEqual(col_reader.schema, TestBigQueryUsageTransform.DATASET) 47 | self.assertEqual(col_reader.table, TestBigQueryUsageTransform.TABLE) 48 | self.assertEqual(col_reader.column, TestBigQueryUsageTransform.COLUMN) 49 | self.assertEqual(col_reader.user_email, TestBigQueryUsageTransform.EMAIL) 50 | self.assertEqual(col_reader.read_count, TestBigQueryUsageTransform.READ_COUNT) 51 | 52 | def test_scope(self) -> None: 53 | config = ConfigFactory.from_dict({}) 54 | 55 | transformer = BigqueryUsageTransformer() 56 | transformer.init(config) 57 | 58 | self.assertEqual(transformer.get_scope(), 'transformer.bigquery_usage') 59 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/mode_analytics/batch/test_mode_dashboard_charts_batch_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | from mock import patch 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.dashboard.mode_analytics.batch.\ 10 | mode_dashboard_charts_batch_extractor import ModeDashboardChartsBatchExtractor 11 | 12 | 13 | class TestModeDashboardChartsBatchExtractor(unittest.TestCase): 14 | def setUp(self) -> None: 15 | config = ConfigFactory.from_dict({ 16 | 'extractor.mode_dashboard_chart_batch.organization': 'amundsen', 17 | 'extractor.mode_dashboard_chart_batch.mode_user_token': 'amundsen_user_token', 18 | 'extractor.mode_dashboard_chart_batch.mode_password_token': 'amundsen_password_token', 19 | 'extractor.mode_dashboard_chart_batch.mode_bearer_token': 'amundsen_bearer_token', 20 | }) 21 | self.config = config 22 | 23 | def test_dashboard_chart_extractor_empty_record(self) -> None: 24 | extractor = ModeDashboardChartsBatchExtractor() 25 | extractor.init(Scoped.get_scoped_conf(conf=self.config, scope=extractor.get_scope())) 26 | 27 | with patch('databuilder.rest_api.rest_api_query.requests.get'): 28 | record = extractor.extract() 29 | self.assertIsNone(record) 30 | 31 | def test_dashboard_chart_extractor_actual_record(self) -> None: 32 | extractor = ModeDashboardChartsBatchExtractor() 33 | extractor.init(Scoped.get_scoped_conf(conf=self.config, scope=extractor.get_scope())) 34 | 35 | with patch('databuilder.extractor.restapi.rest_api_extractor.RestAPIExtractor.extract') as mock_get: 36 | mock_get.return_value = { 37 | 'organization': 'amundsen', 38 | 'is_active': None, 39 | 'updated_at': None, 40 | 'do_not_update_empty_attribute': True, 41 | 'dashboard_group_id': 'ggg', 42 | 'dashboard_id': 'ddd', 43 | 'query_id': 'yyy', 44 | 'chart_id': 'xxx', 45 | 'chart_name': 'some chart', 46 | 'chart_type': 'bigNumber', 47 | 'product': 'mode' 48 | } 49 | 50 | record = extractor.extract() 51 | self.assertEqual(record._dashboard_group_id, 'ggg') 52 | self.assertEqual(record._dashboard_id, 'ddd') 53 | self.assertEqual(record._chart_name, 'some chart') 54 | self.assertEqual(record._product, 'mode') 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /databuilder/task/task.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.base_extractor import Extractor 10 | from databuilder.loader.base_loader import Loader 11 | from databuilder.task.base_task import Task 12 | from databuilder.transformer.base_transformer import Transformer 13 | from databuilder.transformer.base_transformer \ 14 | import NoopTransformer 15 | from databuilder.utils.closer import Closer 16 | 17 | 18 | LOGGER = logging.getLogger(__name__) 19 | 20 | 21 | class DefaultTask(Task): 22 | """ 23 | A default task expecting to extract, transform and load. 
24 | 25 | """ 26 | 27 | # Determines the frequency of the log on task progress 28 | PROGRESS_REPORT_FREQUENCY = 'progress_report_frequency' 29 | 30 | def __init__(self, 31 | extractor: Extractor, 32 | loader: Loader, 33 | transformer: Transformer = NoopTransformer()) -> None: 34 | self.extractor = extractor 35 | self.transformer = transformer 36 | self.loader = loader 37 | 38 | self._closer = Closer() 39 | self._closer.register(self.extractor.close) 40 | self._closer.register(self.transformer.close) 41 | self._closer.register(self.loader.close) 42 | 43 | def init(self, conf: ConfigTree) -> None: 44 | self._progress_report_frequency = \ 45 | conf.get_int('{}.{}'.format(self.get_scope(), DefaultTask.PROGRESS_REPORT_FREQUENCY), 500) 46 | 47 | self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope())) 48 | self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope())) 49 | self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope())) 50 | 51 | def run(self) -> None: 52 | """ 53 | Runs a task 54 | :return: 55 | """ 56 | LOGGER.info('Running a task') 57 | try: 58 | record = self.extractor.extract() 59 | count = 1 60 | while record: 61 | record = self.transformer.transform(record) 62 | if not record: 63 | record = self.extractor.extract() 64 | continue 65 | self.loader.load(record) 66 | record = self.extractor.extract() 67 | count += 1 68 | if count > 0 and count % self._progress_report_frequency == 0: 69 | LOGGER.info('Extracted {} records so far'.format(count)) 70 | 71 | finally: 72 | self._closer.close() 73 | -------------------------------------------------------------------------------- /databuilder/models/schema/schema.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import Any, Optional, Union, Iterator 5 | 6 | from databuilder.models.graph_serializable import GraphSerializable 7 | from databuilder.models.schema.schema_constant import SCHEMA_NODE_LABEL, SCHEMA_NAME_ATTR 8 | from databuilder.models.table_metadata import DescriptionMetadata 9 | from databuilder.models.graph_node import GraphNode 10 | from databuilder.models.graph_relationship import GraphRelationship 11 | 12 | 13 | class SchemaModel(GraphSerializable): 14 | 15 | def __init__(self, 16 | schema_key: str, 17 | schema: str, 18 | description: Optional[str] = None, 19 | description_source: Optional[str] = None, 20 | **kwargs: Any 21 | ) -> None: 22 | self._schema_key = schema_key 23 | self._schema = schema 24 | self._description = DescriptionMetadata.create_description_metadata(text=description, 25 | source=description_source) \ 26 | if description else None 27 | self._node_iterator = self._create_node_iterator() 28 | self._relation_iterator = self._create_relation_iterator() 29 | 30 | def create_next_node(self) -> Union[GraphNode, None]: 31 | try: 32 | return next(self._node_iterator) 33 | except StopIteration: 34 | return None 35 | 36 | def _create_node_iterator(self) -> Iterator[GraphNode]: 37 | node = GraphNode( 38 | key=self._schema_key, 39 | label=SCHEMA_NODE_LABEL, 40 | attributes={ 41 | SCHEMA_NAME_ATTR: self._schema, 42 | } 43 | ) 44 | yield node 45 | 46 | if self._description: 47 | yield self._description.get_node(self._get_description_node_key()) 48 | 49 | def create_next_relation(self) -> Union[GraphRelationship, None]: 50 | try: 51 | return next(self._relation_iterator) 52 | except StopIteration: 53 | return None 54 | 55 | def _get_description_node_key(self) -> str: 56 | desc = self._description.get_description_id() if self._description is not None else '' 57 | return '{}/{}'.format(self._schema_key, desc) 58 | 59 | def _create_relation_iterator(self) -> Iterator[GraphRelationship]: 60 | if self._description: 61 | yield self._description.get_relation(start_node=SCHEMA_NODE_LABEL, 62 | start_key=self._schema_key, 63 | end_key=self._get_description_node_key()) 64 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_table_tag_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.table_tag_transformer import TableTagTransformer 9 | from databuilder.models.table_metadata import TableMetadata 10 | 11 | 12 | class TestTableTagTransformer(unittest.TestCase): 13 | def test_single_tag(self) -> None: 14 | transformer = TableTagTransformer() 15 | config = ConfigFactory.from_dict({ 16 | TableTagTransformer.TAGS: 'foo', 17 | }) 18 | transformer.init(conf=config) 19 | 20 | result = transformer.transform(TableMetadata( 21 | database='test_db', 22 | cluster='test_cluster', 23 | schema='test_schema', 24 | name='test_table', 25 | description='', 26 | )) 27 | 28 | self.assertEqual(result.tags, ['foo']) 29 | 30 | def test_multiple_tags_comma_delimited(self) -> None: 31 | transformer = TableTagTransformer() 32 | config = ConfigFactory.from_dict({ 33 | TableTagTransformer.TAGS: 'foo,bar', 34 | }) 35 | transformer.init(conf=config) 36 | 37 | result = transformer.transform(TableMetadata( 38 | database='test_db', 39 | cluster='test_cluster', 40 | schema='test_schema', 41 | name='test_table', 42 | description='', 43 | )) 44 | 45 | self.assertEqual(result.tags, ['foo', 'bar']) 46 | 47 | def test_add_tag_to_existing_tags(self) -> None: 48 | transformer = TableTagTransformer() 49 | config = ConfigFactory.from_dict({ 50 | TableTagTransformer.TAGS: 'baz', 51 | }) 52 | transformer.init(conf=config) 53 | 54 | result = transformer.transform(TableMetadata( 55 | database='test_db', 56 | cluster='test_cluster', 57 | schema='test_schema', 58 | name='test_table', 59 | description='', 60 | tags='foo,bar', 61 | )) 62 | self.assertEqual(result.tags, ['foo', 'bar', 'baz']) 63 | 64 | def test_tags_not_added_to_other_objects(self) -> None: 65 | transformer = TableTagTransformer() 66 | config = ConfigFactory.from_dict({ 67 | TableTagTransformer.TAGS: 'new_tag', 68 | }) 69 | transformer.init(conf=config) 70 | 71 | class NotATable(): 72 | tags = 'existing_tag' 73 | 74 | result = transformer.transform(NotATable()) 75 | 76 | self.assertEqual(result.tags, 'existing_tag') 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /databuilder/extractor/db_api_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | import logging 6 | from typing import Iterable, Any 7 | 8 | from pyhocon import ConfigTree 9 | 10 | from databuilder.extractor.base_extractor import Extractor 11 | 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | class DBAPIExtractor(Extractor): 17 | """ 18 | Generic DB API extractor. 19 | """ 20 | CONNECTION_CONFIG_KEY = 'connection' 21 | SQL_CONFIG_KEY = 'sql' 22 | 23 | def init(self, conf: ConfigTree) -> None: 24 | """ 25 | Receives a {Connection} object and {sql} to execute. 
26 | An optional model class can be passed, in which case each sql result row 27 | is converted to a class instance and returned to the calling 28 | function 29 | :param conf: 30 | :return: 31 | """ 32 | self.conf = conf 33 | self.connection: Any = conf.get(DBAPIExtractor.CONNECTION_CONFIG_KEY) 34 | self.cursor = self.connection.cursor() 35 | self.sql = conf.get(DBAPIExtractor.SQL_CONFIG_KEY) 36 | 37 | model_class = conf.get('model_class', None) 38 | if model_class: 39 | module_name, class_name = model_class.rsplit(".", 1) 40 | mod = importlib.import_module(module_name) 41 | self.model_class = getattr(mod, class_name) 42 | 43 | self._iter = iter(self._execute_query()) 44 | 45 | def _execute_query(self) -> Iterable[Any]: 46 | """ 47 | Use cursor to execute the {sql} 48 | :return: 49 | """ 50 | LOGGER.info('Executing query: \n{}'.format(self.sql)) 51 | self.cursor.execute(self.sql) 52 | return self.cursor.fetchall() 53 | 54 | def extract(self) -> Any: 55 | """ 56 | Fetch one sql result row, convert to {model_class} if specified before 57 | returning. 58 | :return: 59 | """ 60 | 61 | try: 62 | result = next(self._iter) 63 | except StopIteration: 64 | return None 65 | 66 | if hasattr(self, 'model_class'): 67 | obj = self.model_class(*result) 68 | return obj 69 | else: 70 | return result 71 | 72 | def close(self) -> None: 73 | """ 74 | close cursor and connection handlers 75 | :return: 76 | """ 77 | try: 78 | self.cursor.close() 79 | self.connection.close() 80 | except Exception as e: 81 | LOGGER.warning("Exception encountered while closing up connection handler: %s", e) 82 | 83 | def get_scope(self) -> str: 84 | return 'extractor.dbapi' 85 | -------------------------------------------------------------------------------- /tests/unit/models/test_dashboard_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.dashboard_elasticsearch_document import DashboardESDocument 8 | 9 | 10 | class TestDashboardElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | test_obj = DashboardESDocument(group_name='test_dashboard_group', 17 | name='test_dashboard_name', 18 | description='test_description', 19 | product='mode', 20 | cluster='gold', 21 | group_description='work space group', 22 | query_names=['query1'], 23 | chart_names=['chart1'], 24 | group_url='mode_group_url', 25 | url='mode_report_url', 26 | uri='mode_dashboard://gold.cluster/dashboard_group/dashboard', 27 | last_successful_run_timestamp=10, 28 | total_usage=10, 29 | tags=['test'], 30 | badges=['test_badge']) 31 | 32 | expected_document_dict = {"group_name": "test_dashboard_group", 33 | "name": "test_dashboard_name", 34 | "description": "test_description", 35 | "product": "mode", 36 | "cluster": "gold", 37 | "group_url": "mode_group_url", 38 | "url": "mode_report_url", 39 | "uri": "mode_dashboard://gold.cluster/dashboard_group/dashboard", 40 | "query_names": ['query1'], 41 | "chart_names": ['chart1'], 42 | "last_successful_run_timestamp": 10, 43 | "group_description": "work space group", 44 | "total_usage": 10, 45 | "tags": ["test"], 46 | "badges": ["test_badge"], 47 | 48 | } 49 | 50 | result = test_obj.to_json() 51 | results = result.split("\n") 52 | 53 | # verify two new line characters in result 54 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 55 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 56 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.table_elasticsearch_document import TableESDocument 8 | 9 | 10 | class TestTableElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | test_obj = TableESDocument(database='test_database', 17 | cluster='test_cluster', 18 | schema='test_schema', 19 | name='test_table', 20 | key='test_table_key', 21 | last_updated_timestamp=123456789, 22 | description='test_table_description', 23 | column_names=['test_col1', 'test_col2'], 24 | column_descriptions=['test_description1', 'test_description2'], 25 | total_usage=100, 26 | unique_usage=10, 27 | tags=['test'], 28 | programmatic_descriptions=['test'], 29 | badges=['badge1'], 30 | schema_description='schema description') 31 | 32 | expected_document_dict = {"database": "test_database", 33 | "cluster": "test_cluster", 34 | "schema": "test_schema", 35 | "name": "test_table", 36 | "display_name": "test_schema.test_table", 37 | "key": "test_table_key", 38 | "last_updated_timestamp": 123456789, 39 | "description": "test_table_description", 40 | "column_names": ["test_col1", "test_col2"], 41 | "column_descriptions": ["test_description1", "test_description2"], 42 | "total_usage": 100, 43 | "unique_usage": 10, 44 | "tags": ["test"], 45 | "programmatic_descriptions": ['test'], 46 | "badges": ["badge1"], 47 | 'schema_description': 'schema description' 48 | } 49 | 50 | result = test_obj.to_json() 51 | results = result.split("\n") 52 | 53 | # verify two new line characters in result 54 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 55 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 56 | -------------------------------------------------------------------------------- /databuilder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree, ConfigFactory 7 | 8 | 9 | class Scoped(object, metaclass=abc.ABCMeta): 10 | _EMPTY_CONFIG = ConfigFactory.from_dict({}) 11 | """ 12 | An interface for classes that work with scoped (nested) config. 13 | https://github.com/chimpler/pyhocon 14 | A scoped instance will use config within its scope. This is a way to 15 | distribute configuration to its implementation instead of having it in 16 | one central place. 17 | This is very useful for DataBuilder as it has different components 18 | (extractor, transformer, loader, publisher) and each component 19 | can have multiple implementations. 20 | For example, these could be configurations for two different extractors: 21 | "extractor.mysql.url" for MySQLExtractor 22 | "extractor.filesystem.source_path" for FileSystemExtractor 23 | 24 | For MySQLExtractor, if you define the scope as "extractor.mysql", the scoped 25 | config is reduced to the config that is only for MySQL. 26 | config.get("extractor.mysql") provides all the config within 27 | 'extractor.mysql'. By removing the outer context from the config, 28 | MySQLExtractor is highly reusable. 29 | """ 30 | 31 | @abc.abstractmethod 32 | def init(self, conf: ConfigTree) -> None: 33 | """ 34 | All scoped instances are expected to be lazily initialized, meaning that 35 | __init__ should not perform any heavy operation such as a service call.
36 |         The reason behind this design is that Databuilder code doubles as its
37 |         configuration: you can instantiate a scoped instance with all the
38 |         parameters already set, ready to run, while the actual execution
39 |         only happens once init() is called and the component is
40 |         executed.
41 | 
42 |         :param conf: Typesafe config instance
43 |         :return: None
44 |         """
45 |         pass
46 | 
47 |     @abc.abstractmethod
48 |     def get_scope(self) -> str:
49 |         """
50 |         A scope for the config. Typesafe config supports nested config;
51 |         the scope string is used to peel off the nested config.
52 |         :return: the scope string
53 |         """
54 |         return ''
55 | 
56 |     def close(self) -> None:
57 |         """
58 |         Anything that needs to be cleaned up after the use of the instance.
59 |         :return: None
60 |         """
61 |         pass
62 | 
63 |     @classmethod
64 |     def get_scoped_conf(cls, conf: ConfigTree, scope: str) -> ConfigTree:
65 |         """
66 |         Convenience method to provide a scoped config.
67 | 
68 |         :param conf: Typesafe config instance
69 |         :param scope: scope string
70 |         :return: Typesafe config instance, reduced to the given scope
71 |         """
72 |         if not scope:
73 |             return Scoped._EMPTY_CONFIG
74 | 
75 |         return conf.get(scope, Scoped._EMPTY_CONFIG)
76 | 
-------------------------------------------------------------------------------- /databuilder/extractor/redshift_metadata_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | from pyhocon import ConfigFactory, ConfigTree  # noqa: F401
5 | from typing import Iterator, Union, Dict, Any  # noqa: F401
6 | 
7 | from databuilder.extractor.base_postgres_metadata_extractor import BasePostgresMetadataExtractor
8 | 
9 | 
10 | class RedshiftMetadataExtractor(BasePostgresMetadataExtractor):
11 |     """
12 |     Extracts Redshift table and column metadata from the underlying meta store database using SQLAlchemyExtractor
13 | 
14 |     This differs from the PostgresMetadataExtractor because, in order to support Redshift's late binding views,
15 |     we need to join the INFORMATION_SCHEMA data against the function PG_GET_LATE_BINDING_VIEW_COLS().
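    Example (an illustrative sketch; the exact config keys are an assumption,
    following the BasePostgresMetadataExtractor conventions):

        conf = ConfigFactory.from_dict({
            'extractor.redshift_metadata.where_clause_suffix': "where c.table_schema = 'public'",
            'extractor.redshift_metadata.use_catalog_as_cluster_name': True,
        })
        extractor = RedshiftMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))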
16 | """ 17 | 18 | def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): 19 | # type: (bool, str) -> str 20 | if use_catalog_as_cluster_name: 21 | cluster_source = "CURRENT_DATABASE()" 22 | else: 23 | cluster_source = "'{}'".format(self._cluster) 24 | 25 | return """ 26 | SELECT 27 | * 28 | FROM ( 29 | SELECT 30 | {cluster_source} as cluster, 31 | c.table_schema as schema, 32 | c.table_name as name, 33 | pgtd.description as description, 34 | c.column_name as col_name, 35 | c.data_type as col_type, 36 | pgcd.description as col_description, 37 | ordinal_position as col_sort_order 38 | FROM INFORMATION_SCHEMA.COLUMNS c 39 | INNER JOIN 40 | pg_catalog.pg_statio_all_tables as st on c.table_schema=st.schemaname and c.table_name=st.relname 41 | LEFT JOIN 42 | pg_catalog.pg_description pgcd on pgcd.objoid=st.relid and pgcd.objsubid=c.ordinal_position 43 | LEFT JOIN 44 | pg_catalog.pg_description pgtd on pgtd.objoid=st.relid and pgtd.objsubid=0 45 | 46 | UNION 47 | 48 | SELECT 49 | {cluster_source} as cluster, 50 | view_schema as schema, 51 | view_name as name, 52 | NULL as description, 53 | column_name as col_name, 54 | data_type as col_type, 55 | NULL as col_description, 56 | ordinal_position as col_sort_order 57 | FROM 58 | PG_GET_LATE_BINDING_VIEW_COLS() 59 | COLS(view_schema NAME, view_name NAME, column_name NAME, data_type VARCHAR, ordinal_position INT) 60 | ) 61 | 62 | {where_clause_suffix} 63 | ORDER by cluster, schema, name, col_sort_order ; 64 | """.format( 65 | cluster_source=cluster_source, 66 | where_clause_suffix=where_clause_suffix, 67 | ) 68 | 69 | def get_scope(self): 70 | # type: () -> str 71 | return 'extractor.redshift_metadata' 72 | -------------------------------------------------------------------------------- /databuilder/rest_api/mode_analytics/mode_paginated_rest_api_query.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | import requests 7 | from jsonpath_rw import parse 8 | from typing import Any, Dict 9 | 10 | from databuilder.rest_api.rest_api_query import RestApiQuery 11 | 12 | # How many records considers as full and indicating there might be next page? In list reports on space API, it's 30. 13 | DEFAULT_MAX_RECORD_SIZE = 30 14 | PAGE_SUFFIX_TEMPLATE = '?page={}' 15 | LIST_REPORTS_PAGINATION_JSON_PATH = '_embedded.reports[*]' # So far this is the only paginated API that we need. 16 | 17 | LOGGER = logging.getLogger(__name__) 18 | 19 | 20 | class ModePaginatedRestApiQuery(RestApiQuery): 21 | """ 22 | Certain API such as get list of reports on a space is paginated with query term page. 23 | https://mode.com/developer/api-cookbook/management/get-all-reports/ 24 | 25 | This subclass makes sure to detect if there's more page and update URL to get next page. 26 | """ 27 | 28 | def __init__(self, 29 | pagination_json_path: str = LIST_REPORTS_PAGINATION_JSON_PATH, 30 | max_record_size: int = DEFAULT_MAX_RECORD_SIZE, 31 | **kwargs: Any 32 | ): 33 | # type (...) 
34 |         super(ModePaginatedRestApiQuery, self).__init__(**kwargs)
35 | 
36 |         self._original_url = self._url
37 |         self._max_record_size = max_record_size
38 |         self._current_page = 1
39 |         self._pagination_jsonpath_expr = parse(pagination_json_path)
40 | 
41 |     def _preprocess_url(self,
42 |                         record: Dict[str, Any],
43 |                         ) -> str:
44 |         """
45 |         Updates the URL with page information
46 |         :param record: the record whose fields are substituted into the URL template
47 |         :return: a URL that is ready to be called.
48 |         """
49 |         page_suffix = PAGE_SUFFIX_TEMPLATE.format(self._current_page)  # example: ?page=2
50 | 
51 |         # example: http://foo.bar/resources?page=2
52 |         self._url = self._original_url + page_suffix
53 |         return self._url.format(**record)
54 | 
55 |     def _post_process(self,
56 |                       response: requests.Response,
57 |                       ) -> None:
58 |         """
59 |         Updates the pagination trigger (self._more_pages) as well as the current page (self._current_page).
60 |         Mode has no explicit "more pages" indicator; a full page of records (at least self._max_record_size)
61 |         implies that there may be more records on the next page.
62 |         :return: None
63 |         """
64 | 
65 |         result_list = [match.value for match in self._pagination_jsonpath_expr.find(response.json())]
66 | 
67 |         if result_list and len(result_list) >= self._max_record_size:
68 |             self._more_pages = True
69 |             self._current_page = self._current_page + 1
70 |             return
71 | 
72 |         self._more_pages = False
73 |         self._current_page = 1
74 | 
-------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_successful_executions_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import logging
5 | 
6 | from pyhocon import ConfigTree, ConfigFactory
7 | 
8 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import \
9 |     ModeDashboardExecutionsExtractor
10 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils
11 | from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT
12 | from databuilder.models.dashboard.dashboard_execution import DashboardExecution
13 | from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery
14 | from databuilder.rest_api.rest_api_query import RestApiQuery
15 | 
16 | LOGGER = logging.getLogger(__name__)
17 | 
18 | 
19 | class ModeDashboardLastSuccessfulExecutionExtractor(ModeDashboardExecutionsExtractor):
20 |     """
21 |     An Extractor that extracts a Mode dashboard's last successful run (execution) timestamp.
22 | 
23 |     """
24 | 
25 |     def __init__(self) -> None:
26 |         super(ModeDashboardLastSuccessfulExecutionExtractor, self).__init__()
27 | 
28 |     def init(self, conf: ConfigTree) -> None:
29 |         conf = conf.with_fallback(
30 |             ConfigFactory.from_dict({
31 |                 STATIC_RECORD_DICT: {'product': 'mode',
32 |                                      'execution_state': 'succeeded',
33 |                                      'execution_id': DashboardExecution.LAST_SUCCESSFUL_EXECUTION_ID}
34 |             })
35 |         )
36 |         super(ModeDashboardLastSuccessfulExecutionExtractor, self).init(conf)
37 | 
38 |     def get_scope(self) -> str:
39 |         return 'extractor.mode_dashboard_last_successful_execution'
40 | 
41 |     def _build_restapi_query(self) -> RestApiQuery:
42 |         """
43 |         Build the REST API query. To get a Mode dashboard's last successful execution, it needs to call two APIs
44 |         (the spaces API and the reports API) and join their results.
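        Illustrative flow (the values are hypothetical): the spaces query yields
        records such as {'organization': 'acme', 'dashboard_group_id': 'space_1'},
        and each record is formatted into the reports URL below, with the page
        suffix appended by ModePaginatedRestApiQuery, e.g.:

            https://app.mode.com/api/acme/spaces/space_1/reports?page=1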
45 |         :return: A RestApiQuery that provides a Mode dashboard's last successful execution (run)
46 |         """
47 | 
48 |         spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
49 |         params = ModeDashboardUtils.get_auth_params(conf=self._conf)
50 | 
51 |         # Reports
52 |         # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
53 |         url = 'https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports'
54 |         json_path = '_embedded.reports[*].[token,last_successfully_run_at]'
55 |         field_names = ['dashboard_id', 'execution_timestamp']
56 |         last_successful_run_query = ModePaginatedRestApiQuery(query_to_join=spaces_query, url=url, params=params,
57 |                                                               json_path=json_path, field_names=field_names,
58 |                                                               skip_no_result=True)
59 | 
60 |         return last_successful_run_query
61 | 
-------------------------------------------------------------------------------- /databuilder/models/dashboard/dashboard_owner.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import logging
5 | 
6 | from typing import Optional, Any, Union, Iterator
7 | 
8 | from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata
9 | from databuilder.models.graph_serializable import (
10 |     GraphSerializable)
11 | from databuilder.models.owner_constants import OWNER_OF_OBJECT_RELATION_TYPE, OWNER_RELATION_TYPE
12 | from databuilder.models.user import User
13 | 
14 | from databuilder.models.graph_node import GraphNode
15 | from databuilder.models.graph_relationship import GraphRelationship
16 | 
17 | LOGGER = logging.getLogger(__name__)
18 | 
19 | 
20 | class DashboardOwner(GraphSerializable):
21 |     """
22 |     A model that encapsulates a Dashboard's owner.
23 |     Note that it does not create a new User node, as it has insufficient information about the user; it only
24 |     builds the relation between the User and the Dashboard.
25 |     """
26 | 
27 |     DASHBOARD_EXECUTION_RELATION_TYPE = 'LAST_EXECUTED'
28 |     EXECUTION_DASHBOARD_RELATION_TYPE = 'LAST_EXECUTION_OF'  # note: these two execution constants are not referenced in this model
29 | 
30 |     def __init__(self,
31 |                  dashboard_group_id: str,
32 |                  dashboard_id: str,
33 |                  email: str,
34 |                  product: Optional[str] = '',
35 |                  cluster: str = 'gold',
36 |                  **kwargs: Any
37 |                  ) -> None:
38 |         self._dashboard_group_id = dashboard_group_id
39 |         self._dashboard_id = dashboard_id
40 |         self._email = email
41 |         self._product = product
42 |         self._cluster = cluster
43 | 
44 |         self._relation_iterator = self._create_relation_iterator()
45 | 
46 |     def create_next_node(self) -> Union[GraphNode, None]:
47 |         return None
48 | 
49 |     def create_next_relation(self) -> Union[GraphRelationship, None]:
50 |         try:
51 |             return next(self._relation_iterator)
52 |         except StopIteration:
53 |             return None
54 | 
55 |     def _create_relation_iterator(self) -> Iterator[GraphRelationship]:
56 |         relationship = GraphRelationship(
57 |             start_label=DashboardMetadata.DASHBOARD_NODE_LABEL,
58 |             end_label=User.USER_NODE_LABEL,
59 |             start_key=DashboardMetadata.DASHBOARD_KEY_FORMAT.format(
60 |                 product=self._product,
61 |                 cluster=self._cluster,
62 |                 dashboard_group=self._dashboard_group_id,
63 |                 dashboard_name=self._dashboard_id
64 |             ),
65 |             end_key=User.get_user_model_key(email=self._email),
66 |             type=OWNER_RELATION_TYPE,
67 |             reverse_type=OWNER_OF_OBJECT_RELATION_TYPE,
68 |             attributes={}
69 |         )
70 |         yield relationship
71 | 
72 |     def __repr__(self) -> str:
73 |         return 'DashboardOwner({!r}, {!r}, {!r}, {!r}, {!r})'.format(
74 |             self._dashboard_group_id,
75 |             self._dashboard_id,
76 |             self._email,
77 |             self._product,
78 |             self._cluster
79 |         )
80 | 
-------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_query_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
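# (Illustrative sketch of the DashboardOwner model above, not part of the original
# files; the argument values are hypothetical. It emits no nodes and a single
# OWNER/OWNER_OF relationship:
#
#     owner = DashboardOwner(dashboard_group_id='dg', dashboard_id='d1',
#                            email='foo@bar.com', product='mode')
#     owner.create_next_node()       # -> None: no node is created for the owner
#     rel = owner.create_next_relation()
#     # rel.start_key == 'mode_dashboard://gold.dg/d1'  (DASHBOARD_KEY_FORMAT, default cluster 'gold')
#     # rel.end_key is derived via User.get_user_model_key(email='foo@bar.com'))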
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import unittest 6 | from typing import Any, Dict 7 | 8 | from mock import patch 9 | from pyhocon import ConfigFactory 10 | 11 | from databuilder import Scoped 12 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_query_extractor import TableauDashboardQueryExtractor 13 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ 14 | import TableauDashboardAuth, TableauGraphQLApiExtractor 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | 20 | def mock_query(*_args: Any, **_kwargs: Any) -> Dict[str, Any]: 21 | return { 22 | 'customSQLTables': [ 23 | { 24 | 'id': 'fake-query-id', 25 | 'name': 'Test Query', 26 | 'query': 'SELECT * FROM foo', 27 | 'downstreamWorkbooks': [ 28 | { 29 | 'name': 'Test Workbook', 30 | 'projectName': 'Test Project' 31 | } 32 | ] 33 | } 34 | ] 35 | } 36 | 37 | 38 | def mock_token(*_args: Any, **_kwargs: Any) -> str: 39 | return '123-abc' 40 | 41 | 42 | class TestTableauDashboardQuery(unittest.TestCase): 43 | 44 | @patch.object(TableauDashboardAuth, '_authenticate', mock_token) 45 | @patch.object(TableauGraphQLApiExtractor, 'execute_query', mock_query) 46 | def test_dashboard_query_extractor(self) -> None: 47 | 48 | config = ConfigFactory.from_dict({ 49 | 'extractor.tableau_dashboard_query.api_base_url': 'api_base_url', 50 | 'extractor.tableau_dashboard_query.api_version': 'tableau_api_version', 51 | 'extractor.tableau_dashboard_query.site_name': 'tableau_site_name', 52 | 'extractor.tableau_dashboard_query.tableau_personal_access_token_name': 53 | 'tableau_personal_access_token_name', 54 | 'extractor.tableau_dashboard_query.tableau_personal_access_token_secret': 55 | 'tableau_personal_access_token_secret', 56 | 'extractor.tableau_dashboard_query.excluded_projects': [], 57 | 'extractor.tableau_dashboard_query.cluster': 'tableau_dashboard_cluster', 58 | 'extractor.tableau_dashboard_query.database': 'tableau_dashboard_database', 59 | 'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format': 60 | '%Y-%m-%dT%H:%M:%SZ', 61 | 62 | }) 63 | 64 | extractor = TableauDashboardQueryExtractor() 65 | extractor.init(Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope())) 66 | record = extractor.extract() 67 | 68 | self.assertEqual(record._query_name, 'Test Query') 69 | self.assertEqual(record._query_text, 'SELECT * FROM foo') 70 | self.assertEqual(record._dashboard_id, 'Test Workbook') 71 | self.assertEqual(record._dashboard_group_id, 'Test Project') 72 | 73 | 74 | if __name__ == '__main__': 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_lineage.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from databuilder.models.table_lineage import TableLineage
7 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
8 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
9 | from databuilder.serializers import neo4_serializer
10 | 
11 | 
12 | DB = 'hive'
13 | SCHEMA = 'base'
14 | TABLE = 'test'
15 | CLUSTER = 'default'
16 | 
17 | 
18 | class TestTableLineage(unittest.TestCase):
19 | 
20 |     def setUp(self) -> None:
21 |         super(TestTableLineage, self).setUp()
22 |         self.table_lineage = TableLineage(db_name=DB,
23 |                                           schema=SCHEMA,
24 |                                           table_name=TABLE,
25 |                                           cluster=CLUSTER,
26 |                                           downstream_deps=['hive://default.test_schema/test_table1',
27 |                                                            'hive://default.test_schema/test_table2'])
28 | 
29 |     def test_get_table_model_key(self) -> None:
30 |         metadata = self.table_lineage.get_table_model_key(db=DB,
31 |                                                           cluster=CLUSTER,
32 |                                                           schema=SCHEMA,
33 |                                                           table=TABLE)
34 |         self.assertEqual(metadata, 'hive://default.base/test')
35 | 
36 |     def test_create_nodes(self) -> None:
37 |         nodes = self.table_lineage.create_nodes()
38 |         self.assertEqual(len(nodes), 0)
39 | 
40 |     def test_create_relation(self) -> None:
41 |         relations = self.table_lineage.create_relation()
42 |         self.assertEqual(len(relations), 2)
43 | 
44 |         start_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
45 |                                                              schema=SCHEMA,
46 |                                                              tbl=TABLE,
47 |                                                              cluster=CLUSTER)
48 |         end_key1 = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
49 |                                                             schema='test_schema',
50 |                                                             tbl='test_table1',
51 |                                                             cluster=CLUSTER)
52 | 
53 |         relation = {
54 |             RELATION_START_KEY: start_key,
55 |             RELATION_START_LABEL: 'Table',
56 |             RELATION_END_KEY: end_key1,
57 |             RELATION_END_LABEL: 'Table',
58 |             RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
59 |             RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE
60 |         }
61 |         actual_relations = [
62 |             neo4_serializer.serialize_relationship(relation)
63 |             for relation in relations
64 |         ]
65 |         self.assertEqual(len(actual_relations), 2)
66 |         self.assertIn(relation, actual_relations)
67 | 
-------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_last_modified_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
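# (Illustrative sketch mirroring the test above, not part of the original files:
# TableLineage emits no nodes, only origin -> downstream relations.
#
#     lineage = TableLineage(db_name='hive', schema='base', table_name='test',
#                            cluster='default',
#                            downstream_deps=['hive://default.test_schema/test_table1'])
#     rel = lineage.create_relation()[0]
#     serialized = neo4_serializer.serialize_relationship(rel)
#     # serialized[RELATION_START_KEY] == 'hive://default.base/test'
#     # serialized[RELATION_END_KEY] == 'hive://default.test_schema/test_table1')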
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import unittest 6 | from typing import Any, Dict 7 | 8 | from mock import patch 9 | from pyhocon import ConfigFactory 10 | 11 | from databuilder import Scoped 12 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_last_modified_extractor \ 13 | import TableauDashboardLastModifiedExtractor 14 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ 15 | import TableauDashboardAuth, TableauGraphQLApiExtractor 16 | 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | 21 | def mock_query(*_args: Any, **_kwargs: Any) -> Dict[str, Any]: 22 | return { 23 | 'workbooks': [ 24 | { 25 | 'id': 'fake-workbook-id', 26 | 'name': 'Test Workbook', 27 | 'projectName': 'Test Project', 28 | 'updatedAt': '2020-08-04T20:16:05Z' 29 | } 30 | ] 31 | } 32 | 33 | 34 | def mock_token(*_args: Any, **_kwargs: Any) -> str: 35 | return '123-abc' 36 | 37 | 38 | class TestTableauDashboardLastModified(unittest.TestCase): 39 | 40 | @patch.object(TableauDashboardAuth, '_authenticate', mock_token) 41 | @patch.object(TableauGraphQLApiExtractor, 'execute_query', mock_query) 42 | def test_dashboard_last_modified_extractor(self) -> None: 43 | 44 | config = ConfigFactory.from_dict({ 45 | 'extractor.tableau_dashboard_last_modified.api_base_url': 'api_base_url', 46 | 'extractor.tableau_dashboard_last_modified.api_version': 'tableau_api_version', 47 | 'extractor.tableau_dashboard_last_modified.site_name': 'tableau_site_name', 48 | 'extractor.tableau_dashboard_last_modified.tableau_personal_access_token_name': 49 | 'tableau_personal_access_token_name', 50 | 'extractor.tableau_dashboard_last_modified.tableau_personal_access_token_secret': 51 | 'tableau_personal_access_token_secret', 52 | 'extractor.tableau_dashboard_last_modified.excluded_projects': [], 53 | 'extractor.tableau_dashboard_last_modified.cluster': 'tableau_dashboard_cluster', 54 | 'extractor.tableau_dashboard_last_modified.database': 'tableau_dashboard_database', 55 | 'extractor.tableau_dashboard_last_modified.transformer.timestamp_str_to_epoch.timestamp_format': 56 | '%Y-%m-%dT%H:%M:%SZ', 57 | 58 | }) 59 | 60 | extractor = TableauDashboardLastModifiedExtractor() 61 | extractor.init(Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope())) 62 | record = extractor.extract() 63 | 64 | self.assertEqual(record._dashboard_id, 'Test Workbook') 65 | self.assertEqual(record._dashboard_group_id, 'Test Project') 66 | self.assertEqual(record._product, 'tableau') 67 | self.assertEqual(record._cluster, 'tableau_dashboard_cluster') 68 | self.assertEqual(record._last_modified_timestamp, 1596572165) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_last_modified.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from typing import Any, Dict
7 | 
8 | from databuilder.models.dashboard.dashboard_last_modified import DashboardLastModifiedTimestamp
9 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
10 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
11 | from databuilder.serializers import neo4_serializer
12 | 
13 | 
14 | class TestDashboardLastModifiedTimestamp(unittest.TestCase):
15 | 
16 |     def test_dashboard_timestamp_nodes(self) -> None:
17 |         dashboard_last_modified = DashboardLastModifiedTimestamp(last_modified_timestamp=123456789,
18 |                                                                  cluster='cluster_id',
19 |                                                                  product='product_id',
20 |                                                                  dashboard_id='dashboard_id',
21 |                                                                  dashboard_group_id='dashboard_group_id')
22 | 
23 |         actual = dashboard_last_modified.create_next_node()
24 |         actual_serialized = neo4_serializer.serialize_node(actual)
25 |         expected: Dict[str, Any] = {
26 |             'timestamp:UNQUOTED': 123456789,
27 |             'name': 'last_updated_timestamp',
28 |             'KEY': 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id/_last_modified_timestamp',
29 |             'LABEL': 'Timestamp'
30 |         }
31 | 
32 |         assert actual is not None
33 |         self.assertDictEqual(actual_serialized, expected)
34 |         self.assertIsNone(dashboard_last_modified.create_next_node())
35 | 
36 |     def test_dashboard_last_modified_relations(self) -> None:
37 |         dashboard_last_modified = DashboardLastModifiedTimestamp(last_modified_timestamp=123456789,
38 |                                                                  cluster='cluster_id',
39 |                                                                  product='product_id',
40 |                                                                  dashboard_id='dashboard_id',
41 |                                                                  dashboard_group_id='dashboard_group_id')
42 | 
43 |         actual = dashboard_last_modified.create_next_relation()
44 |         actual_serialized = neo4_serializer.serialize_relationship(actual)
45 |         expected: Dict[str, Any] = {
46 |             RELATION_END_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id'
47 |                               '/_last_modified_timestamp',
48 |             RELATION_START_LABEL: 'Dashboard',
49 |             RELATION_END_LABEL: 'Timestamp',
50 |             RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id',
51 |             RELATION_TYPE: 'LAST_UPDATED_AT',
52 |             RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
53 |         }
54 | 
55 |         assert actual is not None
56 |         self.assertDictEqual(actual_serialized, expected)
57 |         self.assertIsNone(dashboard_last_modified.create_next_relation())
58 | 
-------------------------------------------------------------------------------- /tests/unit/models/test_table_stats.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
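# (Illustrative note, not part of the original file: the Timestamp node key asserted
# above is the dashboard key plus a '/_last_modified_timestamp' suffix, i.e.
#
#     '{product}_dashboard://{cluster}.{dashboard_group}/{dashboard}' + '/_last_modified_timestamp'
#
# which is why the node KEY and the relation END_KEY are both
# 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id/_last_modified_timestamp'.)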
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | from databuilder.models.table_stats import TableColumnStats
6 | 
7 | from databuilder.models.graph_serializable import NODE_KEY, \
8 |     NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
9 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
10 | from databuilder.serializers import neo4_serializer
11 | 
12 | 
13 | class TestTableStats(unittest.TestCase):
14 | 
15 |     def setUp(self) -> None:
16 |         super(TestTableStats, self).setUp()
17 |         self.table_stats = TableColumnStats(table_name='base.test',
18 |                                             col_name='col',
19 |                                             stat_name='avg',
20 |                                             stat_val='1',
21 |                                             start_epoch='1',
22 |                                             end_epoch='2',)
23 | 
24 |         self.expected_node_result = {
25 |             NODE_KEY: 'hive://gold.base/test/col/avg/',
26 |             NODE_LABEL: 'Stat',
27 |             'stat_val': '1',
28 |             'stat_name': 'avg',
29 |             'start_epoch': '1',
30 |             'end_epoch': '2',
31 |         }
32 | 
33 |         self.expected_relation_result = {
34 |             RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
35 |             RELATION_START_LABEL: 'Stat',
36 |             RELATION_END_KEY: 'hive://gold.base/test/col',
37 |             RELATION_END_LABEL: 'Column',
38 |             RELATION_TYPE: 'STAT_OF',
39 |             RELATION_REVERSE_TYPE: 'STAT'
40 |         }
41 | 
42 |     def test_get_table_stat_model_key(self) -> None:
43 |         table_stats = self.table_stats.get_table_stat_model_key()
44 |         self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')
45 | 
46 |     def test_get_col_key(self) -> None:
47 |         metadata = self.table_stats.get_col_key()
48 |         self.assertEqual(metadata, 'hive://gold.base/test/col')
49 | 
50 |     def test_create_nodes(self) -> None:
51 |         nodes = self.table_stats.create_nodes()
52 |         self.assertEqual(len(nodes), 1)
53 |         serialized_node = neo4_serializer.serialize_node(nodes[0])
54 |         self.assertEqual(serialized_node, self.expected_node_result)
55 | 
56 |     def test_create_relation(self) -> None:
57 |         relation = self.table_stats.create_relation()
58 | 
59 |         self.assertEqual(len(relation), 1)
60 |         serialized_relation = neo4_serializer.serialize_relationship(relation[0])
61 |         self.assertEqual(serialized_relation, self.expected_relation_result)
62 | 
63 |     def test_create_next_node(self) -> None:
64 |         next_node = self.table_stats.create_next_node()
65 |         serialized_node = neo4_serializer.serialize_node(next_node)
66 |         self.assertEqual(serialized_node, self.expected_node_result)
67 | 
68 |     def test_create_next_relation(self) -> None:
69 |         next_relation = self.table_stats.create_next_relation()
70 |         serialized_relation = neo4_serializer.serialize_relationship(next_relation)
71 |         self.assertEqual(serialized_relation, self.expected_relation_result)
72 | 
-------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_modified_timestamp_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
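# (Illustrative note, not part of the original file: TableColumnStats defaults the
# database to 'hive' and the cluster to 'gold', so table_name='base.test',
# col_name='col' and stat_name='avg' produce the Stat key
# 'hive://gold.base/test/col/avg/' asserted in the test above.)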
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import logging
5 | 
6 | from pyhocon import ConfigTree, ConfigFactory
7 | 
8 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import \
9 |     ModeDashboardExecutionsExtractor
10 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils
11 | from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT
12 | from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery
13 | from databuilder.rest_api.rest_api_query import RestApiQuery
14 | from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS
15 | from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME
16 | 
17 | LOGGER = logging.getLogger(__name__)
18 | 
19 | 
20 | class ModeDashboardLastModifiedTimestampExtractor(ModeDashboardExecutionsExtractor):
21 |     """
22 |     An Extractor that extracts a Mode dashboard's last modified timestamp.
23 | 
24 |     """
25 | 
26 |     def __init__(self) -> None:
27 |         super(ModeDashboardLastModifiedTimestampExtractor, self).__init__()
28 | 
29 |     def init(self, conf: ConfigTree) -> None:
30 |         conf = conf.with_fallback(
31 |             ConfigFactory.from_dict({
32 |                 STATIC_RECORD_DICT: {'product': 'mode'},
33 |                 '{}.{}'.format(DictToModel().get_scope(), MODEL_CLASS):
34 |                     'databuilder.models.dashboard.dashboard_last_modified.DashboardLastModifiedTimestamp',
35 |                 '{}.{}'.format(TimestampStringToEpoch().get_scope(), FIELD_NAME):
36 |                     'last_modified_timestamp'
37 |             })
38 |         )
39 |         super(ModeDashboardLastModifiedTimestampExtractor, self).init(conf)
40 | 
41 |     def get_scope(self) -> str:
42 |         return 'extractor.mode_dashboard_last_modified_timestamp_execution'
43 | 
44 |     def _build_restapi_query(self) -> RestApiQuery:
45 |         """
46 |         Build the REST API query. To get a Mode dashboard's last modified timestamp, it needs to call two APIs
47 |         (the spaces API and the reports API) and join their results.
48 |         :return: A RestApiQuery that provides a Mode dashboard's last modified timestamp
49 |         """
50 | 
51 |         spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
52 |         params = ModeDashboardUtils.get_auth_params(conf=self._conf)
53 | 
54 |         # Reports
55 |         # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
56 |         url = 'https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports'
57 |         json_path = '_embedded.reports[*].[token,edited_at]'
58 |         field_names = ['dashboard_id', 'last_modified_timestamp']
59 |         last_modified_query = ModePaginatedRestApiQuery(query_to_join=spaces_query, url=url, params=params,
60 |                                                         json_path=json_path, field_names=field_names,
61 |                                                         skip_no_result=True)
62 | 
63 |         return last_modified_query
64 | 
-------------------------------------------------------------------------------- /tests/unit/models/test_table_source.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
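# (Illustrative data flow for the extractor above; the record values are
# hypothetical: the REST API query emits records such as
# {'dashboard_id': 'abc123', 'last_modified_timestamp': '2020-08-04T20:16:05Z',
# 'product': 'mode', ...}; TimestampStringToEpoch converts the timestamp string
# into epoch seconds, and DictToModel then builds a DashboardLastModifiedTimestamp
# model from the dict, exactly as wired up in init().)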
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from databuilder.models.table_source import TableSource
7 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
8 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
9 | from databuilder.serializers import neo4_serializer
10 | 
11 | 
12 | DB = 'hive'
13 | SCHEMA = 'base'
14 | TABLE = 'test'
15 | CLUSTER = 'default'
16 | SOURCE = '/etl/sql/file.py'
17 | 
18 | 
19 | class TestTableSource(unittest.TestCase):
20 | 
21 |     def setUp(self) -> None:
22 |         super(TestTableSource, self).setUp()
23 |         self.table_source = TableSource(db_name=DB,
24 |                                         schema=SCHEMA,
25 |                                         table_name=TABLE,
26 |                                         cluster=CLUSTER,
27 |                                         source=SOURCE)
28 | 
29 |     def test_get_source_model_key(self) -> None:
30 |         source = self.table_source.get_source_model_key()
31 |         self.assertEqual(source, '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB,
32 |                                                                                   schema=SCHEMA,
33 |                                                                                   tbl=TABLE,
34 |                                                                                   cluster=CLUSTER,
35 |                                                                                   ))
36 | 
37 |     def test_get_metadata_model_key(self) -> None:
38 |         metadata = self.table_source.get_metadata_model_key()
39 |         self.assertEqual(metadata, 'hive://default.base/test')
40 | 
41 |     def test_create_nodes(self) -> None:
42 |         nodes = self.table_source.create_nodes()
43 |         self.assertEqual(len(nodes), 1)
44 | 
45 |     def test_create_relation(self) -> None:
46 |         relations = self.table_source.create_relation()
47 |         self.assertEqual(len(relations), 1)
48 |         serialized_relation = neo4_serializer.serialize_relationship(relations[0])
49 | 
50 |         start_key = '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB,
51 |                                                                      schema=SCHEMA,
52 |                                                                      tbl=TABLE,
53 |                                                                      cluster=CLUSTER)
54 |         end_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
55 |                                                            schema=SCHEMA,
56 |                                                            tbl=TABLE,
57 |                                                            cluster=CLUSTER)
58 | 
59 |         expected_relation = {
60 |             RELATION_START_KEY: start_key,
61 |             RELATION_START_LABEL: TableSource.LABEL,
62 |             RELATION_END_KEY: end_key,
63 |             RELATION_END_LABEL: 'Table',
64 |             RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
65 |             RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
66 |         }
67 | 
68 |         self.assertDictEqual(expected_relation, serialized_relation)
69 | 
-------------------------------------------------------------------------------- /databuilder/extractor/glue_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
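# (Illustrative note, not part of the original file: TableSource appends a
# '/_source' suffix to the table key, so the Source node key is
# 'hive://default.base/test/_source' while the related Table key stays
# 'hive://default.base/test', matching the assertions in the test above.)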
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import boto3 5 | 6 | from pyhocon import ConfigFactory, ConfigTree 7 | from typing import Iterator, Union, Dict, Any, List 8 | 9 | from databuilder.extractor.base_extractor import Extractor 10 | from databuilder.models.table_metadata import TableMetadata, ColumnMetadata 11 | 12 | 13 | class GlueExtractor(Extractor): 14 | """ 15 | Extracts tables and columns metadata from AWS Glue metastore 16 | """ 17 | 18 | CLUSTER_KEY = 'cluster' 19 | FILTER_KEY = 'filters' 20 | DEFAULT_CONFIG = ConfigFactory.from_dict({CLUSTER_KEY: 'gold', FILTER_KEY: None}) 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | conf = conf.with_fallback(GlueExtractor.DEFAULT_CONFIG) 24 | self._cluster = '{}'.format(conf.get_string(GlueExtractor.CLUSTER_KEY)) 25 | self._filters = conf.get(GlueExtractor.FILTER_KEY) 26 | self._glue = boto3.client('glue') 27 | self._extract_iter: Union[None, Iterator] = None 28 | 29 | def extract(self) -> Union[TableMetadata, None]: 30 | if not self._extract_iter: 31 | self._extract_iter = self._get_extract_iter() 32 | try: 33 | return next(self._extract_iter) 34 | except StopIteration: 35 | return None 36 | 37 | def get_scope(self) -> str: 38 | return 'extractor.glue' 39 | 40 | def _get_extract_iter(self) -> Iterator[TableMetadata]: 41 | """ 42 | It gets all tables and yields TableMetadata 43 | :return: 44 | """ 45 | for row in self._get_raw_extract_iter(): 46 | columns, i = [], 0 47 | 48 | for column in row['StorageDescriptor']['Columns'] \ 49 | + row.get('PartitionKeys', []): 50 | columns.append(ColumnMetadata( 51 | column['Name'], 52 | column['Comment'] if 'Comment' in column else None, 53 | column['Type'], 54 | i 55 | )) 56 | i += 1 57 | 58 | yield TableMetadata( 59 | 'glue', 60 | self._cluster, 61 | row['DatabaseName'], 62 | row['Name'], 63 | row.get('Description') or row.get('Parameters', {}).get('comment'), 64 | columns, 65 | row.get('TableType') == 'VIRTUAL_VIEW', 66 | ) 67 | 68 | def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]: 69 | """ 70 | Provides iterator of results row from glue client 71 | :return: 72 | """ 73 | tables = self._search_tables() 74 | return iter(tables) 75 | 76 | def _search_tables(self) -> List[Dict[str, Any]]: 77 | tables = [] 78 | kwargs = {} 79 | if self._filters is not None: 80 | kwargs['Filters'] = self._filters 81 | data = self._glue.search_tables(**kwargs) 82 | tables += data['TableList'] 83 | while 'NextToken' in data: 84 | token = data['NextToken'] 85 | kwargs['NextToken'] = token 86 | data = self._glue.search_tables(**kwargs) 87 | tables += data['TableList'] 88 | return tables 89 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_last_updated.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
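# (Illustrative usage sketch for the GlueExtractor above, not part of the original
# files; the database name and filter shape are hypothetical, following boto3's
# Glue search_tables Filters parameter:
#
#     from pyhocon import ConfigFactory
#     from databuilder import Scoped
#     from databuilder.extractor.glue_extractor import GlueExtractor
#
#     conf = ConfigFactory.from_dict({
#         'extractor.glue.filters': [{'Key': 'DatabaseName', 'Value': 'analytics'}],
#     })
#     extractor = GlueExtractor()
#     extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
#     table = extractor.extract()  # a TableMetadata record, or None once exhausted)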
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from databuilder.models.graph_serializable import NODE_KEY, \
7 |     NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
8 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
9 | from databuilder.models.table_last_updated import TableLastUpdated
10 | from databuilder.models.timestamp import timestamp_constants
11 | from databuilder.serializers import neo4_serializer
12 | 
13 | 
14 | class TestTableLastUpdated(unittest.TestCase):
15 | 
16 |     def setUp(self) -> None:
17 |         super(TestTableLastUpdated, self).setUp()
18 | 
19 |         self.tableLastUpdated = TableLastUpdated(table_name='test_table',
20 |                                                  last_updated_time_epoch=25195665,
21 |                                                  schema='default')
22 | 
23 |         self.expected_node_result = {
24 |             NODE_KEY: 'hive://gold.default/test_table/timestamp',
25 |             NODE_LABEL: 'Timestamp',
26 |             'last_updated_timestamp:UNQUOTED': 25195665,
27 |             timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
28 |             'name': 'last_updated_timestamp'
29 |         }
30 | 
31 |         self.expected_relation_result = {
32 |             RELATION_START_KEY: 'hive://gold.default/test_table',
33 |             RELATION_START_LABEL: 'Table',
34 |             RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
35 |             RELATION_END_LABEL: 'Timestamp',
36 |             RELATION_TYPE: 'LAST_UPDATED_AT',
37 |             RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
38 |         }
39 | 
40 |     def test_create_next_node(self) -> None:
41 |         next_node = self.tableLastUpdated.create_next_node()
42 |         next_node_serialized = neo4_serializer.serialize_node(next_node)
43 |         self.assertEqual(next_node_serialized, self.expected_node_result)
44 | 
45 |     def test_create_next_relation(self) -> None:
46 |         next_relation = self.tableLastUpdated.create_next_relation()
47 |         next_relation_serialized = neo4_serializer.serialize_relationship(next_relation)
48 |         self.assertEqual(next_relation_serialized, self.expected_relation_result)
49 | 
50 |     def test_get_table_model_key(self) -> None:
51 |         table = self.tableLastUpdated.get_table_model_key()
52 |         self.assertEqual(table, 'hive://gold.default/test_table')
53 | 
54 |     def test_get_last_updated_model_key(self) -> None:
55 |         last_updated = self.tableLastUpdated.get_last_updated_model_key()
56 |         self.assertEqual(last_updated, 'hive://gold.default/test_table/timestamp')
57 | 
58 |     def test_create_nodes(self) -> None:
59 |         nodes = self.tableLastUpdated.create_nodes()
60 |         self.assertEqual(len(nodes), 1)
61 |         serialized_node = neo4_serializer.serialize_node(nodes[0])
62 |         self.assertEqual(serialized_node, self.expected_node_result)
63 | 
64 |     def test_create_relation(self) -> None:
65 |         relation = self.tableLastUpdated.create_relation()
66 |         self.assertEqual(len(relation), 1)
67 |         serialized_relation = neo4_serializer.serialize_relationship(relation[0])
68 |         self.assertEqual(serialized_relation, self.expected_relation_result)
69 | 
-------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_usage.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from typing import Any, Dict
7 | 
8 | from databuilder.models.dashboard.dashboard_usage import DashboardUsage
9 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
10 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
11 | from databuilder.serializers import neo4_serializer
12 | 
13 | 
14 | class TestDashboardUsage(unittest.TestCase):
15 | 
16 |     def test_dashboard_usage_user_nodes(self) -> None:
17 |         dashboard_usage = DashboardUsage(dashboard_group_id='dashboard_group_id', dashboard_id='dashboard_id',
18 |                                          email='foo@bar.com', view_count=123, cluster='cluster_id',
19 |                                          product='product_id', should_create_user_node=True)
20 | 
21 |         actual = dashboard_usage.create_next_node()
22 |         actual_serialized = neo4_serializer.serialize_node(actual)
23 |         expected: Dict[str, Any] = {
24 |             'is_active:UNQUOTED': True,
25 |             'last_name': '',
26 |             'full_name': '',
27 |             'employee_type': '',
28 |             'first_name': '',
29 |             'updated_at:UNQUOTED': 0,
30 |             'LABEL': 'User',
31 |             'slack_id': '',
32 |             'KEY': 'foo@bar.com',
33 |             'github_username': '',
34 |             'team_name': '',
35 |             'email': 'foo@bar.com',
36 |             'role_name': ''
37 |         }
38 | 
39 |         assert actual is not None
40 |         self.assertDictEqual(expected, actual_serialized)
41 |         self.assertIsNone(dashboard_usage.create_next_node())
42 | 
43 |     def test_dashboard_usage_no_user_nodes(self) -> None:
44 |         dashboard_usage = DashboardUsage(dashboard_group_id='dashboard_group_id', dashboard_id='dashboard_id',
45 |                                          email='foo@bar.com', view_count=123,
46 |                                          should_create_user_node=False, cluster='cluster_id',
47 |                                          product='product_id')
48 | 
49 |         self.assertIsNone(dashboard_usage.create_next_node())
50 | 
51 |     def test_dashboard_usage_relations(self) -> None:
52 |         dashboard_usage = DashboardUsage(dashboard_group_id='dashboard_group_id', dashboard_id='dashboard_id',
53 |                                          email='foo@bar.com', view_count=123, cluster='cluster_id',
54 |                                          product='product_id')
55 | 
56 |         actual = dashboard_usage.create_next_relation()
57 |         actual_serialized = neo4_serializer.serialize_relationship(actual)
58 |         expected: Dict[str, Any] = {
59 |             'read_count:UNQUOTED': 123,
60 |             RELATION_END_KEY: 'foo@bar.com',
61 |             RELATION_START_LABEL: 'Dashboard',
62 |             RELATION_END_LABEL: 'User',
63 |             RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id',
64 |             RELATION_TYPE: 'READ_BY',
65 |             RELATION_REVERSE_TYPE: 'READ'
66 |         }
67 | 
68 |         assert actual is not None
69 |         self.assertDictEqual(expected, actual_serialized)
70 |         self.assertIsNone(dashboard_usage.create_next_relation())
71 | 
-------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import unittest 6 | from typing import Any, Dict 7 | 8 | from mock import patch 9 | from pyhocon import ConfigFactory 10 | 11 | from databuilder import Scoped 12 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_extractor import TableauDashboardExtractor 13 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ 14 | import TableauDashboardAuth, TableauGraphQLApiExtractor 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | 20 | def mock_query(*_args: Any, **_kwargs: Any) -> Dict[str, Any]: 21 | return { 22 | 'workbooks': [ 23 | { 24 | 'id': 'fake-id', 25 | 'name': 'Test Workbook', 26 | 'createdAt': '2020-04-08T05:32:01Z', 27 | 'description': '', 28 | 'projectName': 'Test Project', 29 | 'projectVizportalUrlId': 123, 30 | 'vizportalUrlId': 456 31 | } 32 | ] 33 | } 34 | 35 | 36 | def mock_token(*_args: Any, **_kwargs: Any) -> str: 37 | return '123-abc' 38 | 39 | 40 | class TestTableauDashboardExtractor(unittest.TestCase): 41 | 42 | @patch.object(TableauDashboardAuth, '_authenticate', mock_token) 43 | @patch.object(TableauGraphQLApiExtractor, 'execute_query', mock_query) 44 | def test_dashboard_metadata_extractor(self) -> None: 45 | 46 | config = ConfigFactory.from_dict({ 47 | 'extractor.tableau_dashboard_metadata.api_base_url': 'api_base_url', 48 | 'extractor.tableau_dashboard_metadata.tableau_base_url': 'tableau_base_url', 49 | 'extractor.tableau_dashboard_metadata.api_version': 'tableau_api_version', 50 | 'extractor.tableau_dashboard_metadata.site_name': 'tableau_site_name', 51 | 'extractor.tableau_dashboard_metadata.tableau_personal_access_token_name': 52 | 'tableau_personal_access_token_name', 53 | 'extractor.tableau_dashboard_metadata.tableau_personal_access_token_secret': 54 | 'tableau_personal_access_token_secret', 55 | 'extractor.tableau_dashboard_metadata.excluded_projects': [], 56 | 'extractor.tableau_dashboard_metadata.cluster': 'tableau_dashboard_cluster', 57 | 'extractor.tableau_dashboard_metadata.database': 'tableau_dashboard_database', 58 | 'extractor.tableau_dashboard_metadata.transformer.timestamp_str_to_epoch.timestamp_format': 59 | '%Y-%m-%dT%H:%M:%SZ', 60 | 61 | }) 62 | 63 | extractor = TableauDashboardExtractor() 64 | extractor.init(Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope())) 65 | record = extractor.extract() 66 | 67 | self.assertEqual(record.dashboard_id, 'Test Workbook') 68 | self.assertEqual(record.dashboard_name, 'Test Workbook') 69 | self.assertEqual(record.dashboard_group_id, 'Test Project') 70 | self.assertEqual(record.dashboard_group, 'Test Project') 71 | self.assertEqual(record.product, 'tableau') 72 | self.assertEqual(record.cluster, 'tableau_dashboard_cluster') 73 | self.assertEqual(record.created_timestamp, 1586323921) 74 | 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | --------------------------------------------------------------------------------